# HG changeset patch
# User Paul Boddie
# Date 1322175893 -3600
# Node ID 13a33b37aeacda9d1e2323ba5642f8c2fde35357
# Parent  73ba8de88eb58538a7ac0afde2555ff0a9c373aa
Introduced usage of urllib2 in order to detect and handle missing or
inaccessible resources.
Added content type inspection and testing in order to avoid parsing
unsupported data formats.
Made the format entry optional in the sources dictionary.
Updated the release notes and added a "to do" item around network timeouts
and/or asynchronous data retrieval.

diff -r 73ba8de88eb5 -r 13a33b37aeac EventAggregatorSupport.py
--- a/EventAggregatorSupport.py	Sat Nov 12 00:53:01 2011 +0100
+++ b/EventAggregatorSupport.py	Fri Nov 25 00:04:53 2011 +0100
@@ -20,7 +20,7 @@
 import re
 import bisect
 import operator
-import urllib
+import urllib, urllib2
 
 try:
     from cStringIO import StringIO
@@ -103,7 +103,7 @@
 
 # Content type parsing.
 
-encoding_regexp_str = ur'charset=(?P<encoding>[-A-Za-z0-9]+)'
+encoding_regexp_str = ur'(?P<content_type>[^\s;]*)(?:;\s*charset=(?P<encoding>[-A-Za-z0-9]+))?'
 encoding_regexp = re.compile(encoding_regexp_str)
 
 # Simple content parsing.
@@ -144,12 +144,12 @@
     else:
         return None
 
-def getContentEncoding(content_type):
+def getContentTypeAndEncoding(content_type):
     m = encoding_regexp.search(content_type)
     if m:
-        return m.group("encoding")
+        return m.group("content_type"), m.group("encoding")
     else:
-        return None
+        return None, None
 
 def int_or_none(x):
     if x is None:
@@ -1240,7 +1240,9 @@
 
     for source in sources:
         try:
-            url, format = sources_dict[source].split()
+            details = sources_dict[source].split()
+            url = details[0]
+            format = (details[1:] or ["ical"])[0]
         except (KeyError, ValueError):
             pass
         else:
@@ -1257,10 +1259,13 @@
             url = url.replace("{end}", urllib.quote_plus(calendar_end and str(calendar_end) or ""))
 
             # Get a parser.
+            # NOTE: This could be done reactively by choosing a parser based on
+            # NOTE: the content type provided by the URL.
 
             if format == "ical" and vCalendar is not None:
                 parser = vCalendar.parse
                 resource_cls = EventCalendar
+                required_content_type = "text/calendar"
 
             else:
                 continue
@@ -1283,15 +1288,22 @@
 
             # Access the remote data source.
 
-            cache_entry.open(mode="w")
-            f = urllib.urlopen(url)
             try:
-                cache_entry.write(url + "\n")
-                cache_entry.write((f.headers.get("content-type") or "") + "\n")
-                cache_entry.write(f.read())
-            finally:
-                cache_entry.close()
-                f.close()
+                f = urllib2.urlopen(url)
+                cache_entry.open(mode="w")
+                try:
+                    cache_entry.write(url + "\n")
+                    cache_entry.write((f.headers.get("content-type") or "") + "\n")
+                    cache_entry.write(f.read())
+                finally:
+                    cache_entry.close()
+                    f.close()
+
+            # In case of an exception, just ignore the remote source.
+            # NOTE: This could be reported somewhere.
+
+            except IOError:
+                continue
 
             # Open the cache entry and read it.
 
@@ -1306,7 +1318,16 @@
             f = StringIO(data)
             try:
                 url = f.readline()
-                encoding = getContentEncoding(f.readline())
+
+                # Get the content type and encoding, making sure that the data
+                # can be parsed.
+
+                content_type, encoding = getContentTypeAndEncoding(f.readline())
+                if content_type != required_content_type:
+                    continue
+
+                # Send the data to the parser.
+
                 uf = codecs.getreader(encoding or "utf-8")(f)
                 try:
                     resources.append(resource_cls(url, parser(uf)))
diff -r 73ba8de88eb5 -r 13a33b37aeac README.txt
--- a/README.txt	Sat Nov 12 00:53:01 2011 +0100
+++ b/README.txt	Fri Nov 25 00:04:53 2011 +0100
@@ -298,6 +298,12 @@
 time zone information for the correct interpretation of time information in
 those summaries. Thus, it is highly recommended that pytz be installed.
 
+New in EventAggregator 0.8.2 (Changes since EventAggregator 0.8.1)
+------------------------------------------------------------------
+
+  * Improved the error handling around remote event source data retrieval,
+    introducing handling of missing resources and unsupported content types.
+
 New in EventAggregator 0.8.1 (Changes since EventAggregator 0.8)
 ----------------------------------------------------------------
 
diff -r 73ba8de88eb5 -r 13a33b37aeac TO_DO.txt
--- a/TO_DO.txt	Sat Nov 12 00:53:01 2011 +0100
+++ b/TO_DO.txt	Fri Nov 25 00:04:53 2011 +0100
@@ -113,3 +113,10 @@
 way of avoiding repetition of the same events described in different places is
 for authors to include a UID property identifying each event, using the same
 value regardless of where the event is being published.
+
+Remote Source Timeouts
+----------------------
+
+Sometimes, network problems can cause delays in accessing remote sources. The
+library should support either a timeout mechanism or asynchronous retrieval of
+remote source data.
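
Note: Content Type Parsing Example
----------------------------------

The following sketch, not part of the patch itself, illustrates how the
revised regexp splits a Content-Type header value into the media type and the
optional character set. The header values shown are illustrative only; the
definitions are copied from the patch above.

  import re

  encoding_regexp_str = ur'(?P<content_type>[^\s;]*)(?:;\s*charset=(?P<encoding>[-A-Za-z0-9]+))?'
  encoding_regexp = re.compile(encoding_regexp_str)

  def getContentTypeAndEncoding(content_type):
      m = encoding_regexp.search(content_type)
      if m:
          return m.group("content_type"), m.group("encoding")
      else:
          return None, None

  print getContentTypeAndEncoding("text/calendar; charset=UTF-8")
  # -> ('text/calendar', 'UTF-8')
  print getContentTypeAndEncoding("text/calendar")
  # -> ('text/calendar', None)

A response that does not declare a charset thus yields an encoding of None,
and the calling code falls back to the "utf-8" default when constructing the
codecs reader.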
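
Note: A Possible Timeout Mechanism
----------------------------------

As a sketch of how the "Remote Source Timeouts" item might be addressed: on
Python 2.6 and later, urllib2.urlopen accepts a timeout argument, and
urllib2.URLError subclasses IOError, so the "except IOError" clause
introduced by this patch would also catch timeouts. The helper name, URL and
timeout value below are hypothetical.

  import urllib2

  def openWithTimeout(url, timeout=10):

      "Open 'url', returning a file-like object, or None on failure."

      try:
          # Opening may raise urllib2.URLError, including on a connect
          # timeout; URLError subclasses IOError on Python 2.
          return urllib2.urlopen(url, timeout=timeout)
      except IOError:
          return None

  f = openWithTimeout("http://example.com/calendar.ics")

Asynchronous retrieval, the other option mentioned, would instead move the
urlopen call into a separate thread or process so that page rendering is not
blocked by a slow source.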