# HG changeset patch
# User Paul Boddie
# Date 1322175893 -3600
# Node ID 13a33b37aeacda9d1e2323ba5642f8c2fde35357
# Parent  73ba8de88eb58538a7ac0afde2555ff0a9c373aa
Introduced usage of urllib2 in order to detect and handle missing or
inaccessible resources.
Added content type inspection and testing in order to avoid parsing
unsupported data formats.
Made the format entry optional in the sources dictionary.
Updated the release notes and added a "to do" item around network timeouts
and/or asynchronous data retrieval.

diff -r 73ba8de88eb5 -r 13a33b37aeac EventAggregatorSupport.py
--- a/EventAggregatorSupport.py	Sat Nov 12 00:53:01 2011 +0100
+++ b/EventAggregatorSupport.py	Fri Nov 25 00:04:53 2011 +0100
@@ -20,7 +20,7 @@
 import re
 import bisect
 import operator
-import urllib
+import urllib, urllib2
 
 try:
     from cStringIO import StringIO
@@ -103,7 +103,7 @@
 
 # Content type parsing.
 
-encoding_regexp_str = ur'charset=(?P<encoding>[-A-Za-z0-9]+)'
+encoding_regexp_str = ur'(?P<content_type>[^\s;]*)(?:;\s*charset=(?P<encoding>[-A-Za-z0-9]+))?'
 encoding_regexp = re.compile(encoding_regexp_str)
 
 # Simple content parsing.
@@ -144,12 +144,12 @@
     else:
         return None
 
-def getContentEncoding(content_type):
+def getContentTypeAndEncoding(content_type):
     m = encoding_regexp.search(content_type)
     if m:
-        return m.group("encoding")
+        return m.group("content_type"), m.group("encoding")
     else:
-        return None
+        return None, None
 
 def int_or_none(x):
     if x is None:
@@ -1240,7 +1240,9 @@
 
     for source in sources:
         try:
-            url, format = sources_dict[source].split()
+            details = sources_dict[source].split()
+            url = details[0]
+            format = (details[1:] or ["ical"])[0]
         except (KeyError, ValueError):
             pass
         else:
@@ -1257,10 +1259,13 @@
             url = url.replace("{end}", urllib.quote_plus(calendar_end and str(calendar_end) or ""))
 
             # Get a parser.
+            # NOTE: This could be done reactively by choosing a parser based on
+            # NOTE: the content type provided by the URL.
 
             if format == "ical" and vCalendar is not None:
                 parser = vCalendar.parse
                 resource_cls = EventCalendar
+                required_content_type = "text/calendar"
 
             else:
                 continue
@@ -1283,15 +1288,22 @@
 
             # Access the remote data source.
 
-            cache_entry.open(mode="w")
-            f = urllib.urlopen(url)
             try:
-                cache_entry.write(url + "\n")
-                cache_entry.write((f.headers.get("content-type") or "") + "\n")
-                cache_entry.write(f.read())
-            finally:
-                cache_entry.close()
-                f.close()
+                f = urllib2.urlopen(url)
+                cache_entry.open(mode="w")
+                try:
+                    cache_entry.write(url + "\n")
+                    cache_entry.write((f.headers.get("content-type") or "") + "\n")
+                    cache_entry.write(f.read())
+                finally:
+                    cache_entry.close()
+                    f.close()
+
+            # In case of an exception, just ignore the remote source.
+            # NOTE: This could be reported somewhere.
+
+            except IOError:
+                continue
 
             # Open the cache entry and read it.
 
@@ -1306,7 +1318,16 @@
             f = StringIO(data)
             try:
                 url = f.readline()
-                encoding = getContentEncoding(f.readline())
+
+                # Get the content type and encoding, making sure that the data
+                # can be parsed.
+
+                content_type, encoding = getContentTypeAndEncoding(f.readline())
+                if content_type != required_content_type:
+                    continue
+
+                # Send the data to the parser.
+
                 uf = codecs.getreader(encoding or "utf-8")(f)
                 try:
                     resources.append(resource_cls(url, parser(uf)))
diff -r 73ba8de88eb5 -r 13a33b37aeac README.txt
--- a/README.txt	Sat Nov 12 00:53:01 2011 +0100
+++ b/README.txt	Fri Nov 25 00:04:53 2011 +0100
@@ -298,6 +298,12 @@
 time zone information for the correct interpretation of time information in
 those summaries. Thus, it is highly recommended that pytz be installed.
 
+New in EventAggregator 0.8.2 (Changes since EventAggregator 0.8.1)
+------------------------------------------------------------------
+
+  * Improved the error handling around remote event source data retrieval,
+    introducing handling of missing resources and unsupported content types.
+
 New in EventAggregator 0.8.1 (Changes since EventAggregator 0.8)
 ----------------------------------------------------------------
 
diff -r 73ba8de88eb5 -r 13a33b37aeac TO_DO.txt
--- a/TO_DO.txt	Sat Nov 12 00:53:01 2011 +0100
+++ b/TO_DO.txt	Fri Nov 25 00:04:53 2011 +0100
@@ -113,3 +113,10 @@
 way of avoiding repetition of the same events described in different places is
 for authors to include a UID property identifying each event, using the same
 value regardless of where the event is being published.
+
+Remote Source Timeouts
+----------------------
+
+Sometimes, network problems can cause delays in accessing remote sources. The
+library should support either a timeout mechanism or asynchronous retrieval of
+remote source data.
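
Note: Content Type Parsing Example
----------------------------------

The following sketch, not part of the patch itself, illustrates how the
revised regexp splits a Content-Type header value into the media type and the
optional character set. The header values shown are illustrative only; the
definitions are copied from the patch above.

  import re

  encoding_regexp_str = ur'(?P<content_type>[^\s;]*)(?:;\s*charset=(?P<encoding>[-A-Za-z0-9]+))?'
  encoding_regexp = re.compile(encoding_regexp_str)

  def getContentTypeAndEncoding(content_type):
      m = encoding_regexp.search(content_type)
      if m:
          return m.group("content_type"), m.group("encoding")
      else:
          return None, None

  print getContentTypeAndEncoding("text/calendar; charset=UTF-8")
  # -> ('text/calendar', 'UTF-8')
  print getContentTypeAndEncoding("text/calendar")
  # -> ('text/calendar', None)

A response that does not declare a charset thus yields an encoding of None,
and the calling code falls back to the "utf-8" default when constructing the
codecs reader.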
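
Note: A Possible Timeout Mechanism
----------------------------------

As a sketch of how the "Remote Source Timeouts" item might be addressed: on
Python 2.6 and later, urllib2.urlopen accepts a timeout argument, and
urllib2.URLError subclasses IOError, so the "except IOError" clause
introduced by this patch would also catch timeouts. The helper name, URL and
timeout value below are hypothetical.

  import urllib2

  def openWithTimeout(url, timeout=10):

      "Open 'url', returning a file-like object, or None on failure."

      try:
          # Opening may raise urllib2.URLError, including on a connect
          # timeout; URLError subclasses IOError on Python 2.
          return urllib2.urlopen(url, timeout=timeout)
      except IOError:
          return None

  f = openWithTimeout("http://example.com/calendar.ics")

Asynchronous retrieval, the other option mentioned, would instead move the
urlopen call into a separate thread or process so that page rendering is not
blocked by a slow source.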