vContent

vContent.py

0:09250e78af5e
2008-10-16 Paul Boddie Created a vContent module based on essential parts of the RDFCalendar.Parsers module.
     1 #!/usr/bin/env python     2      3 """     4 Parsing of vCard, vCalendar and iCalendar files.     5      6 Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU Lesser General Public License as published by the Free    10 Software Foundation; either version 3 of the License, or (at your option) any    11 later version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more    16 details.    17     18 You should have received a copy of the GNU Lesser General Public License along    19 with this program.  If not, see <http://www.gnu.org/licenses/>.    20     21 --------    22     23 References:    24     25 RFC 2445: Internet Calendaring and Scheduling Core Object Specification    26           (iCalendar)    27           http://rfc.net/rfc2445.html    28     29 RFC 2425: A MIME Content-Type for Directory Information    30           http://rfc.net/rfc2425.html    31     32 RFC 2426: vCard MIME Directory Profile    33           http://rfc.net/rfc2426.html    34 """    35     36 # Encoding-related imports.    37     38 import base64, quopri    39     40 # Simple reader class.    41     42 class Reader:    43     44     "A simple class wrapping a file, providing simple pushback capabilities."    45     46     def __init__(self, f, non_standard_newline=0):    47     48         """    49         Initialise the object with the file 'f'. If 'non_standard_newline' is    50         set to a true value (unlike the default), lines ending with CR will be    51         treated as complete lines.    52         """    53     54         self.f = f    55         self.non_standard_newline = non_standard_newline    56         self.lines = []    57         self.line_number = 0    58     59     def pushback(self, line):    60     61         """    62         Push the given 'line' back so that the next line read is actually the    63         given 'line' and not the next line from the underlying file.    64         """    65     66         self.lines.append(line)    67         self.line_number -= 1    68     69     def readline(self):    70     71         """    72         If no pushed-back lines exist, read a line directly from the file.    73         Otherwise, read from the list of pushed-back lines.    74         """    75     76         self.line_number += 1    77         if self.lines:    78             return self.lines.pop()    79         else:    80             # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).    81             line = self.f.readline()    82             while line.endswith("\r") and not self.non_standard_newline:    83                 line += self.f.readline()    84             if line.endswith("\r") and self.non_standard_newline:    85                 return line + "\n"    86             else:    87                 return line    88     89     def read_until(self, targets):    90     91         """    92         Read from the stream until one of the 'targets' is seen. Return the    93         string from the current position up to the target found, along with the    94         target string, using a tuple of the form (string, target). If no target    95         was found, return the entire string together with a target of None.    96         """    97     98         indexes = {}    99    100         # Remember the entire text read and the index of the current line in   101         # that text.   102    103         lines = []   104    105         line = self.readline()   106         lines.append(line)   107         start = 0   108    109         while indexes == {} and line != "":   110             for target in targets:   111                 index = line.find(target)   112    113                 # Always choose the first matching target.   114    115                 if index != -1 and not indexes.has_key(start + index):   116                     indexes[start + index] = target   117    118             start += len(line)   119             line = self.readline()   120             lines.append(line)   121    122         text = "".join(lines)   123    124         if indexes:   125             min_index = reduce(min, indexes.keys())   126             target = indexes[min_index]   127    128             # Skip the target.   129             # Since the end of the buffer should always be a newline, ignore the   130             # last element.   131    132             lines = text[min_index + len(target):].split("\n")[:]   133             if not lines[-1]:   134                 del lines[-1]   135             lines.reverse()   136    137             for line in lines:   138                 self.pushback(line + "\n")   139    140             return text[:min_index], target   141         else:   142             return text, None   143    144 class StreamParser:   145    146     "A stream parser for content in vCard/vCalendar/iCalendar-like formats."   147    148     def __init__(self, f):   149    150         "Initialise the parser for the given file 'f'."   151    152         self.f = f   153    154     def __iter__(self):   155    156         "Return self as the iterator."   157    158         return self   159    160     def next(self):   161    162         """   163         Return the next content item in the file as a tuple of the form   164         (name, parameters, values).   165         """   166    167         return self.parse_content_line()   168    169     def parse_content_line(self):   170    171         """   172         Return the name, parameters and a list containing value information for   173         the current content line in the file being parsed.   174         """   175    176         f = self.f   177    178         parameters = {}   179         name, sep = f.read_until([";", ":"])   180    181         name = name.strip()   182    183         if not name and sep is None:   184             raise StopIteration   185    186         while sep == ";":   187    188             # Find the actual modifier.   189    190             parameter_name, sep = f.read_until(["=", ";", ":"])   191             parameter_name = parameter_name.strip()   192    193             if sep == "=":   194                 parameter_value, sep = f.read_until([";", ":"])   195                 parameter_value = parameter_value.strip()   196             else:   197                 parameter_value = None   198    199             # Append a key, value tuple to the parameters list.   200    201             parameters[parameter_name] = parameter_value   202    203         # Get the value content.   204    205         if sep != ":":   206             raise ValueError, f.line_number   207    208         # Strip all appropriate whitespace from the right end of each line.   209         # For subsequent lines, remove the first whitespace character.   210         # See section 4.1 of the iCalendar specification.   211    212         line = f.readline()   213         value_lines = [line.rstrip("\r\n")]   214         line = f.readline()   215         while line != "" and line[0] in [" ", "\t"]:   216             value_lines.append(line.rstrip("\r\n")[1:])   217             line = f.readline()   218    219         # Since one line too many will have been read, push the line back into the   220         # file.   221    222         f.pushback(line)   223    224         # Decode the value.   225    226         value = self.decode("".join(value_lines), parameters.get("ENCODING"))   227    228         return name, parameters, value   229    230     def decode(self, value, encoding):   231    232         "Decode the 'value' with the given 'encoding'."   233    234         # NOTE: Assuming ISO 8869-1 for the character set.   235    236         if encoding == "QUOTED-PRINTABLE":   237             return unicode(quopri.decodestring(value), "iso-8859-1")   238         elif encoding == "BASE64":   239             return base64.decodestring(value)   240         else:   241             # NOTE: Introducing newline conversions.   242             # Replace quoted characters (see 4.3.11 in RFC 2445).   243    244             return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n").replace("\\,", ",").replace("\\;", ";")   245    246 class Parser:   247    248     "A parser for content in vCard/vCalendar/iCalendar-like formats."   249    250     def __init__(self):   251    252         "Initialise the parser."   253    254         self.elements = [] # also known as components   255         self.document = []   256         self.current = self.document   257    258     def parse(self, f):   259    260         "Parse the contents of the file 'f'."   261    262         parser = StreamParser(f)   263    264         for name, parameters, value in parser:   265    266             # Add new elements/components to the current position in the   267             # document, recording the element as the active element.   268    269             if name == "BEGIN":   270                 children = []   271                 element = (value, parameters, children)   272                 self.elements.append(element)   273                 self.current.append(element)   274                 self.current = children   275    276             # End elements by removing them from the active element stack and   277             # making the next element's children the current position for new   278             # content.   279    280             elif name == "END":   281                 start_element = self.elements.pop()   282                 start_value, start_parameters, children = start_element   283                 if start_value != value:   284                     raise ParseError, "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (   285                         start_value, value, f.line_number)   286                 if self.elements:   287                     parent_value, parent_parameters, children = self.elements[-1]   288                     self.current = children   289                 else:   290                     self.current = self.document   291    292             else:   293                 self.current.append((name, parameters, value))   294    295         return self.document   296    297 # Public functions.   298    299 def parse(f, non_standard_newline=0):   300    301     """   302     Parse the resource data found through the use of the file object 'f', which   303     should provide Unicode data, and put the resource information in the given   304     'store'. (The codecs module can be used to open files or to wrap streams in   305     order to provide Unicode data.)   306    307     The optional 'non_standard_newline' can be set to a true value (unlike the   308     default) in order to attempt to process files with CR as the end of line   309     character.   310    311     As a result of parsing the resource, the root node of the imported resource   312     is returned.   313     """   314    315     reader = Reader(f, non_standard_newline=non_standard_newline)   316     parser = Parser()   317     return parser.parse(reader)   318    319 # vim: tabstop=4 expandtab shiftwidth=4