vContent

vContent.py

7:7eeb730fcbdb
2008-11-02 Paul Boddie Added elementary writing support. Converted test.vcf to use CRLF newlines. Fixed quoting in test.ics. Made the decode method use the names of properties, although this has no real use currently. Made format information global in the vCalendar module.
     1 #!/usr/bin/env python     2      3 """     4 Parsing of vCard, vCalendar and iCalendar files.     5      6 Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU Lesser General Public License as published by the Free    10 Software Foundation; either version 3 of the License, or (at your option) any    11 later version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more    16 details.    17     18 You should have received a copy of the GNU Lesser General Public License along    19 with this program.  If not, see <http://www.gnu.org/licenses/>.    20     21 --------    22     23 References:    24     25 RFC 2445: Internet Calendaring and Scheduling Core Object Specification    26           (iCalendar)    27           http://rfc.net/rfc2445.html    28     29 RFC 2425: A MIME Content-Type for Directory Information    30           http://rfc.net/rfc2425.html    31     32 RFC 2426: vCard MIME Directory Profile    33           http://rfc.net/rfc2426.html    34 """    35     36 try:    37     set    38 except NameError:    39     from sets import Set as set    40     41 # Encoding-related imports.    42     43 import base64, quopri    44     45 # Tokenisation help.    46     47 import re    48     49 # Reader and parser classes.    50     51 class Reader:    52     53     "A simple class wrapping a file, providing simple pushback capabilities."    54     55     SEPARATORS = re.compile('[;:"]')    56     SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')    57     58     def __init__(self, f, non_standard_newline=0):    59     60         """    61         Initialise the object with the file 'f'. If 'non_standard_newline' is    62         set to a true value (unlike the default), lines ending with CR will be    63         treated as complete lines.    64         """    65     66         self.f = f    67         self.non_standard_newline = non_standard_newline    68         self.lines = []    69         self.line_number = 0    70     71     def pushback(self, line):    72     73         """    74         Push the given 'line' back so that the next line read is actually the    75         given 'line' and not the next line from the underlying file.    76         """    77     78         self.lines.append(line)    79         self.line_number -= 1    80     81     def readline(self):    82     83         """    84         If no pushed-back lines exist, read a line directly from the file.    85         Otherwise, read from the list of pushed-back lines.    86         """    87     88         self.line_number += 1    89         if self.lines:    90             return self.lines.pop()    91         else:    92             # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).    93             line = self.f.readline()    94             while line.endswith("\r") and not self.non_standard_newline:    95                 line += self.f.readline()    96             if line.endswith("\r") and self.non_standard_newline:    97                 return line + "\n"    98             else:    99                 return line   100    101     def read_until(self, targets):   102    103         """   104         Read from the stream until one of the 'targets' is seen. Return the   105         string from the current position up to the target found, along with the   106         target string, using a tuple of the form (string, target). If no target   107         was found, return the entire string together with a target of None.   108         """   109    110         # Remember the entire text read and the index of the current line in   111         # that text.   112    113         lines = []   114    115         line = self.readline()   116         lines.append(line)   117         start = 0   118    119         # Remember the first target.   120    121         first = None   122         first_pos = None   123         in_quoted_region = 0   124    125         # Process each line, looking for the targets.   126    127         while line != "":   128             match = targets.search(line, start)   129    130             # Where nothing matches, get the next line.   131    132             if match is None:   133                 line = self.readline()   134                 lines.append(line)   135                 start = 0   136    137             # Where a double quote matches, toggle the region state.   138    139             elif match.group() == '"':   140                 in_quoted_region = not in_quoted_region   141                 start = match.end()   142    143             # Where something else matches outside a region, stop searching.   144    145             elif not in_quoted_region:   146                 first = match.group()   147                 first_pos = match.start()   148                 break   149    150             # Otherwise, keep looking for the end of the region.   151    152             else:   153                 start = match.end()   154    155         # Where no more input can provide the targets, return a special result.   156    157         else:   158             text = "".join(lines)   159             return text, None   160    161         # Push back the text after the target.   162    163         after_target = lines[-1][first_pos + len(first):]   164         self.pushback(after_target)   165    166         # Produce the lines until the matching line, together with the portion   167         # of the matching line before the target.   168    169         lines[-1] = lines[-1][:first_pos]   170         text = "".join(lines)   171         return text, first   172    173 class StreamParser:   174    175     "A stream parser for content in vCard/vCalendar/iCalendar-like formats."   176    177     def __init__(self, f):   178    179         "Initialise the parser for the given file 'f'."   180    181         self.f = f   182    183     def __iter__(self):   184    185         "Return self as the iterator."   186    187         return self   188    189     def next(self):   190    191         """   192         Return the next content item in the file as a tuple of the form   193         (name, parameters, values).   194         """   195    196         return self.parse_content_line()   197    198     def decode_content(self, value):   199    200         "Decode the given 'value', replacing quoted characters."   201    202         return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")   203    204     # Internal methods.   205    206     def parse_content_line(self):   207    208         """   209         Return the name, parameters and value information for the current   210         content line in the file being parsed.   211         """   212    213         f = self.f   214    215         parameters = {}   216         name, sep = f.read_until(f.SEPARATORS)   217    218         name = name.strip()   219    220         if not name and sep is None:   221             raise StopIteration   222    223         while sep == ";":   224    225             # Find the actual modifier.   226    227             parameter_name, sep = f.read_until(f.SEPARATORS_PLUS_EQUALS)   228             parameter_name = parameter_name.strip()   229    230             if sep == "=":   231                 parameter_value, sep = f.read_until(f.SEPARATORS)   232                 parameter_value = parameter_value.strip()   233             else:   234                 parameter_value = None   235    236             # Append a key, value tuple to the parameters list.   237    238             parameters[parameter_name] = parameter_value   239    240         # Get the value content.   241    242         if sep != ":":   243             raise ValueError, f.line_number   244    245         # Strip all appropriate whitespace from the right end of each line.   246         # For subsequent lines, remove the first whitespace character.   247         # See section 4.1 of the iCalendar specification.   248    249         line = f.readline()   250         value_lines = [line.rstrip("\r\n")]   251         line = f.readline()   252         while line != "" and line[0] in [" ", "\t"]:   253             value_lines.append(line.rstrip("\r\n")[1:])   254             line = f.readline()   255    256         # Since one line too many will have been read, push the line back into the   257         # file.   258    259         f.pushback(line)   260    261         # Decode the value.   262    263         value = self.decode(name, parameters, "".join(value_lines))   264    265         return name, parameters, value   266    267     def decode(self, name, parameters, value):   268    269         "Decode using 'name' and 'parameters' the given 'value'."   270    271         encoding = parameters.get("ENCODING")   272         charset = parameters.get("CHARSET")   273    274         value = self.decode_content(value)   275    276         if encoding == "QUOTED-PRINTABLE":   277             return unicode(quopri.decodestring(value), charset or "iso-8859-1")   278         elif encoding == "BASE64":   279             return base64.decodestring(value)   280         else:   281             return value   282    283 class ParserBase:   284    285     "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."   286    287     def __init__(self):   288    289         "Initialise the parser."   290    291         self.names = []   292    293     def parse(self, f, parser_cls=None):   294    295         "Parse the contents of the file 'f'."   296    297         parser = (parser_cls or StreamParser)(f)   298    299         for name, parameters, value in parser:   300    301             if name == "BEGIN":   302                 self.names.append(value)   303                 self.startComponent(value, parameters)   304    305             elif name == "END":   306                 start_name = self.names.pop()   307                 if start_name != value:   308                     raise ParseError, "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (   309                         start_name, value, f.line_number)   310    311                 self.endComponent(value)   312    313             else:   314                 self.handleProperty(name, parameters, value)   315    316 class Parser(ParserBase):   317    318     "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."   319    320     def __init__(self):   321         ParserBase.__init__(self)   322         self.components = []   323    324     def startComponent(self, name, parameters):   325    326         """   327         Add the component with the given 'name' and 'parameters', recording an   328         empty list of children as part of the component's content.   329         """   330    331         component = self.handleProperty(name, parameters, [])   332         self.components.append(component)   333         return component   334    335     def endComponent(self, name):   336    337         """   338         End the component with the given 'name' by removing it from the active   339         component stack.   340         """   341    342         if len(self.components) > 1:   343             return self.components.pop()   344         elif self.components:   345             return self.components[-1]   346    347     def handleProperty(self, name, parameters, value):   348    349         """   350         Record the property with the given 'name', 'parameters' and 'value' as   351         part of the current component's children.   352         """   353    354         component = self.makeComponent(name, parameters, value)   355         self.attachComponent(component)   356         return component   357    358     # Component object construction/manipulation methods.   359    360     def attachComponent(self, component):   361    362         "Attach the given 'component' to its parent."   363    364         if self.components:   365             component_name, component_parameters, component_children = self.components[-1]   366             component_children.append(component)   367    368     def makeComponent(self, name, parameters, value):   369    370         """   371         Make a component object from the given 'name', 'parameters' and 'value'.   372         """   373    374         return (name, parameters, value)   375    376     # Public methods.   377    378     def parse(self, f, parser_cls=None):   379    380         "Parse the contents of the file 'f'."   381    382         ParserBase.parse(self, f, parser_cls)   383         return self.components[0]   384    385 # Writer classes.   386    387 class StreamWriter:   388    389     "A stream writer for content in vCard/vCalendar/iCalendar-like formats."   390    391     def __init__(self, f, line_length=76):   392    393         "Initialise the parser for the given file 'f'."   394    395         self.f = f   396         self.line_length = line_length   397    398     def write(self, name, parameters, value):   399    400         """   401         Write a content line for the given 'name', 'parameters' and 'value'   402         information.   403         """   404    405         f = self.f   406    407         f.write(name)   408         self.write_parameters(parameters)   409         f.write(":")   410    411         for line in self.fold(self.encode(name, parameters, value)):   412             f.write(line)   413             f.write("\r\n")   414    415     def encode_content(self, value):   416    417         "Encode the given 'value', quoting characters."   418    419         return value.replace("\n", "\\n")   420    421     # Internal methods.   422    423     def write_parameters(self, parameters):   424    425         "Write the given 'parameters'."   426    427         f = self.f   428    429         for parameter_name, parameter_value in parameters.items():   430             f.write(";")   431             f.write(parameter_name)   432             f.write("=")   433             f.write(parameter_value)   434    435     def encode(self, name, parameters, value):   436    437         "Encode using 'name' and 'parameters' the given 'value'."   438    439         encoding = parameters.get("ENCODING")   440         charset = parameters.get("CHARSET")   441    442         if encoding == "QUOTED-PRINTABLE":   443             value = quopri.encodestring(value.encode(charset or "iso-8859-1"))   444         elif encoding == "BASE64":   445             value = base64.encodestring(value)   446    447         return self.encode_content(value)   448    449     def fold(self, text):   450    451         "Fold the given 'text'."   452    453         line_length = self.line_length   454         i = 0   455         lines = []   456    457         line = text[i:i+line_length]   458         while line:   459             lines.append(line)   460             i += line_length   461             line = text[i:i+line_length]   462    463         return lines   464    465 # Public functions.   466    467 def parse(f, non_standard_newline=0, parser_cls=None):   468    469     """   470     Parse the resource data found through the use of the file object 'f', which   471     should provide Unicode data. (The codecs module can be used to open files or   472     to wrap streams in order to provide Unicode data.)   473    474     The optional 'non_standard_newline' can be set to a true value (unlike the   475     default) in order to attempt to process files with CR as the end of line   476     character.   477    478     As a result of parsing the resource, the root node of the imported resource   479     is returned.   480     """   481    482     reader = Reader(f, non_standard_newline)   483     parser = (parser_cls or Parser)()   484     return parser.parse(reader)   485    486 def iterparse(f, non_standard_newline=0, parser_cls=None):   487    488     """   489     Parse the resource data found through the use of the file object 'f', which   490     should provide Unicode data. (The codecs module can be used to open files or   491     to wrap streams in order to provide Unicode data.)   492    493     The optional 'non_standard_newline' can be set to a true value (unlike the   494     default) in order to attempt to process files with CR as the end of line   495     character.   496    497     An iterator is returned which provides event tuples describing parsing   498     events of the form (name, parameters, value).   499     """   500    501     reader = Reader(f, non_standard_newline)   502     parser = (parser_cls or StreamParser)(reader)   503     return iter(parser)   504    505 # vim: tabstop=4 expandtab shiftwidth=4