vContent (file vContent.py at 3fdf59812622)

     1 #!/usr/bin/env python     2      3 """     4 Parsing of vCard, vCalendar and iCalendar files.     5      6 Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU Lesser General Public License as published by the Free    10 Software Foundation; either version 3 of the License, or (at your option) any    11 later version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more    16 details.    17     18 You should have received a copy of the GNU Lesser General Public License along    19 with this program.  If not, see <http://www.gnu.org/licenses/>.    20     21 --------    22     23 References:    24     25 RFC 2445: Internet Calendaring and Scheduling Core Object Specification    26           (iCalendar)    27           http://rfc.net/rfc2445.html    28     29 RFC 2425: A MIME Content-Type for Directory Information    30           http://rfc.net/rfc2425.html    31     32 RFC 2426: vCard MIME Directory Profile    33           http://rfc.net/rfc2426.html    34 """    35     36 try:    37     set    38 except NameError:    39     from sets import Set as set    40     41 # Encoding-related imports.    42     43 import base64, quopri    44     45 # Tokenisation help.    46     47 import re    48     49 # Simple reader class.    50     51 class Reader:    52     53     "A simple class wrapping a file, providing simple pushback capabilities."    54     55     SEPARATORS = re.compile('[;:"]')    56     SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')    57     58     def __init__(self, f, non_standard_newline=0):    59     60         """    61         Initialise the object with the file 'f'. If 'non_standard_newline' is    62         set to a true value (unlike the default), lines ending with CR will be    63         treated as complete lines.    64         """    65     66         self.f = f    67         self.non_standard_newline = non_standard_newline    68         self.lines = []    69         self.line_number = 0    70     71     def pushback(self, line):    72     73         """    74         Push the given 'line' back so that the next line read is actually the    75         given 'line' and not the next line from the underlying file.    76         """    77     78         self.lines.append(line)    79         self.line_number -= 1    80     81     def readline(self):    82     83         """    84         If no pushed-back lines exist, read a line directly from the file.    85         Otherwise, read from the list of pushed-back lines.    86         """    87     88         self.line_number += 1    89         if self.lines:    90             return self.lines.pop()    91         else:    92             # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).    93             line = self.f.readline()    94             while line.endswith("\r") and not self.non_standard_newline:    95                 line += self.f.readline()    96             if line.endswith("\r") and self.non_standard_newline:    97                 return line + "\n"    98             else:    99                 return line   100    101     def read_until(self, targets):   102    103         """   104         Read from the stream until one of the 'targets' is seen. Return the   105         string from the current position up to the target found, along with the   106         target string, using a tuple of the form (string, target). If no target   107         was found, return the entire string together with a target of None.   108         """   109    110         # Remember the entire text read and the index of the current line in   111         # that text.   112    113         lines = []   114    115         line = self.readline()   116         lines.append(line)   117         start = 0   118    119         # Remember the first target.   120    121         first = None   122         first_pos = None   123         in_quoted_region = 0   124    125         # Process each line, looking for the targets.   126    127         while line != "":   128             match = targets.search(line, start)   129    130             # Where nothing matches, get the next line.   131    132             if match is None:   133                 line = self.readline()   134                 lines.append(line)   135                 start = 0   136    137             # Where a double quote matches, toggle the region state.   138    139             elif match.group() == '"':   140                 in_quoted_region = not in_quoted_region   141                 start = match.end()   142    143             # Where something else matches outside a region, stop searching.   144    145             elif not in_quoted_region:   146                 first = match.group()   147                 first_pos = match.start()   148                 break   149    150             # Otherwise, keep looking for the end of the region.   151    152             else:   153                 start = match.end()   154    155         # Where no more input can provide the targets, return a special result.   156    157         else:   158             text = "".join(lines)   159             return text, None   160    161         # Push back the text after the target.   162    163         after_target = lines[-1][first_pos + len(first):]   164         self.pushback(after_target)   165    166         # Produce the lines until the matching line, together with the portion   167         # of the matching line before the target.   168    169         lines[-1] = lines[-1][:first_pos]   170         text = "".join(lines)   171         return text, first   172    173 class StreamParser:   174    175     "A stream parser for content in vCard/vCalendar/iCalendar-like formats."   176    177     def __init__(self, f):   178    179         "Initialise the parser for the given file 'f'."   180    181         self.f = f   182    183     def __iter__(self):   184    185         "Return self as the iterator."   186    187         return self   188    189     def next(self):   190    191         """   192         Return the next content item in the file as a tuple of the form   193         (name, parameters, values).   194         """   195    196         return self.parse_content_line()   197    198     # Internal methods.   199    200     def parse_content_line(self):   201    202         """   203         Return the name, parameters and a list containing value information for   204         the current content line in the file being parsed.   205         """   206    207         f = self.f   208    209         parameters = {}   210         name, sep = f.read_until(f.SEPARATORS)   211    212         name = name.strip()   213    214         if not name and sep is None:   215             raise StopIteration   216    217         while sep == ";":   218    219             # Find the actual modifier.   220    221             parameter_name, sep = f.read_until(f.SEPARATORS_PLUS_EQUALS)   222             parameter_name = parameter_name.strip()   223    224             if sep == "=":   225                 parameter_value, sep = f.read_until(f.SEPARATORS)   226                 parameter_value = parameter_value.strip()   227             else:   228                 parameter_value = None   229    230             # Append a key, value tuple to the parameters list.   231    232             parameters[parameter_name] = parameter_value   233    234         # Get the value content.   235    236         if sep != ":":   237             raise ValueError, f.line_number   238    239         # Strip all appropriate whitespace from the right end of each line.   240         # For subsequent lines, remove the first whitespace character.   241         # See section 4.1 of the iCalendar specification.   242    243         line = f.readline()   244         value_lines = [line.rstrip("\r\n")]   245         line = f.readline()   246         while line != "" and line[0] in [" ", "\t"]:   247             value_lines.append(line.rstrip("\r\n")[1:])   248             line = f.readline()   249    250         # Since one line too many will have been read, push the line back into the   251         # file.   252    253         f.pushback(line)   254    255         # Decode the value.   256    257         value = self.decode("".join(value_lines), parameters)   258    259         return name, parameters, value   260    261     def decode(self, value, parameters):   262    263         "Decode the 'value' using the given 'parameters'."   264    265         encoding = parameters.get("ENCODING")   266         charset = parameters.get("CHARSET")   267    268         # NOTE: Introducing newline conversions.   269         # Replace quoted characters (see 4.3.11 in RFC 2445).   270    271         value = value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n").replace("\\,", ",").replace("\\;", ";")   272    273         if encoding == "QUOTED-PRINTABLE":   274             return unicode(quopri.decodestring(value), charset or "iso-8859-1")   275         elif encoding == "BASE64":   276             return base64.decodestring(value)   277         else:   278             return value   279    280 class ParserBase:   281    282     "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."   283    284     def __init__(self):   285    286         "Initialise the parser."   287    288         self.names = []   289    290     def parse(self, f, parser_cls=None):   291    292         "Parse the contents of the file 'f'."   293    294         parser = (parser_cls or StreamParser)(f)   295    296         for name, parameters, value in parser:   297    298             if name == "BEGIN":   299                 self.names.append(value)   300                 self.startComponent(value, parameters)   301    302             elif name == "END":   303                 start_name = self.names.pop()   304                 if start_name != value:   305                     raise ParseError, "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (   306                         start_name, value, f.line_number)   307    308                 self.endComponent(value)   309    310             else:   311                 self.handleProperty(name, parameters, value)   312    313 class Parser(ParserBase):   314    315     "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."   316    317     def __init__(self):   318         ParserBase.__init__(self)   319         self.components = []   320    321     def startComponent(self, name, parameters):   322    323         """   324         Add the component with the given 'name' and 'parameters', recording an   325         empty list of children as part of the component's content.   326         """   327    328         component = self.handleProperty(name, parameters, [])   329         self.components.append(component)   330         return component   331    332     def endComponent(self, name):   333    334         """   335         End the component with the given 'name' by removing it from the active   336         component stack.   337         """   338    339         if len(self.components) > 1:   340             return self.components.pop()   341         elif self.components:   342             return self.components[-1]   343    344     def handleProperty(self, name, parameters, value):   345    346         """   347         Record the property with the given 'name', 'parameters' and 'value' as   348         part of the current component's children.   349         """   350    351         component = self.makeComponent(name, parameters, value)   352         self.attachComponent(component)   353         return component   354    355     # Component object construction/manipulation methods.   356    357     def attachComponent(self, component):   358    359         "Attach the given 'component' to its parent."   360    361         if self.components:   362             component_name, component_parameters, component_children = self.components[-1]   363             component_children.append(component)   364    365     def makeComponent(self, name, parameters, value):   366    367         """   368         Make a component object from the given 'name', 'parameters' and 'value'.   369         """   370    371         return (name, parameters, value)   372    373     # Public methods.   374    375     def parse(self, f, parser_cls=None):   376    377         "Parse the contents of the file 'f'."   378    379         ParserBase.parse(self, f, parser_cls)   380         return self.components[0]   381    382 # Public functions.   383    384 def parse(f, non_standard_newline=0, parser_cls=None):   385    386     """   387     Parse the resource data found through the use of the file object 'f', which   388     should provide Unicode data. (The codecs module can be used to open files or   389     to wrap streams in order to provide Unicode data.)   390    391     The optional 'non_standard_newline' can be set to a true value (unlike the   392     default) in order to attempt to process files with CR as the end of line   393     character.   394    395     As a result of parsing the resource, the root node of the imported resource   396     is returned.   397     """   398    399     reader = Reader(f, non_standard_newline)   400     parser = (parser_cls or Parser)()   401     return parser.parse(reader)   402    403 def iterparse(f, non_standard_newline=0, parser_cls=None):   404    405     """   406     Parse the resource data found through the use of the file object 'f', which   407     should provide Unicode data. (The codecs module can be used to open files or   408     to wrap streams in order to provide Unicode data.)   409    410     The optional 'non_standard_newline' can be set to a true value (unlike the   411     default) in order to attempt to process files with CR as the end of line   412     character.   413    414     An iterator is returned which provides event tuples describing parsing   415     events of the form (name, parameters, value).   416     """   417    418     reader = Reader(f, non_standard_newline)   419     parser = (parser_cls or StreamParser)(reader)   420     return iter(parser)   421    422 # vim: tabstop=4 expandtab shiftwidth=4