vContent (file vContent.py at 7f92fa2ec22c)

     1 #!/usr/bin/env python     2      3 """     4 Parsing of vCard, vCalendar and iCalendar files.     5      6 Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU Lesser General Public License as published by the Free    10 Software Foundation; either version 3 of the License, or (at your option) any    11 later version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more    16 details.    17     18 You should have received a copy of the GNU Lesser General Public License along    19 with this program.  If not, see <http://www.gnu.org/licenses/>.    20     21 --------    22     23 References:    24     25 RFC 2445: Internet Calendaring and Scheduling Core Object Specification    26           (iCalendar)    27           http://rfc.net/rfc2445.html    28     29 RFC 2425: A MIME Content-Type for Directory Information    30           http://rfc.net/rfc2425.html    31     32 RFC 2426: vCard MIME Directory Profile    33           http://rfc.net/rfc2426.html    34 """    35     36 try:    37     set    38 except NameError:    39     from sets import Set as set    40     41 # Encoding-related imports.    42     43 import base64, quopri    44 import codecs    45     46 # Tokenisation help.    47     48 import re    49     50 # Configuration.    51     52 default_encoding = "utf-8"    53     54 # Reader and parser classes.    55     56 class Reader:    57     58     "A simple class wrapping a file, providing simple pushback capabilities."    59     60     def __init__(self, f, non_standard_newline=0):    61     62         """    63         Initialise the object with the file 'f'. If 'non_standard_newline' is    64         set to a true value (unlike the default), lines ending with CR will be    65         treated as complete lines.    66         """    67     68         self.f = f    69         self.non_standard_newline = non_standard_newline    70         self.lines = []    71         self.line_number = 1 # about to read line 1    72     73     def close(self):    74     75         "Close the reader."    76     77         self.f.close()    78     79     def pushback(self, line):    80     81         """    82         Push the given 'line' back so that the next line read is actually the    83         given 'line' and not the next line from the underlying file.    84         """    85     86         self.lines.append(line)    87         self.line_number -= 1    88     89     def readline(self):    90     91         """    92         If no pushed-back lines exist, read a line directly from the file.    93         Otherwise, read from the list of pushed-back lines.    94         """    95     96         self.line_number += 1    97         if self.lines:    98             return self.lines.pop()    99         else:   100             # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).   101             line = self.f.readline()   102             while line.endswith("\r") and not self.non_standard_newline:   103                 line += self.f.readline()   104             if line.endswith("\r") and self.non_standard_newline:   105                 return line + "\n"   106             else:   107                 return line   108    109     def read_content_line(self):   110    111         """   112         Read an entire content line, itself potentially consisting of many   113         physical lines of text.   114         """   115    116         # Skip blank lines.   117    118         line = self.readline()   119         while line:   120             line_stripped = line.rstrip("\r\n")   121             if not line_stripped:   122                 line = self.readline()   123             else:   124                 break   125         else:   126             return ""   127    128         # Strip all appropriate whitespace from the right end of each line.   129         # For subsequent lines, remove the first whitespace character.   130         # See section 4.1 of the iCalendar specification.   131    132         lines = [line_stripped]   133    134         line = self.readline()   135         while line.startswith(" ") or line.startswith("\t"):   136             lines.append(line[1:].rstrip("\r\n"))   137             line = self.readline()   138    139         # Since one line too many will have been read, push the line back into   140         # the file.   141    142         if line:   143             self.pushback(line)   144    145         return "".join(lines)   146    147     def get_content_line(self):   148    149         "Return a content line object for the current line."   150    151         return ContentLine(self.read_content_line())   152    153 class ContentLine:   154    155     "A content line which can be searched."   156    157     SEPARATORS = re.compile('[;:"]')   158     SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')   159    160     def __init__(self, text):   161         self.text = text   162         self.start = 0   163    164     def get_remaining(self):   165    166         "Get the remaining text from the content line."   167    168         return self.text[self.start:]   169    170     def search(self, targets):   171    172         """   173         Find one of the 'targets' in the text, returning the string from the   174         current position up to the target found, along with the target string,   175         using a tuple of the form (string, target). If no target was found,   176         return the entire string together with a target of None.   177         """   178    179         text = self.text   180         start = pos = self.start   181         length = len(text)   182    183         # Remember the first target.   184    185         first = None   186         first_pos = None   187         in_quoted_region = 0   188    189         # Process the text, looking for the targets.   190    191         while pos < length:   192             match = targets.search(text, pos)   193    194             # Where nothing matches, end the search.   195    196             if match is None:   197                 pos = length   198    199             # Where a double quote matches, toggle the region state.   200    201             elif match.group() == '"':   202                 in_quoted_region = not in_quoted_region   203                 pos = match.end()   204    205             # Where something else matches outside a region, stop searching.   206    207             elif not in_quoted_region:   208                 first = match.group()   209                 first_pos = match.start()   210                 break   211    212             # Otherwise, keep looking for the end of the region.   213    214             else:   215                 pos = match.end()   216    217         # Where no more input can provide the targets, return a special result.   218    219         else:   220             self.start = length   221             return text[start:], None   222    223         self.start = match.end()   224         return text[start:first_pos], first   225    226 class StreamParser:   227    228     "A stream parser for content in vCard/vCalendar/iCalendar-like formats."   229    230     def __init__(self, f):   231    232         "Initialise the parser for the given file 'f'."   233    234         self.f = f   235    236     def close(self):   237    238         "Close the reader."   239    240         self.f.close()   241    242     def __iter__(self):   243    244         "Return self as the iterator."   245    246         return self   247    248     def next(self):   249    250         """   251         Return the next content item in the file as a tuple of the form   252         (name, parameters, values).   253         """   254    255         return self.parse_content_line()   256    257     def decode_content(self, value):   258    259         "Decode the given 'value', replacing quoted characters."   260    261         return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")   262    263     # Internal methods.   264    265     def parse_content_line(self):   266    267         """   268         Return the name, parameters and value information for the current   269         content line in the file being parsed.   270         """   271    272         f = self.f   273         line_number = f.line_number   274         line = f.get_content_line()   275    276         # Read the property name.   277    278         name, sep = line.search(line.SEPARATORS)   279         name = name.strip()   280    281         if not name and sep is None:   282             raise StopIteration   283    284         # Read the parameters.   285    286         parameters = {}   287    288         while sep == ";":   289    290             # Find the actual modifier.   291    292             parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)   293             parameter_name = parameter_name.strip()   294    295             if sep == "=":   296                 parameter_value, sep = line.search(line.SEPARATORS)   297                 parameter_value = parameter_value.strip()   298             else:   299                 parameter_value = None   300    301             # Append a key, value tuple to the parameters list.   302    303             parameters[parameter_name] = parameter_value   304    305         # Get the value content.   306    307         if sep != ":":   308             raise ValueError, line_number   309    310         # Obtain and decode the value.   311    312         value = self.decode(name, parameters, line.get_remaining())   313    314         return name, parameters, value   315    316     def decode(self, name, parameters, value):   317    318         "Decode using 'name' and 'parameters' the given 'value'."   319    320         encoding = parameters.get("ENCODING")   321         charset = parameters.get("CHARSET")   322    323         value = self.decode_content(value)   324    325         if encoding == "QUOTED-PRINTABLE":   326             return unicode(quopri.decodestring(value), charset or "iso-8859-1")   327         elif encoding == "BASE64":   328             return base64.decodestring(value)   329         else:   330             return value   331    332 class ParserBase:   333    334     "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."   335    336     def __init__(self):   337    338         "Initialise the parser."   339    340         self.names = []   341    342     def parse(self, f, parser_cls=None):   343    344         "Parse the contents of the file 'f'."   345    346         parser = (parser_cls or StreamParser)(f)   347    348         for name, parameters, value in parser:   349    350             if name == "BEGIN":   351                 self.names.append(value)   352                 self.startComponent(value, parameters)   353    354             elif name == "END":   355                 start_name = self.names.pop()   356                 if start_name != value:   357                     raise ParseError, "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (   358                         start_name, value, f.line_number)   359    360                 self.endComponent(value)   361    362             else:   363                 self.handleProperty(name, parameters, value)   364    365 class Parser(ParserBase):   366    367     "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."   368    369     def __init__(self):   370         ParserBase.__init__(self)   371         self.components = []   372    373     def startComponent(self, name, parameters):   374    375         """   376         Add the component with the given 'name' and 'parameters', recording an   377         empty list of children as part of the component's content.   378         """   379    380         component = self.handleProperty(name, parameters, [])   381         self.components.append(component)   382         return component   383    384     def endComponent(self, name):   385    386         """   387         End the component with the given 'name' by removing it from the active   388         component stack.   389         """   390    391         if len(self.components) > 1:   392             return self.components.pop()   393         elif self.components:   394             return self.components[-1]   395    396     def handleProperty(self, name, parameters, value):   397    398         """   399         Record the property with the given 'name', 'parameters' and 'value' as   400         part of the current component's children.   401         """   402    403         component = self.makeComponent(name, parameters, value)   404         self.attachComponent(component)   405         return component   406    407     # Component object construction/manipulation methods.   408    409     def attachComponent(self, component):   410    411         "Attach the given 'component' to its parent."   412    413         if self.components:   414             component_name, component_parameters, component_children = self.components[-1]   415             component_children.append(component)   416    417     def makeComponent(self, name, parameters, value):   418    419         """   420         Make a component object from the given 'name', 'parameters' and 'value'.   421         """   422    423         return (name, parameters, value)   424    425     # Public methods.   426    427     def parse(self, f, parser_cls=None):   428    429         "Parse the contents of the file 'f'."   430    431         ParserBase.parse(self, f, parser_cls)   432         return self.components[0]   433    434 # Writer classes.   435    436 class Writer:   437    438     "A simple class wrapping a file, providing simple output capabilities."   439    440     default_line_length = 76   441    442     def __init__(self, f, line_length=None):   443    444         """   445         Initialise the object with the file 'f'. If 'line_length' is set, the   446         length of written lines will conform to the specified value instead of   447         the default value.    448         """   449    450         self.f = f   451         self.line_length = line_length or self.default_line_length   452         self.char_offset = 0   453    454     def close(self):   455    456         "Close the writer."   457    458         self.f.close()   459    460     def write(self, text):   461    462         "Write the 'text' to the file."   463    464         f = self.f   465         line_length = self.line_length   466    467         i = 0   468         remaining = len(text)   469    470         while remaining:   471             space = line_length - self.char_offset   472             if remaining > space:   473                 f.write(text[i:i + space])   474                 f.write("\r\n ")   475                 self.char_offset = 1   476                 i += space   477                 remaining -= space   478             else:   479                 f.write(text[i:])   480                 self.char_offset += remaining   481                 i += remaining   482                 remaining = 0   483    484     def end_line(self):   485    486         "End the current content line."   487    488         if self.char_offset > 0:   489             self.char_offset = 0   490             self.f.write("\r\n")   491    492 class StreamWriter:   493    494     "A stream writer for content in vCard/vCalendar/iCalendar-like formats."   495    496     def __init__(self, f):   497    498         "Initialise the parser for the given file 'f'."   499    500         self.f = f   501    502     def close(self):   503    504         "Close the writer."   505    506         self.f.close()   507    508     def write_content_line(self, name, parameters, value):   509    510         """   511         Write a content line for the given 'name', 'parameters' and 'value'   512         information.   513         """   514    515         f = self.f   516    517         f.write(name)   518         for parameter_name, parameter_value in parameters.items():   519             f.write(";")   520             f.write(parameter_name)   521             f.write("=")   522             f.write(parameter_value)   523         f.write(":")   524         f.write(self.encode(name, parameters, value))   525         f.end_line()   526    527     def encode_content(self, value):   528    529         "Encode the given 'value', quoting characters."   530    531         return value.replace("\n", "\\n")   532    533     # Internal methods.   534    535     def encode(self, name, parameters, value):   536    537         "Encode using 'name' and 'parameters' the given 'value'."   538    539         encoding = parameters.get("ENCODING")   540         charset = parameters.get("CHARSET")   541    542         if encoding == "QUOTED-PRINTABLE":   543             value = quopri.encodestring(value.encode(charset or "iso-8859-1"))   544         elif encoding == "BASE64":   545             value = base64.encodestring(value)   546    547         return self.encode_content(value)   548    549 # Utility functions.   550    551 def is_input_stream(stream_or_string):   552     return hasattr(stream_or_string, "read")   553    554 def get_input_stream(stream_or_string):   555     if is_input_stream(stream_or_string):   556         return stream_or_string   557     else:   558         return codecs.open(stream_or_string, encoding=default_encoding)   559    560 def get_output_stream(stream_or_string):   561     if hasattr(stream_or_string, "write"):   562         return stream_or_string   563     else:   564         return codecs.open(stream_or_string, "w", encoding=default_encoding)   565    566 # Public functions.   567    568 def parse(stream_or_string, non_standard_newline=0, parser_cls=None):   569    570     """   571     Parse the resource data found through the use of the 'stream_or_string',   572     which is either a stream providing Unicode data (the codecs module can be   573     used to open files or to wrap streams in order to provide Unicode data) or a   574     filename identifying a file to be parsed.   575    576     The optional 'non_standard_newline' can be set to a true value (unlike the   577     default) in order to attempt to process files with CR as the end of line   578     character.   579    580     As a result of parsing the resource, the root node of the imported resource   581     is returned.   582     """   583    584     stream = get_input_stream(stream_or_string)   585     reader = Reader(stream, non_standard_newline)   586    587     # Parse using the reader.   588    589     try:   590         parser = (parser_cls or Parser)()   591         return parser.parse(reader)   592    593     # Close any opened streams.   594    595     finally:   596         if not is_input_stream(stream_or_string):   597             reader.close()   598    599 def iterparse(stream_or_string, non_standard_newline=0, parser_cls=None):   600    601     """   602     Parse the resource data found through the use of the 'stream_or_string',   603     which is either a stream providing Unicode data (the codecs module can be   604     used to open files or to wrap streams in order to provide Unicode data) or a   605     filename identifying a file to be parsed.   606    607     The optional 'non_standard_newline' can be set to a true value (unlike the   608     default) in order to attempt to process files with CR as the end of line   609     character.   610    611     An iterator is returned which provides event tuples describing parsing   612     events of the form (name, parameters, value).   613     """   614    615     stream = get_input_stream(stream_or_string)   616     reader = Reader(stream, non_standard_newline)   617     parser = (parser_cls or StreamParser)(reader)   618     return parser   619    620 def iterwrite(stream_or_string, line_length=None, writer_cls=None):   621     stream = get_output_stream(stream_or_string)   622     _writer = Writer(stream, line_length)   623     writer = (writer_cls or StreamWriter)(_writer)   624     return writer   625    626 # vim: tabstop=4 expandtab shiftwidth=4