vContent

vContent.py

15:7668ad359843
2011-07-18 Paul Boddie Added documentation and packaging-related files. Added some docstrings to the tests.
     1 #!/usr/bin/env python     2      3 """     4 Parsing of vCard, vCalendar and iCalendar files.     5      6 Copyright (C) 2005, 2006, 2007, 2008, 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20     21 --------    22     23 References:    24     25 RFC 2445: Internet Calendaring and Scheduling Core Object Specification    26           (iCalendar)    27           http://rfc.net/rfc2445.html    28     29 RFC 2425: A MIME Content-Type for Directory Information    30           http://rfc.net/rfc2425.html    31     32 RFC 2426: vCard MIME Directory Profile    33           http://rfc.net/rfc2426.html    34 """    35     36 try:    37     set    38 except NameError:    39     from sets import Set as set    40     41 # Encoding-related imports.    42     43 import base64, quopri    44 import codecs    45     46 # Tokenisation help.    47     48 import re    49     50 # Configuration.    51     52 default_encoding = "utf-8"    53     54 # Reader and parser classes.    55     56 class Reader:    57     58     "A simple class wrapping a file, providing simple pushback capabilities."    59     60     def __init__(self, f, non_standard_newline=0):    61     62         """    63         Initialise the object with the file 'f'. If 'non_standard_newline' is    64         set to a true value (unlike the default), lines ending with CR will be    65         treated as complete lines.    66         """    67     68         self.f = f    69         self.non_standard_newline = non_standard_newline    70         self.lines = []    71         self.line_number = 1 # about to read line 1    72     73     def close(self):    74     75         "Close the reader."    76     77         self.f.close()    78     79     def pushback(self, line):    80     81         """    82         Push the given 'line' back so that the next line read is actually the    83         given 'line' and not the next line from the underlying file.    84         """    85     86         self.lines.append(line)    87         self.line_number -= 1    88     89     def readline(self):    90     91         """    92         If no pushed-back lines exist, read a line directly from the file.    93         Otherwise, read from the list of pushed-back lines.    94         """    95     96         self.line_number += 1    97         if self.lines:    98             return self.lines.pop()    99         else:   100             # Sanity check for broken lines (\r instead of \r\n or \n).   101             line = self.f.readline()   102             while line.endswith("\r") and not self.non_standard_newline:   103                 line += self.f.readline()   104             if line.endswith("\r") and self.non_standard_newline:   105                 return line + "\n"   106             else:   107                 return line   108    109     def read_content_line(self):   110    111         """   112         Read an entire content line, itself potentially consisting of many   113         physical lines of text, returning a string.   114         """   115    116         # Skip blank lines.   117    118         line = self.readline()   119         while line:   120             line_stripped = line.rstrip("\r\n")   121             if not line_stripped:   122                 line = self.readline()   123             else:   124                 break   125         else:   126             return ""   127    128         # Strip all appropriate whitespace from the right end of each line.   129         # For subsequent lines, remove the first whitespace character.   130         # See section 4.1 of the iCalendar specification.   131    132         lines = [line_stripped]   133    134         line = self.readline()   135         while line.startswith(" ") or line.startswith("\t"):   136             lines.append(line[1:].rstrip("\r\n"))   137             line = self.readline()   138    139         # Since one line too many will have been read, push the line back into   140         # the file.   141    142         if line:   143             self.pushback(line)   144    145         return "".join(lines)   146    147     def get_content_line(self):   148    149         "Return a content line object for the current line."   150    151         return ContentLine(self.read_content_line())   152    153 class ContentLine:   154    155     "A content line which can be searched."   156    157     SEPARATORS = re.compile('[;:"]')   158     SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')   159    160     def __init__(self, text):   161         self.text = text   162         self.start = 0   163    164     def get_remaining(self):   165    166         "Get the remaining text from the content line."   167    168         return self.text[self.start:]   169    170     def search(self, targets):   171    172         """   173         Find one of the 'targets' in the text, returning the string from the   174         current position up to the target found, along with the target string,   175         using a tuple of the form (string, target). If no target was found,   176         return the entire string together with a target of None.   177    178         The 'targets' parameter must be a regular expression object or an object   179         compatible with the API of such objects.   180         """   181    182         text = self.text   183         start = pos = self.start   184         length = len(text)   185    186         # Remember the first target.   187    188         first = None   189         first_pos = None   190         in_quoted_region = 0   191    192         # Process the text, looking for the targets.   193    194         while pos < length:   195             match = targets.search(text, pos)   196    197             # Where nothing matches, end the search.   198    199             if match is None:   200                 pos = length   201    202             # Where a double quote matches, toggle the region state.   203    204             elif match.group() == '"':   205                 in_quoted_region = not in_quoted_region   206                 pos = match.end()   207    208             # Where something else matches outside a region, stop searching.   209    210             elif not in_quoted_region:   211                 first = match.group()   212                 first_pos = match.start()   213                 break   214    215             # Otherwise, keep looking for the end of the region.   216    217             else:   218                 pos = match.end()   219    220         # Where no more input can provide the targets, return a special result.   221    222         else:   223             self.start = length   224             return text[start:], None   225    226         self.start = match.end()   227         return text[start:first_pos], first   228    229 class StreamParser:   230    231     "A stream parser for content in vCard/vCalendar/iCalendar-like formats."   232    233     def __init__(self, f):   234    235         "Initialise the parser for the given file 'f'."   236    237         self.f = f   238    239     def close(self):   240    241         "Close the reader."   242    243         self.f.close()   244    245     def __iter__(self):   246    247         "Return self as the iterator."   248    249         return self   250    251     def next(self):   252    253         """   254         Return the next content item in the file as a tuple of the form   255         (name, parameters, values).   256         """   257    258         return self.parse_content_line()   259    260     def decode_content(self, value):   261    262         "Decode the given 'value', replacing quoted characters."   263    264         return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")   265    266     # Internal methods.   267    268     def parse_content_line(self):   269    270         """   271         Return the name, parameters and value information for the current   272         content line in the file being parsed.   273         """   274    275         f = self.f   276         line_number = f.line_number   277         line = f.get_content_line()   278    279         # Read the property name.   280    281         name, sep = line.search(line.SEPARATORS)   282         name = name.strip()   283    284         if not name and sep is None:   285             raise StopIteration   286    287         # Read the parameters.   288    289         parameters = {}   290    291         while sep == ";":   292    293             # Find the actual modifier.   294    295             parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)   296             parameter_name = parameter_name.strip()   297    298             if sep == "=":   299                 parameter_value, sep = line.search(line.SEPARATORS)   300                 parameter_value = parameter_value.strip()   301             else:   302                 parameter_value = None   303    304             # Append a key, value tuple to the parameters list.   305    306             parameters[parameter_name] = parameter_value   307    308         # Get the value content.   309    310         if sep != ":":   311             raise ValueError, line_number   312    313         # Obtain and decode the value.   314    315         value = self.decode(name, parameters, line.get_remaining())   316    317         return name, parameters, value   318    319     def decode(self, name, parameters, value):   320    321         "Decode using 'name' and 'parameters' the given 'value'."   322    323         encoding = parameters.get("ENCODING")   324         charset = parameters.get("CHARSET")   325    326         value = self.decode_content(value)   327    328         if encoding == "QUOTED-PRINTABLE":   329             return unicode(quopri.decodestring(value), charset or "iso-8859-1")   330         elif encoding == "BASE64":   331             return base64.decodestring(value)   332         else:   333             return value   334    335 class ParserBase:   336    337     "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."   338    339     def __init__(self):   340    341         "Initialise the parser."   342    343         self.names = []   344    345     def parse(self, f, parser_cls=None):   346    347         "Parse the contents of the file 'f'."   348    349         parser = (parser_cls or StreamParser)(f)   350    351         for name, parameters, value in parser:   352    353             if name == "BEGIN":   354                 self.names.append(value)   355                 self.startComponent(value, parameters)   356    357             elif name == "END":   358                 start_name = self.names.pop()   359                 if start_name != value:   360                     raise ParseError, "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (   361                         start_name, value, f.line_number)   362    363                 self.endComponent(value)   364    365             else:   366                 self.handleProperty(name, parameters, value)   367    368 class Parser(ParserBase):   369    370     "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."   371    372     def __init__(self):   373         ParserBase.__init__(self)   374         self.components = []   375    376     def startComponent(self, name, parameters):   377    378         """   379         Add the component with the given 'name' and 'parameters', recording an   380         empty list of children as part of the component's content.   381         """   382    383         component = self.handleProperty(name, parameters)   384         self.components.append(component)   385         return component   386    387     def endComponent(self, name):   388    389         """   390         End the component with the given 'name' by removing it from the active   391         component stack. If only one component exists on the stack, retain it   392         for later inspection.   393         """   394    395         if len(self.components) > 1:   396             return self.components.pop()   397    398         # Or return the only element.   399    400         elif self.components:   401             return self.components[0]   402    403     def handleProperty(self, name, parameters, value=None):   404    405         """   406         Record the property with the given 'name', 'parameters' and optional   407         'value' as part of the current component's children.   408         """   409    410         component = self.makeComponent(name, parameters, value)   411         self.attachComponent(component)   412         return component   413    414     # Component object construction/manipulation methods.   415    416     def attachComponent(self, component):   417    418         "Attach the given 'component' to its parent."   419    420         if self.components:   421             component_name, component_parameters, component_children = self.components[-1]   422             component_children.append(component)   423    424     def makeComponent(self, name, parameters, value=None):   425    426         """   427         Make a component object from the given 'name', 'parameters' and optional   428         'value'.   429         """   430    431         return (name, parameters, value or [])   432    433     # Public methods.   434    435     def parse(self, f, parser_cls=None):   436    437         "Parse the contents of the file 'f'."   438    439         ParserBase.parse(self, f, parser_cls)   440         return self.components[0]   441    442 # Writer classes.   443    444 class Writer:   445    446     "A simple class wrapping a file, providing simple output capabilities."   447    448     default_line_length = 76   449    450     def __init__(self, f, line_length=None):   451    452         """   453         Initialise the object with the file 'f'. If 'line_length' is set, the   454         length of written lines will conform to the specified value instead of   455         the default value.    456         """   457    458         self.f = f   459         self.line_length = line_length or self.default_line_length   460         self.char_offset = 0   461    462     def close(self):   463    464         "Close the writer."   465    466         self.f.close()   467    468     def write(self, text):   469    470         "Write the 'text' to the file."   471    472         f = self.f   473         line_length = self.line_length   474    475         i = 0   476         remaining = len(text)   477    478         while remaining:   479             space = line_length - self.char_offset   480             if remaining > space:   481                 f.write(text[i:i + space])   482                 f.write("\r\n ")   483                 self.char_offset = 1   484                 i += space   485                 remaining -= space   486             else:   487                 f.write(text[i:])   488                 self.char_offset += remaining   489                 i += remaining   490                 remaining = 0   491    492     def end_line(self):   493    494         "End the current content line."   495    496         if self.char_offset > 0:   497             self.char_offset = 0   498             self.f.write("\r\n")   499    500 class StreamWriter:   501    502     "A stream writer for content in vCard/vCalendar/iCalendar-like formats."   503    504     def __init__(self, f):   505    506         "Initialise the parser for the given file 'f'."   507    508         self.f = f   509    510     def close(self):   511    512         "Close the writer."   513    514         self.f.close()   515    516     def write(self, name, parameters, value):   517    518         """   519         Write a content line, serialising the given 'name', 'parameters' and   520         'value' information.   521         """   522    523         self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))   524    525     # Internal methods.   526    527     def write_content_line(self, name, encoded_parameters, encoded_value):   528    529         """   530         Write a content line for the given 'name', 'encoded_parameters' and   531         'encoded_value' information.   532         """   533    534         f = self.f   535    536         f.write(name)   537         for param_name, param_value in encoded_parameters.items():   538             f.write(";")   539             f.write(param_name)   540             f.write("=")   541             f.write(param_value)   542         f.write(":")   543         f.write(encoded_value)   544         f.end_line()   545    546     def encode_quoted_parameter_value(self, value):   547    548         "Encode the given 'value'."   549    550         return '"%s"' % value   551    552     def encode_value(self, name, parameters, value):   553    554         """   555         Encode using 'name' and 'parameters' the given 'value' so that the   556         resulting encoded form employs any specified character encodings.   557         """   558    559         encoding = parameters.get("ENCODING")   560         charset = parameters.get("CHARSET")   561    562         if encoding == "QUOTED-PRINTABLE":   563             value = quopri.encodestring(value.encode(charset or "iso-8859-1"))   564         elif encoding == "BASE64":   565             value = base64.encodestring(value)   566    567         return self.encode_content(value)   568    569     # Overrideable methods.   570    571     def encode_parameters(self, parameters):   572    573         """   574         Encode the given 'parameters' according to the vCalendar specification.   575         """   576    577         encoded_parameters = {}   578    579         for param_name, param_value in parameters.items():   580    581             # Basic format support merely involves quoting values which seem to   582             # need it. Other more specific formats may define exactly which   583             # parameters should be quoted.   584    585             if ContentLine.SEPARATORS.search(param_value):   586                 param_value = self.encode_quoted_parameter_value(param_value)   587    588             encoded_parameters[param_name] = param_value   589    590         return encoded_parameters   591    592     def encode_content(self, value):   593    594         "Encode the given 'value', quoting characters."   595    596         return value.replace("\n", "\\n")   597    598 # Utility functions.   599    600 def is_input_stream(stream_or_string):   601     return hasattr(stream_or_string, "read")   602    603 def get_input_stream(stream_or_string, encoding=None):   604     if is_input_stream(stream_or_string):   605         return stream_or_string   606     else:   607         return codecs.open(stream_or_string, encoding=(encoding or default_encoding))   608    609 def get_output_stream(stream_or_string, encoding=None):   610     if hasattr(stream_or_string, "write"):   611         return stream_or_string   612     else:   613         return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))   614    615 # Public functions.   616    617 def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):   618    619     """   620     Parse the resource data found through the use of the 'stream_or_string',   621     which is either a stream providing Unicode data (the codecs module can be   622     used to open files or to wrap streams in order to provide Unicode data) or a   623     filename identifying a file to be parsed.   624    625     The optional 'encoding' can be used to specify the character encoding used   626     by the file to be parsed.   627    628     The optional 'non_standard_newline' can be set to a true value (unlike the   629     default) in order to attempt to process files with CR as the end of line   630     character.   631    632     As a result of parsing the resource, the root node of the imported resource   633     is returned.   634     """   635    636     stream = get_input_stream(stream_or_string, encoding)   637     reader = Reader(stream, non_standard_newline)   638    639     # Parse using the reader.   640    641     try:   642         parser = (parser_cls or Parser)()   643         return parser.parse(reader)   644    645     # Close any opened streams.   646    647     finally:   648         if not is_input_stream(stream_or_string):   649             reader.close()   650    651 def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):   652    653     """   654     Parse the resource data found through the use of the 'stream_or_string',   655     which is either a stream providing Unicode data (the codecs module can be   656     used to open files or to wrap streams in order to provide Unicode data) or a   657     filename identifying a file to be parsed.   658    659     The optional 'encoding' can be used to specify the character encoding used   660     by the file to be parsed.   661    662     The optional 'non_standard_newline' can be set to a true value (unlike the   663     default) in order to attempt to process files with CR as the end of line   664     character.   665    666     An iterator is returned which provides event tuples describing parsing   667     events of the form (name, parameters, value).   668     """   669    670     stream = get_input_stream(stream_or_string, encoding)   671     reader = Reader(stream, non_standard_newline)   672     parser = (parser_cls or StreamParser)(reader)   673     return parser   674    675 def iterwrite(stream_or_string, encoding=None, line_length=None, writer_cls=None):   676    677     """   678     Return a writer which will send data to the resource found through the use   679     of 'stream_or_string', which is either a stream accepting Unicode data (the   680     codecs module can be used to open files or to wrap streams in order to   681     accept Unicode data) or a filename identifying a file to be parsed.   682    683     The optional 'encoding' can be used to specify the character encoding used   684     by the file to be written.   685    686     The optional 'line_length' can be used to specify how long lines should be   687     in the resulting data.   688     """   689    690     stream = get_output_stream(stream_or_string, encoding)   691     _writer = Writer(stream, line_length)   692     writer = (writer_cls or StreamWriter)(_writer)   693     return writer   694    695 # vim: tabstop=4 expandtab shiftwidth=4