vContent

vContent.py

11:428f343f0626
2009-03-14 Paul Boddie Changed the StreamWriter infrastructure in order to provide a more coherent mechanism for encoding data for output, employing the write method as the public method for sending data for output, and fixing the ALTREP output for the test iCalendar file. Introduced encoding parameters in the convenience functions. Added and improved docstrings. Made the streaming tests check the equivalence of the generated files and the original files.
     1 #!/usr/bin/env python     2      3 """     4 Parsing of vCard, vCalendar and iCalendar files.     5      6 Copyright (C) 2005, 2006, 2007, 2008, 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU Lesser General Public License as published by the Free    10 Software Foundation; either version 3 of the License, or (at your option) any    11 later version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more    16 details.    17     18 You should have received a copy of the GNU Lesser General Public License along    19 with this program.  If not, see <http://www.gnu.org/licenses/>.    20     21 --------    22     23 References:    24     25 RFC 2445: Internet Calendaring and Scheduling Core Object Specification    26           (iCalendar)    27           http://rfc.net/rfc2445.html    28     29 RFC 2425: A MIME Content-Type for Directory Information    30           http://rfc.net/rfc2425.html    31     32 RFC 2426: vCard MIME Directory Profile    33           http://rfc.net/rfc2426.html    34 """    35     36 try:    37     set    38 except NameError:    39     from sets import Set as set    40     41 # Encoding-related imports.    42     43 import base64, quopri    44 import codecs    45     46 # Tokenisation help.    47     48 import re    49     50 # Configuration.    51     52 default_encoding = "utf-8"    53     54 # Reader and parser classes.    55     56 class Reader:    57     58     "A simple class wrapping a file, providing simple pushback capabilities."    59     60     def __init__(self, f, non_standard_newline=0):    61     62         """    63         Initialise the object with the file 'f'. If 'non_standard_newline' is    64         set to a true value (unlike the default), lines ending with CR will be    65         treated as complete lines.    66         """    67     68         self.f = f    69         self.non_standard_newline = non_standard_newline    70         self.lines = []    71         self.line_number = 1 # about to read line 1    72     73     def close(self):    74     75         "Close the reader."    76     77         self.f.close()    78     79     def pushback(self, line):    80     81         """    82         Push the given 'line' back so that the next line read is actually the    83         given 'line' and not the next line from the underlying file.    84         """    85     86         self.lines.append(line)    87         self.line_number -= 1    88     89     def readline(self):    90     91         """    92         If no pushed-back lines exist, read a line directly from the file.    93         Otherwise, read from the list of pushed-back lines.    94         """    95     96         self.line_number += 1    97         if self.lines:    98             return self.lines.pop()    99         else:   100             # Sanity check for broken lines (\r instead of \r\n or \n).   101             line = self.f.readline()   102             while line.endswith("\r") and not self.non_standard_newline:   103                 line += self.f.readline()   104             if line.endswith("\r") and self.non_standard_newline:   105                 return line + "\n"   106             else:   107                 return line   108    109     def read_content_line(self):   110    111         """   112         Read an entire content line, itself potentially consisting of many   113         physical lines of text, returning a string.   114         """   115    116         # Skip blank lines.   117    118         line = self.readline()   119         while line:   120             line_stripped = line.rstrip("\r\n")   121             if not line_stripped:   122                 line = self.readline()   123             else:   124                 break   125         else:   126             return ""   127    128         # Strip all appropriate whitespace from the right end of each line.   129         # For subsequent lines, remove the first whitespace character.   130         # See section 4.1 of the iCalendar specification.   131    132         lines = [line_stripped]   133    134         line = self.readline()   135         while line.startswith(" ") or line.startswith("\t"):   136             lines.append(line[1:].rstrip("\r\n"))   137             line = self.readline()   138    139         # Since one line too many will have been read, push the line back into   140         # the file.   141    142         if line:   143             self.pushback(line)   144    145         return "".join(lines)   146    147     def get_content_line(self):   148    149         "Return a content line object for the current line."   150    151         return ContentLine(self.read_content_line())   152    153 class ContentLine:   154    155     "A content line which can be searched."   156    157     SEPARATORS = re.compile('[;:"]')   158     SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')   159    160     def __init__(self, text):   161         self.text = text   162         self.start = 0   163    164     def get_remaining(self):   165    166         "Get the remaining text from the content line."   167    168         return self.text[self.start:]   169    170     def search(self, targets):   171    172         """   173         Find one of the 'targets' in the text, returning the string from the   174         current position up to the target found, along with the target string,   175         using a tuple of the form (string, target). If no target was found,   176         return the entire string together with a target of None.   177    178         The 'targets' parameter must be a regular expression object or an object   179         compatible with the API of such objects.   180         """   181    182         text = self.text   183         start = pos = self.start   184         length = len(text)   185    186         # Remember the first target.   187    188         first = None   189         first_pos = None   190         in_quoted_region = 0   191    192         # Process the text, looking for the targets.   193    194         while pos < length:   195             match = targets.search(text, pos)   196    197             # Where nothing matches, end the search.   198    199             if match is None:   200                 pos = length   201    202             # Where a double quote matches, toggle the region state.   203    204             elif match.group() == '"':   205                 in_quoted_region = not in_quoted_region   206                 pos = match.end()   207    208             # Where something else matches outside a region, stop searching.   209    210             elif not in_quoted_region:   211                 first = match.group()   212                 first_pos = match.start()   213                 break   214    215             # Otherwise, keep looking for the end of the region.   216    217             else:   218                 pos = match.end()   219    220         # Where no more input can provide the targets, return a special result.   221    222         else:   223             self.start = length   224             return text[start:], None   225    226         self.start = match.end()   227         return text[start:first_pos], first   228    229 class StreamParser:   230    231     "A stream parser for content in vCard/vCalendar/iCalendar-like formats."   232    233     def __init__(self, f):   234    235         "Initialise the parser for the given file 'f'."   236    237         self.f = f   238    239     def close(self):   240    241         "Close the reader."   242    243         self.f.close()   244    245     def __iter__(self):   246    247         "Return self as the iterator."   248    249         return self   250    251     def next(self):   252    253         """   254         Return the next content item in the file as a tuple of the form   255         (name, parameters, values).   256         """   257    258         return self.parse_content_line()   259    260     def decode_content(self, value):   261    262         "Decode the given 'value', replacing quoted characters."   263    264         return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")   265    266     # Internal methods.   267    268     def parse_content_line(self):   269    270         """   271         Return the name, parameters and value information for the current   272         content line in the file being parsed.   273         """   274    275         f = self.f   276         line_number = f.line_number   277         line = f.get_content_line()   278    279         # Read the property name.   280    281         name, sep = line.search(line.SEPARATORS)   282         name = name.strip()   283    284         if not name and sep is None:   285             raise StopIteration   286    287         # Read the parameters.   288    289         parameters = {}   290    291         while sep == ";":   292    293             # Find the actual modifier.   294    295             parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)   296             parameter_name = parameter_name.strip()   297    298             if sep == "=":   299                 parameter_value, sep = line.search(line.SEPARATORS)   300                 parameter_value = parameter_value.strip()   301             else:   302                 parameter_value = None   303    304             # Append a key, value tuple to the parameters list.   305    306             parameters[parameter_name] = parameter_value   307    308         # Get the value content.   309    310         if sep != ":":   311             raise ValueError, line_number   312    313         # Obtain and decode the value.   314    315         value = self.decode(name, parameters, line.get_remaining())   316    317         return name, parameters, value   318    319     def decode(self, name, parameters, value):   320    321         "Decode using 'name' and 'parameters' the given 'value'."   322    323         encoding = parameters.get("ENCODING")   324         charset = parameters.get("CHARSET")   325    326         value = self.decode_content(value)   327    328         if encoding == "QUOTED-PRINTABLE":   329             return unicode(quopri.decodestring(value), charset or "iso-8859-1")   330         elif encoding == "BASE64":   331             return base64.decodestring(value)   332         else:   333             return value   334    335 class ParserBase:   336    337     "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."   338    339     def __init__(self):   340    341         "Initialise the parser."   342    343         self.names = []   344    345     def parse(self, f, parser_cls=None):   346    347         "Parse the contents of the file 'f'."   348    349         parser = (parser_cls or StreamParser)(f)   350    351         for name, parameters, value in parser:   352    353             if name == "BEGIN":   354                 self.names.append(value)   355                 self.startComponent(value, parameters)   356    357             elif name == "END":   358                 start_name = self.names.pop()   359                 if start_name != value:   360                     raise ParseError, "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (   361                         start_name, value, f.line_number)   362    363                 self.endComponent(value)   364    365             else:   366                 self.handleProperty(name, parameters, value)   367    368 class Parser(ParserBase):   369    370     "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."   371    372     def __init__(self):   373         ParserBase.__init__(self)   374         self.components = []   375    376     def startComponent(self, name, parameters):   377    378         """   379         Add the component with the given 'name' and 'parameters', recording an   380         empty list of children as part of the component's content.   381         """   382    383         component = self.handleProperty(name, parameters, [])   384         self.components.append(component)   385         return component   386    387     def endComponent(self, name):   388    389         """   390         End the component with the given 'name' by removing it from the active   391         component stack.   392         """   393    394         if len(self.components) > 1:   395             return self.components.pop()   396         elif self.components:   397             return self.components[-1]   398    399     def handleProperty(self, name, parameters, value):   400    401         """   402         Record the property with the given 'name', 'parameters' and 'value' as   403         part of the current component's children.   404         """   405    406         component = self.makeComponent(name, parameters, value)   407         self.attachComponent(component)   408         return component   409    410     # Component object construction/manipulation methods.   411    412     def attachComponent(self, component):   413    414         "Attach the given 'component' to its parent."   415    416         if self.components:   417             component_name, component_parameters, component_children = self.components[-1]   418             component_children.append(component)   419    420     def makeComponent(self, name, parameters, value):   421    422         """   423         Make a component object from the given 'name', 'parameters' and 'value'.   424         """   425    426         return (name, parameters, value)   427    428     # Public methods.   429    430     def parse(self, f, parser_cls=None):   431    432         "Parse the contents of the file 'f'."   433    434         ParserBase.parse(self, f, parser_cls)   435         return self.components[0]   436    437 # Writer classes.   438    439 class Writer:   440    441     "A simple class wrapping a file, providing simple output capabilities."   442    443     default_line_length = 76   444    445     def __init__(self, f, line_length=None):   446    447         """   448         Initialise the object with the file 'f'. If 'line_length' is set, the   449         length of written lines will conform to the specified value instead of   450         the default value.    451         """   452    453         self.f = f   454         self.line_length = line_length or self.default_line_length   455         self.char_offset = 0   456    457     def close(self):   458    459         "Close the writer."   460    461         self.f.close()   462    463     def write(self, text):   464    465         "Write the 'text' to the file."   466    467         f = self.f   468         line_length = self.line_length   469    470         i = 0   471         remaining = len(text)   472    473         while remaining:   474             space = line_length - self.char_offset   475             if remaining > space:   476                 f.write(text[i:i + space])   477                 f.write("\r\n ")   478                 self.char_offset = 1   479                 i += space   480                 remaining -= space   481             else:   482                 f.write(text[i:])   483                 self.char_offset += remaining   484                 i += remaining   485                 remaining = 0   486    487     def end_line(self):   488    489         "End the current content line."   490    491         if self.char_offset > 0:   492             self.char_offset = 0   493             self.f.write("\r\n")   494    495 class StreamWriter:   496    497     "A stream writer for content in vCard/vCalendar/iCalendar-like formats."   498    499     def __init__(self, f):   500    501         "Initialise the parser for the given file 'f'."   502    503         self.f = f   504    505     def close(self):   506    507         "Close the writer."   508    509         self.f.close()   510    511     def write(self, name, parameters, value):   512    513         """   514         Write a content line, serialising the given 'name', 'parameters' and   515         'value' information.   516         """   517    518         self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))   519    520     # Internal methods.   521    522     def write_content_line(self, name, encoded_parameters, encoded_value):   523    524         """   525         Write a content line for the given 'name', 'encoded_parameters' and   526         'encoded_value' information.   527         """   528    529         f = self.f   530    531         f.write(name)   532         for param_name, param_value in encoded_parameters.items():   533             f.write(";")   534             f.write(param_name)   535             f.write("=")   536             f.write(param_value)   537         f.write(":")   538         f.write(encoded_value)   539         f.end_line()   540    541     def encode_quoted_parameter_value(self, value):   542    543         "Encode the given 'value'."   544    545         return '"%s"' % value   546    547     def encode_value(self, name, parameters, value):   548    549         """   550         Encode using 'name' and 'parameters' the given 'value' so that the   551         resulting encoded form employs any specified character encodings.   552         """   553    554         encoding = parameters.get("ENCODING")   555         charset = parameters.get("CHARSET")   556    557         if encoding == "QUOTED-PRINTABLE":   558             value = quopri.encodestring(value.encode(charset or "iso-8859-1"))   559         elif encoding == "BASE64":   560             value = base64.encodestring(value)   561    562         return self.encode_content(value)   563    564     # Overrideable methods.   565    566     def encode_parameters(self, parameters):   567    568         """   569         Encode the given 'parameters' according to the vCalendar specification.   570         """   571    572         encoded_parameters = {}   573    574         for param_name, param_value in parameters.items():   575    576             # Basic format support merely involves quoting values which seem to   577             # need it. Other more specific formats may define exactly which   578             # parameters should be quoted.   579    580             if ContentLine.SEPARATORS.search(param_value):   581                 param_value = self.encode_quoted_parameter_value(param_value)   582    583             encoded_parameters[param_name] = param_value   584    585         return encoded_parameters   586    587     def encode_content(self, value):   588    589         "Encode the given 'value', quoting characters."   590    591         return value.replace("\n", "\\n")   592    593 # Utility functions.   594    595 def is_input_stream(stream_or_string):   596     return hasattr(stream_or_string, "read")   597    598 def get_input_stream(stream_or_string, encoding=None):   599     if is_input_stream(stream_or_string):   600         return stream_or_string   601     else:   602         return codecs.open(stream_or_string, encoding=(encoding or default_encoding))   603    604 def get_output_stream(stream_or_string, encoding=None):   605     if hasattr(stream_or_string, "write"):   606         return stream_or_string   607     else:   608         return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))   609    610 # Public functions.   611    612 def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):   613    614     """   615     Parse the resource data found through the use of the 'stream_or_string',   616     which is either a stream providing Unicode data (the codecs module can be   617     used to open files or to wrap streams in order to provide Unicode data) or a   618     filename identifying a file to be parsed.   619    620     The optional 'encoding' can be used to specify the character encoding used   621     by the file to be parsed.   622    623     The optional 'non_standard_newline' can be set to a true value (unlike the   624     default) in order to attempt to process files with CR as the end of line   625     character.   626    627     As a result of parsing the resource, the root node of the imported resource   628     is returned.   629     """   630    631     stream = get_input_stream(stream_or_string, encoding)   632     reader = Reader(stream, non_standard_newline)   633    634     # Parse using the reader.   635    636     try:   637         parser = (parser_cls or Parser)()   638         return parser.parse(reader)   639    640     # Close any opened streams.   641    642     finally:   643         if not is_input_stream(stream_or_string):   644             reader.close()   645    646 def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):   647    648     """   649     Parse the resource data found through the use of the 'stream_or_string',   650     which is either a stream providing Unicode data (the codecs module can be   651     used to open files or to wrap streams in order to provide Unicode data) or a   652     filename identifying a file to be parsed.   653    654     The optional 'encoding' can be used to specify the character encoding used   655     by the file to be parsed.   656    657     The optional 'non_standard_newline' can be set to a true value (unlike the   658     default) in order to attempt to process files with CR as the end of line   659     character.   660    661     An iterator is returned which provides event tuples describing parsing   662     events of the form (name, parameters, value).   663     """   664    665     stream = get_input_stream(stream_or_string, encoding)   666     reader = Reader(stream, non_standard_newline)   667     parser = (parser_cls or StreamParser)(reader)   668     return parser   669    670 def iterwrite(stream_or_string, encoding=None, line_length=None, writer_cls=None):   671    672     """   673     Return a writer which will send data to the resource found through the use   674     of 'stream_or_string', which is either a stream accepting Unicode data (the   675     codecs module can be used to open files or to wrap streams in order to   676     accept Unicode data) or a filename identifying a file to be parsed.   677    678     The optional 'encoding' can be used to specify the character encoding used   679     by the file to be written.   680    681     The optional 'line_length' can be used to specify how long lines should be   682     in the resulting data.   683     """   684    685     stream = get_output_stream(stream_or_string, encoding)   686     _writer = Writer(stream, line_length)   687     writer = (writer_cls or StreamWriter)(_writer)   688     return writer   689    690 # vim: tabstop=4 expandtab shiftwidth=4