vContent

vContent.py

23:92ec5d7be0b5
2013-04-25 Paul Boddie Merged decoding change.
     1 #!/usr/bin/env python     2      3 """     4 Parsing of vCard, vCalendar and iCalendar files.     5      6 Copyright (C) 2005, 2006, 2007, 2008, 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20     21 --------    22     23 References:    24     25 RFC 5545: Internet Calendaring and Scheduling Core Object Specification    26           (iCalendar)    27           http://tools.ietf.org/html/rfc5545    28     29 RFC 2445: Internet Calendaring and Scheduling Core Object Specification    30           (iCalendar)    31           http://tools.ietf.org/html/rfc2445    32     33 RFC 2425: A MIME Content-Type for Directory Information    34           http://tools.ietf.org/html/rfc2425    35     36 RFC 2426: vCard MIME Directory Profile    37           http://tools.ietf.org/html/rfc2426    38 """    39     40 try:    41     set    42 except NameError:    43     from sets import Set as set    44     45 # Encoding-related imports.    46     47 import base64, quopri    48 import codecs    49     50 # Tokenisation help.    51     52 import re    53     54 # Configuration.    55     56 default_encoding = "utf-8"    57     58 # Reader and parser classes.    59     60 class Reader:    61     62     "A simple class wrapping a file, providing simple pushback capabilities."    63     64     def __init__(self, f, non_standard_newline=0):    65     66         """    67         Initialise the object with the file 'f'. If 'non_standard_newline' is    68         set to a true value (unlike the default), lines ending with CR will be    69         treated as complete lines.    70         """    71     72         self.f = f    73         self.non_standard_newline = non_standard_newline    74         self.lines = []    75         self.line_number = 1 # about to read line 1    76     77     def close(self):    78     79         "Close the reader."    80     81         self.f.close()    82     83     def pushback(self, line):    84     85         """    86         Push the given 'line' back so that the next line read is actually the    87         given 'line' and not the next line from the underlying file.    88         """    89     90         self.lines.append(line)    91         self.line_number -= 1    92     93     def readline(self):    94     95         """    96         If no pushed-back lines exist, read a line directly from the file.    97         Otherwise, read from the list of pushed-back lines.    98         """    99    100         self.line_number += 1   101         if self.lines:   102             return self.lines.pop()   103         else:   104             # Sanity check for broken lines (\r instead of \r\n or \n).   105             line = self.f.readline()   106             while line.endswith("\r") and not self.non_standard_newline:   107                 line += self.f.readline()   108             if line.endswith("\r") and self.non_standard_newline:   109                 return line + "\n"   110             else:   111                 return line   112    113     def read_content_line(self):   114    115         """   116         Read an entire content line, itself potentially consisting of many   117         physical lines of text, returning a string.   118         """   119    120         # Skip blank lines.   121    122         line = self.readline()   123         while line:   124             line_stripped = line.rstrip("\r\n")   125             if not line_stripped:   126                 line = self.readline()   127             else:   128                 break   129         else:   130             return ""   131    132         # Strip all appropriate whitespace from the right end of each line.   133         # For subsequent lines, remove the first whitespace character.   134         # See section 4.1 of the iCalendar specification.   135    136         lines = [line_stripped]   137    138         line = self.readline()   139         while line.startswith(" ") or line.startswith("\t"):   140             lines.append(line[1:].rstrip("\r\n"))   141             line = self.readline()   142    143         # Since one line too many will have been read, push the line back into   144         # the file.   145    146         if line:   147             self.pushback(line)   148    149         return "".join(lines)   150    151     def get_content_line(self):   152    153         "Return a content line object for the current line."   154    155         return ContentLine(self.read_content_line())   156    157 class ContentLine:   158    159     "A content line which can be searched."   160    161     SEPARATORS = re.compile('[;:"]')   162     SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')   163    164     def __init__(self, text):   165         self.text = text   166         self.start = 0   167    168     def get_remaining(self):   169    170         "Get the remaining text from the content line."   171    172         return self.text[self.start:]   173    174     def search(self, targets):   175    176         """   177         Find one of the 'targets' in the text, returning the string from the   178         current position up to the target found, along with the target string,   179         using a tuple of the form (string, target). If no target was found,   180         return the entire string together with a target of None.   181    182         The 'targets' parameter must be a regular expression object or an object   183         compatible with the API of such objects.   184         """   185    186         text = self.text   187         start = pos = self.start   188         length = len(text)   189    190         # Remember the first target.   191    192         first = None   193         first_pos = None   194         in_quoted_region = 0   195    196         # Process the text, looking for the targets.   197    198         while pos < length:   199             match = targets.search(text, pos)   200    201             # Where nothing matches, end the search.   202    203             if match is None:   204                 pos = length   205    206             # Where a double quote matches, toggle the region state.   207    208             elif match.group() == '"':   209                 in_quoted_region = not in_quoted_region   210                 pos = match.end()   211    212             # Where something else matches outside a region, stop searching.   213    214             elif not in_quoted_region:   215                 first = match.group()   216                 first_pos = match.start()   217                 break   218    219             # Otherwise, keep looking for the end of the region.   220    221             else:   222                 pos = match.end()   223    224         # Where no more input can provide the targets, return a special result.   225    226         else:   227             self.start = length   228             return text[start:], None   229    230         self.start = match.end()   231         return text[start:first_pos], first   232    233 class StreamParser:   234    235     "A stream parser for content in vCard/vCalendar/iCalendar-like formats."   236    237     def __init__(self, f):   238    239         "Initialise the parser for the given file 'f'."   240    241         self.f = f   242    243     def close(self):   244    245         "Close the reader."   246    247         self.f.close()   248    249     def __iter__(self):   250    251         "Return self as the iterator."   252    253         return self   254    255     def next(self):   256    257         """   258         Return the next content item in the file as a tuple of the form   259         (name, parameters, values).   260         """   261    262         return self.parse_content_line()   263    264     def decode_content(self, value):   265    266         "Decode the given 'value', replacing quoted characters."   267    268         return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")   269    270     # Internal methods.   271    272     def parse_content_line(self):   273    274         """   275         Return the name, parameters and value information for the current   276         content line in the file being parsed.   277         """   278    279         f = self.f   280         line_number = f.line_number   281         line = f.get_content_line()   282    283         # Read the property name.   284    285         name, sep = line.search(line.SEPARATORS)   286         name = name.strip()   287    288         if not name and sep is None:   289             raise StopIteration   290    291         # Read the parameters.   292    293         parameters = {}   294    295         while sep == ";":   296    297             # Find the actual modifier.   298    299             parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)   300             parameter_name = parameter_name.strip()   301    302             if sep == "=":   303                 parameter_value, sep = line.search(line.SEPARATORS)   304                 parameter_value = parameter_value.strip()   305             else:   306                 parameter_value = None   307    308             # Append a key, value tuple to the parameters list.   309    310             parameters[parameter_name] = parameter_value   311    312         # Get the value content.   313    314         if sep != ":":   315             raise ValueError, line_number   316    317         # Obtain and decode the value.   318    319         value = self.decode(name, parameters, line.get_remaining())   320    321         return name, parameters, value   322    323     def decode(self, name, parameters, value):   324    325         "Decode using 'name' and 'parameters' the given 'value'."   326    327         encoding = parameters.get("ENCODING")   328         charset = parameters.get("CHARSET")   329    330         value = self.decode_content(value)   331    332         if encoding == "QUOTED-PRINTABLE":   333             return unicode(quopri.decodestring(value), charset or "iso-8859-1")   334         elif encoding == "BASE64":   335             return base64.decodestring(value)   336         else:   337             return value   338    339 class ParserBase:   340    341     "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."   342    343     def __init__(self):   344    345         "Initialise the parser."   346    347         self.names = []   348    349     def parse(self, f, parser_cls=None):   350    351         "Parse the contents of the file 'f'."   352    353         parser = (parser_cls or StreamParser)(f)   354    355         for name, parameters, value in parser:   356    357             if name == "BEGIN":   358                 self.names.append(value)   359                 self.startComponent(value, parameters)   360    361             elif name == "END":   362                 start_name = self.names.pop()   363                 if start_name != value:   364                     raise ParseError, "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (   365                         start_name, value, f.line_number)   366    367                 self.endComponent(value)   368    369             else:   370                 self.handleProperty(name, parameters, value)   371    372 class Parser(ParserBase):   373    374     "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."   375    376     def __init__(self):   377         ParserBase.__init__(self)   378         self.components = []   379    380     def startComponent(self, name, parameters):   381    382         """   383         Add the component with the given 'name' and 'parameters', recording an   384         empty list of children as part of the component's content.   385         """   386    387         component = self.handleProperty(name, parameters)   388         self.components.append(component)   389         return component   390    391     def endComponent(self, name):   392    393         """   394         End the component with the given 'name' by removing it from the active   395         component stack. If only one component exists on the stack, retain it   396         for later inspection.   397         """   398    399         if len(self.components) > 1:   400             return self.components.pop()   401    402         # Or return the only element.   403    404         elif self.components:   405             return self.components[0]   406    407     def handleProperty(self, name, parameters, value=None):   408    409         """   410         Record the property with the given 'name', 'parameters' and optional   411         'value' as part of the current component's children.   412         """   413    414         component = self.makeComponent(name, parameters, value)   415         self.attachComponent(component)   416         return component   417    418     # Component object construction/manipulation methods.   419    420     def attachComponent(self, component):   421    422         "Attach the given 'component' to its parent."   423    424         if self.components:   425             component_name, component_parameters, component_children = self.components[-1]   426             component_children.append(component)   427    428     def makeComponent(self, name, parameters, value=None):   429    430         """   431         Make a component object from the given 'name', 'parameters' and optional   432         'value'.   433         """   434    435         return (name, parameters, value or [])   436    437     # Public methods.   438    439     def parse(self, f, parser_cls=None):   440    441         "Parse the contents of the file 'f'."   442    443         ParserBase.parse(self, f, parser_cls)   444         return self.components[0]   445    446 # Writer classes.   447    448 class Writer:   449    450     "A simple class wrapping a file, providing simple output capabilities."   451    452     default_line_length = 76   453    454     def __init__(self, write, line_length=None):   455    456         """   457         Initialise the object with the given 'write' operation. If 'line_length'   458         is set, the length of written lines will conform to the specified value   459         instead of the default value.    460         """   461    462         self._write = write   463         self.line_length = line_length or self.default_line_length   464         self.char_offset = 0   465    466     def write(self, text):   467    468         "Write the 'text' to the file."   469    470         write = self._write   471         line_length = self.line_length   472    473         i = 0   474         remaining = len(text)   475    476         while remaining:   477             space = line_length - self.char_offset   478             if remaining > space:   479                 write(text[i:i + space])   480                 write("\r\n ")   481                 self.char_offset = 1   482                 i += space   483                 remaining -= space   484             else:   485                 write(text[i:])   486                 self.char_offset += remaining   487                 i += remaining   488                 remaining = 0   489    490     def end_line(self):   491    492         "End the current content line."   493    494         if self.char_offset > 0:   495             self.char_offset = 0   496             self._write("\r\n")   497    498 class StreamWriter:   499    500     "A stream writer for content in vCard/vCalendar/iCalendar-like formats."   501    502     def __init__(self, f):   503    504         "Initialise the stream writer with the given 'f' stream object."   505    506         self.f = f   507    508     def write(self, name, parameters, value):   509    510         """   511         Write a content line, serialising the given 'name', 'parameters' and   512         'value' information.   513         """   514    515         self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))   516    517     # Internal methods.   518    519     def write_content_line(self, name, encoded_parameters, encoded_value):   520    521         """   522         Write a content line for the given 'name', 'encoded_parameters' and   523         'encoded_value' information.   524         """   525    526         f = self.f   527    528         f.write(name)   529         for param_name, param_value in encoded_parameters.items():   530             f.write(";")   531             f.write(param_name)   532             f.write("=")   533             f.write(param_value)   534         f.write(":")   535         f.write(encoded_value)   536         f.end_line()   537    538     def encode_quoted_parameter_value(self, value):   539    540         "Encode the given 'value'."   541    542         return '"%s"' % value   543    544     def encode_value(self, name, parameters, value):   545    546         """   547         Encode using 'name' and 'parameters' the given 'value' so that the   548         resulting encoded form employs any specified character encodings.   549         """   550    551         encoding = parameters.get("ENCODING")   552         charset = parameters.get("CHARSET")   553    554         if encoding == "QUOTED-PRINTABLE":   555             value = quopri.encodestring(value.encode(charset or "iso-8859-1"))   556         elif encoding == "BASE64":   557             value = base64.encodestring(value)   558    559         return self.encode_content(value)   560    561     # Overrideable methods.   562    563     def encode_parameters(self, parameters):   564    565         """   566         Encode the given 'parameters' according to the vCalendar specification.   567         """   568    569         encoded_parameters = {}   570    571         for param_name, param_value in parameters.items():   572    573             # Basic format support merely involves quoting values which seem to   574             # need it. Other more specific formats may define exactly which   575             # parameters should be quoted.   576    577             if ContentLine.SEPARATORS.search(param_value):   578                 param_value = self.encode_quoted_parameter_value(param_value)   579    580             encoded_parameters[param_name] = param_value   581    582         return encoded_parameters   583    584     def encode_content(self, value):   585    586         "Encode the given 'value', quoting characters."   587    588         return value.replace("\n", "\\n")   589    590 # Utility functions.   591    592 def is_input_stream(stream_or_string):   593     return hasattr(stream_or_string, "read")   594    595 def get_input_stream(stream_or_string, encoding=None):   596     if is_input_stream(stream_or_string):   597         return stream_or_string   598     else:   599         return codecs.open(stream_or_string, encoding=(encoding or default_encoding))   600    601 def get_output_stream(stream_or_string, encoding=None):   602     if hasattr(stream_or_string, "write"):   603         return stream_or_string   604     else:   605         return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))   606    607 # Public functions.   608    609 def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):   610    611     """   612     Parse the resource data found through the use of the 'stream_or_string',   613     which is either a stream providing Unicode data (the codecs module can be   614     used to open files or to wrap streams in order to provide Unicode data) or a   615     filename identifying a file to be parsed.   616    617     The optional 'encoding' can be used to specify the character encoding used   618     by the file to be parsed.   619    620     The optional 'non_standard_newline' can be set to a true value (unlike the   621     default) in order to attempt to process files with CR as the end of line   622     character.   623    624     As a result of parsing the resource, the root node of the imported resource   625     is returned.   626     """   627    628     stream = get_input_stream(stream_or_string, encoding)   629     reader = Reader(stream, non_standard_newline)   630    631     # Parse using the reader.   632    633     try:   634         parser = (parser_cls or Parser)()   635         return parser.parse(reader)   636    637     # Close any opened streams.   638    639     finally:   640         if not is_input_stream(stream_or_string):   641             reader.close()   642    643 def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):   644    645     """   646     Parse the resource data found through the use of the 'stream_or_string',   647     which is either a stream providing Unicode data (the codecs module can be   648     used to open files or to wrap streams in order to provide Unicode data) or a   649     filename identifying a file to be parsed.   650    651     The optional 'encoding' can be used to specify the character encoding used   652     by the file to be parsed.   653    654     The optional 'non_standard_newline' can be set to a true value (unlike the   655     default) in order to attempt to process files with CR as the end of line   656     character.   657    658     An iterator is returned which provides event tuples describing parsing   659     events of the form (name, parameters, value).   660     """   661    662     stream = get_input_stream(stream_or_string, encoding)   663     reader = Reader(stream, non_standard_newline)   664     parser = (parser_cls or StreamParser)(reader)   665     return parser   666    667 def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None):   668    669     """   670     Return a writer which will either send data to the resource found through   671     the use of 'stream_or_string' or using the given 'write' operation.   672    673     The 'stream_or_string' parameter may be either a stream accepting Unicode   674     data (the codecs module can be used to open files or to wrap streams in   675     order to accept Unicode data) or a filename identifying a file to be   676     written.   677    678     The optional 'encoding' can be used to specify the character encoding used   679     by the file to be written.   680    681     The optional 'line_length' can be used to specify how long lines should be   682     in the resulting data.   683     """   684    685     if stream_or_string:   686         stream = get_output_stream(stream_or_string, encoding)   687         _writer = Writer(stream.write, line_length)   688     elif write:   689         _writer = Writer(write, line_length)   690     else:   691         raise IOError, "No stream, filename or write operation specified."   692    693     return (writer_cls or StreamWriter)(_writer)   694    695 # vim: tabstop=4 expandtab shiftwidth=4