vContent

vContent.py

19:7a832601e349
2012-04-02 Paul Boddie Added tag rel-0-1 for changeset d63aaad6475b
     1 #!/usr/bin/env python     2      3 """     4 Parsing of vCard, vCalendar and iCalendar files.     5      6 Copyright (C) 2005, 2006, 2007, 2008, 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20     21 --------    22     23 References:    24     25 RFC 5545: Internet Calendaring and Scheduling Core Object Specification    26           (iCalendar)    27           http://tools.ietf.org/html/rfc5545    28     29 RFC 2445: Internet Calendaring and Scheduling Core Object Specification    30           (iCalendar)    31           http://tools.ietf.org/html/rfc2445    32     33 RFC 2425: A MIME Content-Type for Directory Information    34           http://tools.ietf.org/html/rfc2425    35     36 RFC 2426: vCard MIME Directory Profile    37           http://tools.ietf.org/html/rfc2426    38 """    39     40 try:    41     set    42 except NameError:    43     from sets import Set as set    44     45 # Encoding-related imports.    46     47 import base64, quopri    48 import codecs    49     50 # Tokenisation help.    51     52 import re    53     54 # Configuration.    55     56 default_encoding = "utf-8"    57     58 # Reader and parser classes.    59     60 class Reader:    61     62     "A simple class wrapping a file, providing simple pushback capabilities."    63     64     def __init__(self, f, non_standard_newline=0):    65     66         """    67         Initialise the object with the file 'f'. If 'non_standard_newline' is    68         set to a true value (unlike the default), lines ending with CR will be    69         treated as complete lines.    70         """    71     72         self.f = f    73         self.non_standard_newline = non_standard_newline    74         self.lines = []    75         self.line_number = 1 # about to read line 1    76     77     def close(self):    78     79         "Close the reader."    80     81         self.f.close()    82     83     def pushback(self, line):    84     85         """    86         Push the given 'line' back so that the next line read is actually the    87         given 'line' and not the next line from the underlying file.    88         """    89     90         self.lines.append(line)    91         self.line_number -= 1    92     93     def readline(self):    94     95         """    96         If no pushed-back lines exist, read a line directly from the file.    97         Otherwise, read from the list of pushed-back lines.    98         """    99    100         self.line_number += 1   101         if self.lines:   102             return self.lines.pop()   103         else:   104             # Sanity check for broken lines (\r instead of \r\n or \n).   105             line = self.f.readline()   106             while line.endswith("\r") and not self.non_standard_newline:   107                 line += self.f.readline()   108             if line.endswith("\r") and self.non_standard_newline:   109                 return line + "\n"   110             else:   111                 return line   112    113     def read_content_line(self):   114    115         """   116         Read an entire content line, itself potentially consisting of many   117         physical lines of text, returning a string.   118         """   119    120         # Skip blank lines.   121    122         line = self.readline()   123         while line:   124             line_stripped = line.rstrip("\r\n")   125             if not line_stripped:   126                 line = self.readline()   127             else:   128                 break   129         else:   130             return ""   131    132         # Strip all appropriate whitespace from the right end of each line.   133         # For subsequent lines, remove the first whitespace character.   134         # See section 4.1 of the iCalendar specification.   135    136         lines = [line_stripped]   137    138         line = self.readline()   139         while line.startswith(" ") or line.startswith("\t"):   140             lines.append(line[1:].rstrip("\r\n"))   141             line = self.readline()   142    143         # Since one line too many will have been read, push the line back into   144         # the file.   145    146         if line:   147             self.pushback(line)   148    149         return "".join(lines)   150    151     def get_content_line(self):   152    153         "Return a content line object for the current line."   154    155         return ContentLine(self.read_content_line())   156    157 class ContentLine:   158    159     "A content line which can be searched."   160    161     SEPARATORS = re.compile('[;:"]')   162     SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')   163    164     def __init__(self, text):   165         self.text = text   166         self.start = 0   167    168     def get_remaining(self):   169    170         "Get the remaining text from the content line."   171    172         return self.text[self.start:]   173    174     def search(self, targets):   175    176         """   177         Find one of the 'targets' in the text, returning the string from the   178         current position up to the target found, along with the target string,   179         using a tuple of the form (string, target). If no target was found,   180         return the entire string together with a target of None.   181    182         The 'targets' parameter must be a regular expression object or an object   183         compatible with the API of such objects.   184         """   185    186         text = self.text   187         start = pos = self.start   188         length = len(text)   189    190         # Remember the first target.   191    192         first = None   193         first_pos = None   194         in_quoted_region = 0   195    196         # Process the text, looking for the targets.   197    198         while pos < length:   199             match = targets.search(text, pos)   200    201             # Where nothing matches, end the search.   202    203             if match is None:   204                 pos = length   205    206             # Where a double quote matches, toggle the region state.   207    208             elif match.group() == '"':   209                 in_quoted_region = not in_quoted_region   210                 pos = match.end()   211    212             # Where something else matches outside a region, stop searching.   213    214             elif not in_quoted_region:   215                 first = match.group()   216                 first_pos = match.start()   217                 break   218    219             # Otherwise, keep looking for the end of the region.   220    221             else:   222                 pos = match.end()   223    224         # Where no more input can provide the targets, return a special result.   225    226         else:   227             self.start = length   228             return text[start:], None   229    230         self.start = match.end()   231         return text[start:first_pos], first   232    233 class StreamParser:   234    235     "A stream parser for content in vCard/vCalendar/iCalendar-like formats."   236    237     def __init__(self, f):   238    239         "Initialise the parser for the given file 'f'."   240    241         self.f = f   242    243     def close(self):   244    245         "Close the reader."   246    247         self.f.close()   248    249     def __iter__(self):   250    251         "Return self as the iterator."   252    253         return self   254    255     def next(self):   256    257         """   258         Return the next content item in the file as a tuple of the form   259         (name, parameters, values).   260         """   261    262         return self.parse_content_line()   263    264     def decode_content(self, value):   265    266         "Decode the given 'value', replacing quoted characters."   267    268         return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")   269    270     # Internal methods.   271    272     def parse_content_line(self):   273    274         """   275         Return the name, parameters and value information for the current   276         content line in the file being parsed.   277         """   278    279         f = self.f   280         line_number = f.line_number   281         line = f.get_content_line()   282    283         # Read the property name.   284    285         name, sep = line.search(line.SEPARATORS)   286         name = name.strip()   287    288         if not name and sep is None:   289             raise StopIteration   290    291         # Read the parameters.   292    293         parameters = {}   294    295         while sep == ";":   296    297             # Find the actual modifier.   298    299             parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)   300             parameter_name = parameter_name.strip()   301    302             if sep == "=":   303                 parameter_value, sep = line.search(line.SEPARATORS)   304                 parameter_value = parameter_value.strip()   305             else:   306                 parameter_value = None   307    308             # Append a key, value tuple to the parameters list.   309    310             parameters[parameter_name] = parameter_value   311    312         # Get the value content.   313    314         if sep != ":":   315             raise ValueError, line_number   316    317         # Obtain and decode the value.   318    319         value = self.decode(name, parameters, line.get_remaining())   320    321         return name, parameters, value   322    323     def decode(self, name, parameters, value):   324    325         "Decode using 'name' and 'parameters' the given 'value'."   326    327         encoding = parameters.get("ENCODING")   328         charset = parameters.get("CHARSET")   329    330         value = self.decode_content(value)   331    332         if encoding == "QUOTED-PRINTABLE":   333             return unicode(quopri.decodestring(value), charset or "iso-8859-1")   334         elif encoding == "BASE64":   335             return base64.decodestring(value)   336         else:   337             return value   338    339 class ParserBase:   340    341     "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."   342    343     def __init__(self):   344    345         "Initialise the parser."   346    347         self.names = []   348    349     def parse(self, f, parser_cls=None):   350    351         "Parse the contents of the file 'f'."   352    353         parser = (parser_cls or StreamParser)(f)   354    355         for name, parameters, value in parser:   356    357             if name == "BEGIN":   358                 self.names.append(value)   359                 self.startComponent(value, parameters)   360    361             elif name == "END":   362                 start_name = self.names.pop()   363                 if start_name != value:   364                     raise ParseError, "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (   365                         start_name, value, f.line_number)   366    367                 self.endComponent(value)   368    369             else:   370                 self.handleProperty(name, parameters, value)   371    372 class Parser(ParserBase):   373    374     "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."   375    376     def __init__(self):   377         ParserBase.__init__(self)   378         self.components = []   379    380     def startComponent(self, name, parameters):   381    382         """   383         Add the component with the given 'name' and 'parameters', recording an   384         empty list of children as part of the component's content.   385         """   386    387         component = self.handleProperty(name, parameters)   388         self.components.append(component)   389         return component   390    391     def endComponent(self, name):   392    393         """   394         End the component with the given 'name' by removing it from the active   395         component stack. If only one component exists on the stack, retain it   396         for later inspection.   397         """   398    399         if len(self.components) > 1:   400             return self.components.pop()   401    402         # Or return the only element.   403    404         elif self.components:   405             return self.components[0]   406    407     def handleProperty(self, name, parameters, value=None):   408    409         """   410         Record the property with the given 'name', 'parameters' and optional   411         'value' as part of the current component's children.   412         """   413    414         component = self.makeComponent(name, parameters, value)   415         self.attachComponent(component)   416         return component   417    418     # Component object construction/manipulation methods.   419    420     def attachComponent(self, component):   421    422         "Attach the given 'component' to its parent."   423    424         if self.components:   425             component_name, component_parameters, component_children = self.components[-1]   426             component_children.append(component)   427    428     def makeComponent(self, name, parameters, value=None):   429    430         """   431         Make a component object from the given 'name', 'parameters' and optional   432         'value'.   433         """   434    435         return (name, parameters, value or [])   436    437     # Public methods.   438    439     def parse(self, f, parser_cls=None):   440    441         "Parse the contents of the file 'f'."   442    443         ParserBase.parse(self, f, parser_cls)   444         return self.components[0]   445    446 # Writer classes.   447    448 class Writer:   449    450     "A simple class wrapping a file, providing simple output capabilities."   451    452     default_line_length = 76   453    454     def __init__(self, f, line_length=None):   455    456         """   457         Initialise the object with the file 'f'. If 'line_length' is set, the   458         length of written lines will conform to the specified value instead of   459         the default value.    460         """   461    462         self.f = f   463         self.line_length = line_length or self.default_line_length   464         self.char_offset = 0   465    466     def close(self):   467    468         "Close the writer."   469    470         self.f.close()   471    472     def write(self, text):   473    474         "Write the 'text' to the file."   475    476         f = self.f   477         line_length = self.line_length   478    479         i = 0   480         remaining = len(text)   481    482         while remaining:   483             space = line_length - self.char_offset   484             if remaining > space:   485                 f.write(text[i:i + space])   486                 f.write("\r\n ")   487                 self.char_offset = 1   488                 i += space   489                 remaining -= space   490             else:   491                 f.write(text[i:])   492                 self.char_offset += remaining   493                 i += remaining   494                 remaining = 0   495    496     def end_line(self):   497    498         "End the current content line."   499    500         if self.char_offset > 0:   501             self.char_offset = 0   502             self.f.write("\r\n")   503    504 class StreamWriter:   505    506     "A stream writer for content in vCard/vCalendar/iCalendar-like formats."   507    508     def __init__(self, f):   509    510         "Initialise the parser for the given file 'f'."   511    512         self.f = f   513    514     def close(self):   515    516         "Close the writer."   517    518         self.f.close()   519    520     def write(self, name, parameters, value):   521    522         """   523         Write a content line, serialising the given 'name', 'parameters' and   524         'value' information.   525         """   526    527         self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))   528    529     # Internal methods.   530    531     def write_content_line(self, name, encoded_parameters, encoded_value):   532    533         """   534         Write a content line for the given 'name', 'encoded_parameters' and   535         'encoded_value' information.   536         """   537    538         f = self.f   539    540         f.write(name)   541         for param_name, param_value in encoded_parameters.items():   542             f.write(";")   543             f.write(param_name)   544             f.write("=")   545             f.write(param_value)   546         f.write(":")   547         f.write(encoded_value)   548         f.end_line()   549    550     def encode_quoted_parameter_value(self, value):   551    552         "Encode the given 'value'."   553    554         return '"%s"' % value   555    556     def encode_value(self, name, parameters, value):   557    558         """   559         Encode using 'name' and 'parameters' the given 'value' so that the   560         resulting encoded form employs any specified character encodings.   561         """   562    563         encoding = parameters.get("ENCODING")   564         charset = parameters.get("CHARSET")   565    566         if encoding == "QUOTED-PRINTABLE":   567             value = quopri.encodestring(value.encode(charset or "iso-8859-1"))   568         elif encoding == "BASE64":   569             value = base64.encodestring(value)   570    571         return self.encode_content(value)   572    573     # Overrideable methods.   574    575     def encode_parameters(self, parameters):   576    577         """   578         Encode the given 'parameters' according to the vCalendar specification.   579         """   580    581         encoded_parameters = {}   582    583         for param_name, param_value in parameters.items():   584    585             # Basic format support merely involves quoting values which seem to   586             # need it. Other more specific formats may define exactly which   587             # parameters should be quoted.   588    589             if ContentLine.SEPARATORS.search(param_value):   590                 param_value = self.encode_quoted_parameter_value(param_value)   591    592             encoded_parameters[param_name] = param_value   593    594         return encoded_parameters   595    596     def encode_content(self, value):   597    598         "Encode the given 'value', quoting characters."   599    600         return value.replace("\n", "\\n")   601    602 # Utility functions.   603    604 def is_input_stream(stream_or_string):   605     return hasattr(stream_or_string, "read")   606    607 def get_input_stream(stream_or_string, encoding=None):   608     if is_input_stream(stream_or_string):   609         return stream_or_string   610     else:   611         return codecs.open(stream_or_string, encoding=(encoding or default_encoding))   612    613 def get_output_stream(stream_or_string, encoding=None):   614     if hasattr(stream_or_string, "write"):   615         return stream_or_string   616     else:   617         return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))   618    619 # Public functions.   620    621 def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):   622    623     """   624     Parse the resource data found through the use of the 'stream_or_string',   625     which is either a stream providing Unicode data (the codecs module can be   626     used to open files or to wrap streams in order to provide Unicode data) or a   627     filename identifying a file to be parsed.   628    629     The optional 'encoding' can be used to specify the character encoding used   630     by the file to be parsed.   631    632     The optional 'non_standard_newline' can be set to a true value (unlike the   633     default) in order to attempt to process files with CR as the end of line   634     character.   635    636     As a result of parsing the resource, the root node of the imported resource   637     is returned.   638     """   639    640     stream = get_input_stream(stream_or_string, encoding)   641     reader = Reader(stream, non_standard_newline)   642    643     # Parse using the reader.   644    645     try:   646         parser = (parser_cls or Parser)()   647         return parser.parse(reader)   648    649     # Close any opened streams.   650    651     finally:   652         if not is_input_stream(stream_or_string):   653             reader.close()   654    655 def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):   656    657     """   658     Parse the resource data found through the use of the 'stream_or_string',   659     which is either a stream providing Unicode data (the codecs module can be   660     used to open files or to wrap streams in order to provide Unicode data) or a   661     filename identifying a file to be parsed.   662    663     The optional 'encoding' can be used to specify the character encoding used   664     by the file to be parsed.   665    666     The optional 'non_standard_newline' can be set to a true value (unlike the   667     default) in order to attempt to process files with CR as the end of line   668     character.   669    670     An iterator is returned which provides event tuples describing parsing   671     events of the form (name, parameters, value).   672     """   673    674     stream = get_input_stream(stream_or_string, encoding)   675     reader = Reader(stream, non_standard_newline)   676     parser = (parser_cls or StreamParser)(reader)   677     return parser   678    679 def iterwrite(stream_or_string, encoding=None, line_length=None, writer_cls=None):   680    681     """   682     Return a writer which will send data to the resource found through the use   683     of 'stream_or_string', which is either a stream accepting Unicode data (the   684     codecs module can be used to open files or to wrap streams in order to   685     accept Unicode data) or a filename identifying a file to be parsed.   686    687     The optional 'encoding' can be used to specify the character encoding used   688     by the file to be written.   689    690     The optional 'line_length' can be used to specify how long lines should be   691     in the resulting data.   692     """   693    694     stream = get_output_stream(stream_or_string, encoding)   695     _writer = Writer(stream, line_length)   696     writer = (writer_cls or StreamWriter)(_writer)   697     return writer   698    699 # vim: tabstop=4 expandtab shiftwidth=4