imip-agent

vContent.py

1174:2fffc03fa3ef
2016-05-12 Paul Boddie Introduced line length configuration for more convenient testing of output. Moved tabular file parsing to the text module for potential use by the tools.
     1 #!/usr/bin/env python     2      3 """     4 Parsing of vCard, vCalendar and iCalendar files.     5      6 Copyright (C) 2005, 2006, 2007, 2008, 2009, 2011, 2013,     7               2014, 2015 Paul Boddie <paul@boddie.org.uk>     8      9 This program is free software; you can redistribute it and/or modify it under    10 the terms of the GNU General Public License as published by the Free Software    11 Foundation; either version 3 of the License, or (at your option) any later    12 version.    13     14 This program is distributed in the hope that it will be useful, but WITHOUT    15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    16 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    17 details.    18     19 You should have received a copy of the GNU General Public License along with    20 this program.  If not, see <http://www.gnu.org/licenses/>.    21     22 --------    23     24 References:    25     26 RFC 5545: Internet Calendaring and Scheduling Core Object Specification    27           (iCalendar)    28           http://tools.ietf.org/html/rfc5545    29     30 RFC 2445: Internet Calendaring and Scheduling Core Object Specification    31           (iCalendar)    32           http://tools.ietf.org/html/rfc2445    33     34 RFC 2425: A MIME Content-Type for Directory Information    35           http://tools.ietf.org/html/rfc2425    36     37 RFC 2426: vCard MIME Directory Profile    38           http://tools.ietf.org/html/rfc2426    39 """    40     41 try:    42     set    43 except NameError:    44     from sets import Set as set    45     46 # Encoding-related imports.    47     48 import base64, quopri    49 import codecs    50     51 # Tokenisation help.    52     53 import re    54     55 # Configuration.    56     57 default_encoding = "utf-8"    58     59 class ParseError(Exception):    60     61     "General parsing errors."    62     63     pass    64     65 class WriteError(Exception):    66     67     "General writing errors."    68     69     pass    70     71 # Reader and parser classes.    72     73 class Reader:    74     75     "A simple class wrapping a file, providing simple pushback capabilities."    76     77     def __init__(self, f, non_standard_newline=0):    78     79         """    80         Initialise the object with the file 'f'. If 'non_standard_newline' is    81         set to a true value (unlike the default), lines ending with CR will be    82         treated as complete lines.    83         """    84     85         self.f = f    86         self.non_standard_newline = non_standard_newline    87         self.lines = []    88         self.line_number = 1 # about to read line 1    89     90     def close(self):    91     92         "Close the reader."    93     94         self.f.close()    95     96     def pushback(self, line):    97     98         """    99         Push the given 'line' back so that the next line read is actually the   100         given 'line' and not the next line from the underlying file.   101         """   102    103         self.lines.append(line)   104         self.line_number -= 1   105    106     def readline(self):   107    108         """   109         If no pushed-back lines exist, read a line directly from the file.   110         Otherwise, read from the list of pushed-back lines.   111         """   112    113         self.line_number += 1   114         if self.lines:   115             return self.lines.pop()   116         else:   117             # Sanity check for broken lines (\r instead of \r\n or \n).   118             line = self.f.readline()   119             while line.endswith("\r") and not self.non_standard_newline:   120                 s = self.f.readline()   121                 if not s:   122                     break   123                 line += s   124             if line.endswith("\r") and self.non_standard_newline:   125                 return line + "\n"   126             else:   127                 return line   128    129     def read_content_line(self):   130    131         """   132         Read an entire content line, itself potentially consisting of many   133         physical lines of text, returning a string.   134         """   135    136         # Skip blank lines.   137    138         line = self.readline()   139         while line:   140             line_stripped = line.rstrip("\r\n")   141             if not line_stripped:   142                 line = self.readline()   143             else:   144                 break   145         else:   146             return ""   147    148         # Strip all appropriate whitespace from the right end of each line.   149         # For subsequent lines, remove the first whitespace character.   150         # See section 4.1 of the iCalendar specification.   151    152         lines = [line_stripped]   153    154         line = self.readline()   155         while line.startswith(" ") or line.startswith("\t"):   156             lines.append(line[1:].rstrip("\r\n"))   157             line = self.readline()   158    159         # Since one line too many will have been read, push the line back into   160         # the file.   161    162         if line:   163             self.pushback(line)   164    165         return "".join(lines)   166    167     def get_content_line(self):   168    169         "Return a content line object for the current line."   170    171         return ContentLine(self.read_content_line())   172    173 class ContentLine:   174    175     "A content line which can be searched."   176    177     SEPARATORS = re.compile('[;:"]')   178     SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')   179    180     def __init__(self, text):   181         self.text = text   182         self.start = 0   183    184     def __repr__(self):   185         return "ContentLine(%r)" % self.text   186    187     def get_remaining(self):   188    189         "Get the remaining text from the content line."   190    191         return self.text[self.start:]   192    193     def search(self, targets):   194    195         """   196         Find one of the 'targets' in the text, returning the string from the   197         current position up to the target found, along with the target string,   198         using a tuple of the form (string, target). If no target was found,   199         return the entire string together with a target of None.   200    201         The 'targets' parameter must be a regular expression object or an object   202         compatible with the API of such objects.   203         """   204    205         text = self.text   206         start = pos = self.start   207         length = len(text)   208    209         # Remember the first target.   210    211         first = None   212         first_pos = None   213         in_quoted_region = 0   214    215         # Process the text, looking for the targets.   216    217         while pos < length:   218             match = targets.search(text, pos)   219    220             # Where nothing matches, end the search.   221    222             if match is None:   223                 pos = length   224    225             # Where a double quote matches, toggle the region state.   226    227             elif match.group() == '"':   228                 in_quoted_region = not in_quoted_region   229                 pos = match.end()   230    231             # Where something else matches outside a region, stop searching.   232    233             elif not in_quoted_region:   234                 first = match.group()   235                 first_pos = match.start()   236                 break   237    238             # Otherwise, keep looking for the end of the region.   239    240             else:   241                 pos = match.end()   242    243         # Where no more input can provide the targets, return a special result.   244    245         else:   246             self.start = length   247             return text[start:], None   248    249         self.start = match.end()   250         return text[start:first_pos], first   251    252 class StreamParser:   253    254     "A stream parser for content in vCard/vCalendar/iCalendar-like formats."   255    256     def __init__(self, f):   257    258         "Initialise the parser for the given file 'f'."   259    260         self.f = f   261    262     def close(self):   263    264         "Close the reader."   265    266         self.f.close()   267    268     def __iter__(self):   269    270         "Return self as the iterator."   271    272         return self   273    274     def next(self):   275    276         """   277         Return the next content item in the file as a tuple of the form   278         (name, parameters, values).   279         """   280    281         return self.parse_content_line()   282    283     def decode_content(self, value):   284    285         "Decode the given 'value', replacing quoted characters."   286    287         return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")   288    289     # Internal methods.   290    291     def parse_content_line(self):   292    293         """   294         Return the name, parameters and value information for the current   295         content line in the file being parsed.   296         """   297    298         f = self.f   299         line_number = f.line_number   300         line = f.get_content_line()   301    302         # Read the property name.   303    304         name, sep = line.search(line.SEPARATORS)   305         name = name.strip()   306    307         if not name and sep is None:   308             raise StopIteration   309    310         # Read the parameters.   311    312         parameters = {}   313    314         while sep == ";":   315    316             # Find the actual modifier.   317    318             parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)   319             parameter_name = parameter_name.strip()   320    321             if sep == "=":   322                 parameter_value, sep = line.search(line.SEPARATORS)   323                 parameter_value = parameter_value.strip()   324             else:   325                 parameter_value = None   326    327             # Append a key, value tuple to the parameters list.   328    329             parameters[parameter_name] = parameter_value   330    331         # Get the value content.   332    333         if sep != ":":   334             raise ValueError, (line_number, line)   335    336         # Obtain and decode the value.   337    338         value = self.decode(name, parameters, line.get_remaining())   339    340         return name, parameters, value   341    342     def decode(self, name, parameters, value):   343    344         "Decode using 'name' and 'parameters' the given 'value'."   345    346         encoding = parameters.get("ENCODING")   347         charset = parameters.get("CHARSET")   348    349         value = self.decode_content(value)   350    351         if encoding == "QUOTED-PRINTABLE":   352             return unicode(quopri.decodestring(value), charset or "iso-8859-1")   353         elif encoding == "BASE64":   354             return base64.decodestring(value)   355         else:   356             return value   357    358 class ParserBase:   359    360     "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."   361    362     def __init__(self):   363    364         "Initialise the parser."   365    366         self.names = []   367    368     def parse(self, f, parser_cls=None):   369    370         "Parse the contents of the file 'f'."   371    372         parser = (parser_cls or StreamParser)(f)   373    374         for name, parameters, value in parser:   375    376             if name == "BEGIN":   377                 self.names.append(value)   378                 self.startComponent(value, parameters)   379    380             elif name == "END":   381                 start_name = self.names.pop()   382                 if start_name != value:   383                     raise ParseError, "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (   384                         start_name, value, f.line_number)   385    386                 self.endComponent(value)   387    388             else:   389                 self.handleProperty(name, parameters, value)   390    391 class Parser(ParserBase):   392    393     "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."   394    395     def __init__(self):   396         ParserBase.__init__(self)   397         self.components = []   398    399     def startComponent(self, name, parameters):   400    401         """   402         Add the component with the given 'name' and 'parameters', recording an   403         empty list of children as part of the component's content.   404         """   405    406         component = self.handleProperty(name, parameters)   407         self.components.append(component)   408         return component   409    410     def endComponent(self, name):   411    412         """   413         End the component with the given 'name' by removing it from the active   414         component stack. If only one component exists on the stack, retain it   415         for later inspection.   416         """   417    418         if len(self.components) > 1:   419             return self.components.pop()   420    421         # Or return the only element.   422    423         elif self.components:   424             return self.components[0]   425    426     def handleProperty(self, name, parameters, value=None):   427    428         """   429         Record the property with the given 'name', 'parameters' and optional   430         'value' as part of the current component's children.   431         """   432    433         component = self.makeComponent(name, parameters, value)   434         self.attachComponent(component)   435         return component   436    437     # Component object construction/manipulation methods.   438    439     def attachComponent(self, component):   440    441         "Attach the given 'component' to its parent."   442    443         if self.components:   444             component_name, component_parameters, component_children = self.components[-1]   445             component_children.append(component)   446    447     def makeComponent(self, name, parameters, value=None):   448    449         """   450         Make a component object from the given 'name', 'parameters' and optional   451         'value'.   452         """   453    454         return (name, parameters, value or [])   455    456     # Public methods.   457    458     def parse(self, f, parser_cls=None):   459    460         "Parse the contents of the file 'f'."   461    462         ParserBase.parse(self, f, parser_cls)   463         try:   464             return self.components[0]   465         except IndexError:   466             raise ParseError, "No vContent component found in file."   467    468 # Writer classes.   469    470 class Writer:   471    472     "A simple class wrapping a file, providing simple output capabilities."   473    474     default_line_length = 76   475    476     def __init__(self, write, line_length=None):   477    478         """   479         Initialise the object with the given 'write' operation. If 'line_length'   480         is set, the length of written lines will conform to the specified value   481         instead of the default value.    482         """   483    484         self._write = write   485         self.line_length = line_length or self.default_line_length   486         self.char_offset = 0   487    488     def write(self, text):   489    490         "Write the 'text' to the file."   491    492         write = self._write   493         line_length = self.line_length   494    495         i = 0   496         remaining = len(text)   497    498         while remaining:   499             space = line_length - self.char_offset   500             if remaining > space:   501                 write(text[i:i + space])   502                 write("\r\n ")   503                 self.char_offset = 1   504                 i += space   505                 remaining -= space   506             else:   507                 write(text[i:])   508                 self.char_offset += remaining   509                 i += remaining   510                 remaining = 0   511    512     def end_line(self):   513    514         "End the current content line."   515    516         if self.char_offset > 0:   517             self.char_offset = 0   518             self._write("\r\n")   519    520 class StreamWriter:   521    522     "A stream writer for content in vCard/vCalendar/iCalendar-like formats."   523    524     def __init__(self, f):   525    526         "Initialise the stream writer with the given 'f' stream object."   527    528         self.f = f   529    530     def append(self, record):   531         self.write(*record)   532    533     def write(self, name, parameters, value):   534    535         """   536         Write a content line, serialising the given 'name', 'parameters' and   537         'value' information.   538         """   539    540         self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))   541    542     # Internal methods.   543    544     def write_content_line(self, name, encoded_parameters, encoded_value):   545    546         """   547         Write a content line for the given 'name', 'encoded_parameters' and   548         'encoded_value' information.   549         """   550    551         f = self.f   552    553         f.write(name)   554         for param_name, param_value in encoded_parameters.items():   555             f.write(";")   556             f.write(param_name)   557             f.write("=")   558             f.write(param_value)   559         f.write(":")   560         f.write(encoded_value)   561         f.end_line()   562    563     def encode_quoted_parameter_value(self, value):   564    565         "Encode the given 'value'."   566    567         return '"%s"' % value   568    569     def encode_value(self, name, parameters, value):   570    571         """   572         Encode using 'name' and 'parameters' the given 'value' so that the   573         resulting encoded form employs any specified character encodings.   574         """   575    576         encoding = parameters.get("ENCODING")   577         charset = parameters.get("CHARSET")   578    579         try:   580             if encoding == "QUOTED-PRINTABLE":   581                 value = quopri.encodestring(value.encode(charset or "iso-8859-1"))   582             elif encoding == "BASE64":   583                 value = base64.encodestring(value)   584    585             return self.encode_content(value)   586         except TypeError:   587             raise WriteError, "Property %r value with parameters %r cannot be encoded: %r" % (name, parameters, value)   588    589     # Overrideable methods.   590    591     def encode_parameters(self, parameters):   592    593         """   594         Encode the given 'parameters' according to the vCalendar specification.   595         """   596    597         encoded_parameters = {}   598    599         for param_name, param_value in parameters.items():   600    601             # Basic format support merely involves quoting values which seem to   602             # need it. Other more specific formats may define exactly which   603             # parameters should be quoted.   604    605             if ContentLine.SEPARATORS.search(param_value):   606                 param_value = self.encode_quoted_parameter_value(param_value)   607    608             encoded_parameters[param_name] = param_value   609    610         return encoded_parameters   611    612     def encode_content(self, value):   613    614         "Encode the given 'value', quoting characters."   615    616         return (value or "").replace("\n", "\\n")   617    618 # Utility functions.   619    620 def is_input_stream(stream_or_string):   621     return hasattr(stream_or_string, "read")   622    623 def get_input_stream(stream_or_string, encoding=None):   624     if is_input_stream(stream_or_string):   625         if isinstance(stream_or_string, codecs.StreamReader):   626             return stream_or_string   627         else:   628             return codecs.getreader(encoding or default_encoding)(stream_or_string)   629     else:   630         return codecs.open(stream_or_string, encoding=(encoding or default_encoding))   631    632 def get_output_stream(stream_or_string, encoding=None):   633     if hasattr(stream_or_string, "write"):   634         if isinstance(stream_or_string, codecs.StreamWriter):   635             return stream_or_string   636         else:   637             return codecs.getwriter(encoding or default_encoding)(stream_or_string)   638     else:   639         return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))   640    641 def items_to_dict(items, sections=None):   642    643     """   644     Return the given 'items' as a dictionary mapping names to tuples of the form   645     (value, attributes). Where 'sections' is provided, only items whose names   646     occur in the given 'sections' collection will be treated as groups or   647     sections of definitions.   648     """   649    650     d = {}   651     for name, attr, value in items:   652         if not d.has_key(name):   653             d[name] = []   654         if isinstance(value, list) and (not sections or name in sections):   655             d[name].append((items_to_dict(value, sections), attr))   656         else:   657             d[name].append((value, attr))   658     return d   659    660 def dict_to_items(d):   661    662     """   663     Return 'd' converted to a list of items suitable for serialisation using   664     iterwrite.   665     """   666    667     items = []   668     for name, value in d.items():   669         if isinstance(value, list):   670             for v, a in value:   671                 if isinstance(v, dict):   672                     items.append((name, a, dict_to_items(v)))   673                 else:   674                     items.append((name, a, v))   675         else:   676             v, a = value   677             items.append((name, a, dict_to_items(v)))   678     return items   679    680 # Public functions.   681    682 def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):   683    684     """   685     Parse the resource data found through the use of the 'stream_or_string',   686     which is either a stream providing Unicode data (the codecs module can be   687     used to open files or to wrap streams in order to provide Unicode data) or a   688     filename identifying a file to be parsed.   689    690     The optional 'encoding' can be used to specify the character encoding used   691     by the file to be parsed.   692    693     The optional 'non_standard_newline' can be set to a true value (unlike the   694     default) in order to attempt to process files with CR as the end of line   695     character.   696    697     As a result of parsing the resource, the root node of the imported resource   698     is returned.   699     """   700    701     stream = get_input_stream(stream_or_string, encoding)   702     reader = Reader(stream, non_standard_newline)   703    704     # Parse using the reader.   705    706     try:   707         parser = (parser_cls or Parser)()   708         return parser.parse(reader)   709    710     # Close any opened streams.   711    712     finally:   713         if not is_input_stream(stream_or_string):   714             reader.close()   715    716 def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):   717    718     """   719     Parse the resource data found through the use of the 'stream_or_string',   720     which is either a stream providing Unicode data (the codecs module can be   721     used to open files or to wrap streams in order to provide Unicode data) or a   722     filename identifying a file to be parsed.   723    724     The optional 'encoding' can be used to specify the character encoding used   725     by the file to be parsed.   726    727     The optional 'non_standard_newline' can be set to a true value (unlike the   728     default) in order to attempt to process files with CR as the end of line   729     character.   730    731     An iterator is returned which provides event tuples describing parsing   732     events of the form (name, parameters, value).   733     """   734    735     stream = get_input_stream(stream_or_string, encoding)   736     reader = Reader(stream, non_standard_newline)   737     parser = (parser_cls or StreamParser)(reader)   738     return parser   739    740 def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None):   741    742     """   743     Return a writer which will either send data to the resource found through   744     the use of 'stream_or_string' or using the given 'write' operation.   745    746     The 'stream_or_string' parameter may be either a stream accepting Unicode   747     data (the codecs module can be used to open files or to wrap streams in   748     order to accept Unicode data) or a filename identifying a file to be   749     written.   750    751     The optional 'encoding' can be used to specify the character encoding used   752     by the file to be written.   753    754     The optional 'line_length' can be used to specify how long lines should be   755     in the resulting data.   756     """   757    758     if stream_or_string:   759         stream = get_output_stream(stream_or_string, encoding)   760         _writer = Writer(stream.write, line_length)   761     elif write:   762         _writer = Writer(write, line_length)   763     else:   764         raise IOError, "No stream, filename or write operation specified."   765    766     return (writer_cls or StreamWriter)(_writer)   767    768 def to_dict(node, sections=None):   769    770     "Return the 'node' converted to a dictionary representation."   771    772     name, attr, items = node   773     return {name : (isinstance(items, list) and items_to_dict(items, sections) or items, attr)}   774    775 def to_node(d):   776    777     "Return 'd' converted to a items-based representation."   778    779     return dict_to_items(d)[0]   780    781 # vim: tabstop=4 expandtab shiftwidth=4