# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1224186930 -7200
# Node ID 09250e78af5e1f0a4e1df94c686ff3a6799d8fcf

Created a vContent module based on essential parts of the RDFCalendar.Parsers
module.

diff -r 000000000000 -r 09250e78af5e vContent.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vContent.py	Thu Oct 16 21:55:30 2008 +0200
@@ -0,0 +1,319 @@
+#!/usr/bin/env python
+
+"""
+Parsing of vCard, vCalendar and iCalendar files.
+
+Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option) any
+later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+details.
+
+You should have received a copy of the GNU Lesser General Public License along
+with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+--------
+
+References:
+
+RFC 2445: Internet Calendaring and Scheduling Core Object Specification
+          (iCalendar)
+          http://rfc.net/rfc2445.html
+
+RFC 2425: A MIME Content-Type for Directory Information
+          http://rfc.net/rfc2425.html
+
+RFC 2426: vCard MIME Directory Profile
+          http://rfc.net/rfc2426.html
+"""
+
+# Encoding-related imports.
+
+import base64, quopri
+
+# Simple reader class.
+
+class Reader:
+
+    "A simple class wrapping a file, providing simple pushback capabilities."
+
+    def __init__(self, f, non_standard_newline=0):
+
+        """
+        Initialise the object with the file 'f'. If 'non_standard_newline' is
+        set to a true value (unlike the default), lines ending with CR will be
+        treated as complete lines.
+        """
+
+        self.f = f
+        self.non_standard_newline = non_standard_newline
+        self.lines = []         # pushed-back lines; the last one is read next
+        self.line_number = 0    # raised by readline, lowered by pushback
+
+    def pushback(self, line):
+
+        """
+        Push the given 'line' back so that the next line read is actually the
+        given 'line' and not the next line from the underlying file.
+        """
+
+        self.lines.append(line)
+        self.line_number -= 1
+
+    def readline(self):
+
+        """
+        Return the most recently pushed-back line if any exist. Otherwise, read
+        a line directly from the file, returning an empty string at end of file.
+        """
+
+        self.line_number += 1
+        if self.lines:
+            return self.lines.pop()
+
+        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
+        line = self.f.readline()
+        while line.endswith("\r") and not self.non_standard_newline:
+            more = self.f.readline()
+            if not more:
+                break   # end of file: stop instead of looping on a final CR
+            line += more
+        if line.endswith("\r") and self.non_standard_newline:
+            return line + "\n"
+        return line
+
+    def read_until(self, targets):
+
+        """
+        Read from the stream until one of the 'targets' is seen. Return the
+        string from the current position up to the target found, along with the
+        target string, using a tuple of the form (string, target). If no target
+        was found, return the entire string together with a target of None.
+        """
+
+        indexes = {}
+
+        # Remember the entire text read and the index of the current line in
+        # that text.
+
+        lines = []
+        line = self.readline()
+        lines.append(line)
+        start = 0
+
+        while not indexes and line != "":
+            for target in targets:
+                index = line.find(target)
+
+                # Always choose the first matching target.
+
+                if index != -1 and (start + index) not in indexes:
+                    indexes[start + index] = target
+
+            start += len(line)
+            line = self.readline()
+            lines.append(line)
+
+        text = "".join(lines)
+        if not indexes:
+            return text, None
+
+        min_index = min(indexes)
+        target = indexes[min_index]
+
+        # Skip the target and push back everything read after it.
+        # Since the end of the buffer should always be a newline, ignore the
+        # last element.
+
+        lines = text[min_index + len(target):].split("\n")
+        if not lines[-1]:
+            del lines[-1]
+        lines.reverse()
+
+        for line in lines:
+            self.pushback(line + "\n")
+
+        return text[:min_index], target
+
+class StreamParser:
+
+    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."
+
+    def __init__(self, f):
+
+        "Initialise the parser for the given file 'f'."
+
+        self.f = f
+
+    def __iter__(self):
+
+        "Return self as the iterator."
+
+        return self
+
+    def next(self):
+
+        """
+        Return the next content item in the file as a tuple of the form
+        (name, parameters, value), raising StopIteration at the end of input.
+        """
+
+        return self.parse_content_line()
+
+    def parse_content_line(self):
+
+        """
+        Return a (name, parameters, value) tuple for the current content line
+        in the file being parsed: 'parameters' maps each parameter name to its
+        value (or None where no value is given); 'value' is the decoded value.
+
+        Raise StopIteration where only whitespace remains before the end of the
+        input, and ValueError (with the line number) where no colon is found.
+        """
+
+        reader = self.f
+
+        name, sep = reader.read_until([";", ":"])
+        name = name.strip()
+
+        if sep is None and not name:
+            raise StopIteration
+
+        # Collect the parameters, each of which is introduced by a semicolon.
+
+        parameters = {}
+
+        while sep == ";":
+            parameter_name, sep = reader.read_until(["=", ";", ":"])
+            parameter_value = None
+
+            # Only a parameter name followed by "=" is given a value.
+
+            if sep == "=":
+                parameter_value, sep = reader.read_until([";", ":"])
+                parameter_value = parameter_value.strip()
+
+            parameters[parameter_name.strip()] = parameter_value
+
+        # Get the value content, which must be introduced by a colon.
+
+        if sep != ":":
+            raise ValueError(reader.line_number)
+
+        # Strip the line ending from each line of the value.
+        # For subsequent lines, remove the first whitespace character.
+        # See section 4.1 of the iCalendar specification.
+
+        value_lines = [reader.readline().rstrip("\r\n")]
+        line = reader.readline()
+
+        while line[:1] in (" ", "\t"):
+            value_lines.append(line.rstrip("\r\n")[1:])
+            line = reader.readline()
+
+        # Since one line too many will have been read, push the line back into
+        # the file.
+
+        reader.pushback(line)
+
+        value = "".join(value_lines)
+        return name, parameters, self.decode(value, parameters.get("ENCODING"))
+
+    def decode(self, value, encoding):
+
+        "Decode the 'value' with the given 'encoding'."
+
+        # NOTE: Assuming ISO 8859-1 for the character set.
+
+        if encoding == "QUOTED-PRINTABLE":
+            return unicode(quopri.decodestring(value), "iso-8859-1")
+        if encoding == "BASE64":
+            return base64.decodestring(value)
+
+        # NOTE: Introducing newline conversions.
+        # Replace quoted characters (see 4.3.11 in RFC 2445).
+
+        for quoted, plain in ("\r", ""), ("\\N", "\n"), ("\\n", "\n"), ("\\,", ","), ("\\;", ";"):
+            value = value.replace(quoted, plain)
+        return value
+
+class ParseError(Exception):
+    "An error in the structure of the content being parsed."
+
+class Parser:
+
+    "A parser for content in vCard/vCalendar/iCalendar-like formats."
+
+    def __init__(self):
+
+        "Initialise the parser."
+
+        self.elements = [] # also known as components
+        self.document = []
+        self.current = self.document
+
+    def parse(self, f):
+
+        """
+        Parse the contents of the reader 'f' and return the document list.
+        Raise ParseError where BEGIN and END declarations do not match up.
+        """
+
+        for name, parameters, value in StreamParser(f):
+
+            # Open a component in the current position and descend into it.
+            if name == "BEGIN":
+                children = []
+                element = (value, parameters, children)
+                self.elements.append(element)
+                self.current.append(element)
+                self.current = children
+
+            # Close the active component, resuming in its parent or the document.
+            elif name == "END":
+                if not self.elements:
+                    raise ParseError("END declaration (%r) without a BEGIN declaration at line %d." % (
+                        value, f.line_number))
+                start_value = self.elements.pop()[0]
+                if start_value != value:
+                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
+                        start_value, value, f.line_number))
+                if self.elements:
+                    self.current = self.elements[-1][2]
+                else:
+                    self.current = self.document
+
+            else:
+                self.current.append((name, parameters, value))
+
+        return self.document
+
+# Public functions.
+
+def parse(f, non_standard_newline=0):
+
+    """
+    Parse the resource data found through the use of the file object 'f', which
+    should provide Unicode data. (The codecs module can be used to open files
+    or to wrap streams in order to provide Unicode data.)
+
+    The optional 'non_standard_newline' can be set to a true value (unlike the
+    default) in order to attempt to process files with CR as the end of line
+    character.
+
+    Return the parsed document: a list whose items are (name, parameters,
+    value) tuples for plain content lines and (value, parameters, children)
+    tuples for each section delimited by BEGIN and END lines.
+    """
+
+    return Parser().parse(
+        Reader(f, non_standard_newline=non_standard_newline)
+        )
+
+# vim: tabstop=4 expandtab shiftwidth=4