#!/usr/bin/env python

# Reconstructed from the following Mercurial changeset patch:
#
#   HG changeset patch
#   User Paul Boddie
#   Date 1224186930 -7200
#   Node ID 09250e78af5e1f0a4e1df94c686ff3a6799d8fcf
#   Created a vContent module based on essential parts of the
#   RDFCalendar.Parsers module.
#
#   diff -r 000000000000 -r 09250e78af5e vContent.py
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/vContent.py Thu Oct 16 21:55:30 2008 +0200

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program.  If not, see http://www.gnu.org/licenses/.

--------

References:

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://rfc.net/rfc2445.html

RFC 2425: A MIME Content-Type for Directory Information
          http://rfc.net/rfc2425.html

RFC 2426: vCard MIME Directory Profile
          http://rfc.net/rfc2426.html
"""

# Encoding-related imports.

import base64
import quopri
import re

# Escaped characters in text values (see 4.3.11 in RFC 2445). The sequences
# are recognised in a single left-to-right pass so that an escaped backslash
# is never mistaken for the start of another escape sequence.

_ESCAPED_CHAR = re.compile(r"\\([\\;,nN])")

_UNESCAPED = {
    "\\": "\\",
    ";": ";",
    ",": ",",
    "n": "\n",
    "N": "\n",
}


def _unescape(match):

    "Return the character denoted by the escape sequence in 'match'."

    return _UNESCAPED[match.group(1)]


class ParseError(Exception):

    "An error raised when the structure of the parsed content is invalid."


# Simple reader class.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back lines, used as a stack: the last line pushed is the
        # next line read.

        self.lines = []
        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned at the end of the file.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        if self.non_standard_newline:

            # A trailing CR is regarded as a complete line ending.

            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Keep joining input until the line no longer ends with a lone CR,
        # stopping at the end of the file instead of waiting forever for
        # more data.

        while line.endswith("\r"):
            continuation = self.f.readline()
            if not continuation:
                break
            line += continuation

        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen. Return the
        string from the current position up to the target found, along with the
        target string, using a tuple of the form (string, target). If no target
        was found, return the entire string together with a target of None.

        Any text read beyond the target is pushed back, line by line, so that
        it is returned by subsequent reads.
        """

        # Positions in the accumulated text at which targets were found.

        indexes = {}

        # Remember the entire text read and the index of the current line in
        # that text.

        lines = []

        line = self.readline()
        lines.append(line)
        start = 0

        while not indexes and line != "":
            for target in targets:
                index = line.find(target)

                # Always choose the first matching target.

                if index != -1 and (start + index) not in indexes:
                    indexes[start + index] = target

            start += len(line)
            line = self.readline()
            lines.append(line)

        text = "".join(lines)

        if not indexes:
            return text, None

        min_index = min(indexes)
        target = indexes[min_index]

        # Skip the target.
        # Since the end of the buffer should always be a newline, ignore the
        # last (empty) element produced by splitting.

        remaining = text[min_index + len(target):].split("\n")
        if not remaining[-1]:
            del remaining[-1]

        # Push the lines back in reverse so that they are read in order.

        remaining.reverse()
        for line in remaining:
            self.pushback(line + "\n")

        return text[:min_index], target


class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'readline', 'pushback' and 'read_until' methods together with the
        'line_number' attribute of a Reader.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values).
        """

        return self.parse_content_line()

    # Support the iterator protocol of newer Python versions.

    __next__ = next

    def parse_content_line(self):

        """
        Return the name, parameters and the decoded value for the current
        content line in the file being parsed, as a tuple of the form
        (name, parameters, value) where 'parameters' is a dictionary mapping
        parameter names to values (or to None where no value was given).

        Raise StopIteration when no more content is available, and ValueError
        (with the current line number) where a content line has no value.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until([";", ":"])

        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(["=", ";", ":"])
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until([";", ":"])
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in (" ", "\t"):
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        f.pushback(line)

        # Decode the value.

        value = self.decode("".join(value_lines), parameters.get("ENCODING"))

        return name, parameters, value

    def decode(self, value, encoding):

        """
        Decode the 'value' with the given 'encoding', which is the value of
        any ENCODING parameter (or None) and is compared without regard to
        case.

        Quoted-printable values are returned as text, base64 values ("BASE64"
        or the vCard 3.0 "B") are returned as the decoded binary data, and all
        other values are returned with carriage returns removed and escaped
        characters replaced.
        """

        encoding = (encoding or "").upper()

        # NOTE: Assuming ISO 8859-1 for the character set.

        if encoding == "QUOTED-PRINTABLE":

            # The quopri module works on byte strings.

            if not isinstance(value, bytes):
                value = value.encode("iso-8859-1")
            return quopri.decodestring(value).decode("iso-8859-1")

        elif encoding in ("BASE64", "B"):
            return base64.b64decode(value)

        else:
            # NOTE: Introducing newline conversions.
            # Replace quoted characters (see 4.3.11 in RFC 2445).

            return _ESCAPED_CHAR.sub(_unescape, value.replace("\r", ""))


class Parser:

    "A parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        self.elements = [] # also known as components
        self.document = []
        self.current = self.document

    def parse(self, f):

        """
        Parse the contents of the file 'f', which must be a Reader (or an
        object providing the same methods and attributes), returning the
        document: a list of (name, parameters, children) tuples for components
        and (name, parameters, value) tuples for other content.

        Raise ParseError where BEGIN and END declarations do not match.
        """

        parser = StreamParser(f)

        for name, parameters, value in parser:

            # Add new elements/components to the current position in the
            # document, recording the element as the active element.

            if name == "BEGIN":
                children = []
                element = (value, parameters, children)
                self.elements.append(element)
                self.current.append(element)
                self.current = children

            # End elements by removing them from the active element stack and
            # making the next element's children the current position for new
            # content.

            elif name == "END":
                if not self.elements:
                    raise ParseError(
                        "END declaration (%r) without a matching BEGIN at line %d." % (
                        value, f.line_number))

                start_value, start_parameters, children = self.elements.pop()

                if start_value != value:
                    raise ParseError(
                        "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_value, value, f.line_number))

                if self.elements:
                    parent_value, parent_parameters, children = self.elements[-1]
                    self.current = children
                else:
                    self.current = self.document

            else:
                self.current.append((name, parameters, value))

        return self.document


# Public functions.

def parse(f, non_standard_newline=0):

    """
    Parse the resource data found through the use of the file object 'f', which
    should provide Unicode data. (The codecs module can be used to open files
    or to wrap streams in order to provide Unicode data.)

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    As a result of parsing the resource, the document is returned as a list of
    the top-level items found in the resource.
    """

    reader = Reader(f, non_standard_newline=non_standard_newline)
    parser = Parser()
    return parser.parse(reader)

# vim: tabstop=4 expandtab shiftwidth=4