1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/vContent.py Sun Sep 21 23:37:24 2014 +0200
1.3 @@ -0,0 +1,701 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Parsing of vCard, vCalendar and iCalendar files.
1.8 +
1.9 +Copyright (C) 2005, 2006, 2007, 2008, 2009, 2011, 2013 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT
1.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
1.19 +details.
1.20 +
1.21 +You should have received a copy of the GNU General Public License along with
1.22 +this program. If not, see <http://www.gnu.org/licenses/>.
1.23 +
1.24 +--------
1.25 +
1.26 +References:
1.27 +
1.28 +RFC 5545: Internet Calendaring and Scheduling Core Object Specification
1.29 + (iCalendar)
1.30 + http://tools.ietf.org/html/rfc5545
1.31 +
1.32 +RFC 2445: Internet Calendaring and Scheduling Core Object Specification
1.33 + (iCalendar)
1.34 + http://tools.ietf.org/html/rfc2445
1.35 +
1.36 +RFC 2425: A MIME Content-Type for Directory Information
1.37 + http://tools.ietf.org/html/rfc2425
1.38 +
1.39 +RFC 2426: vCard MIME Directory Profile
1.40 + http://tools.ietf.org/html/rfc2426
1.41 +"""
1.42 +
1.43 +try:
1.44 + set
1.45 +except NameError:
1.46 + from sets import Set as set
1.47 +
1.48 +# Encoding-related imports.
1.49 +
1.50 +import base64, quopri
1.51 +import codecs
1.52 +
1.53 +# Tokenisation help.
1.54 +
1.55 +import re
1.56 +
1.57 +# Configuration.
1.58 +
# Character encoding used when a file is opened by name and the caller does
# not supply an explicit encoding.
default_encoding = "utf-8"
1.60 +
1.61 +# Reader and parser classes.
1.62 +
class Reader:

    "A thin wrapper around a file object which supports pushing lines back."

    def __init__(self, f, non_standard_newline=0):

        """
        Wrap the file 'f'. Where 'non_standard_newline' is given as a true
        value (it is false by default), a line ending in a bare CR is treated
        as a complete line.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back lines, most recently pushed last.

        self.lines = []

        # The number of the line about to be read.

        self.line_number = 1

    def close(self):

        "Close the wrapped file."

        self.f.close()

    def pushback(self, line):

        """
        Return 'line' to the reader so that the next call to readline yields
        it instead of fresh data from the wrapped file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        Return the most recently pushed-back line if there is one; otherwise
        read a line from the wrapped file.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        text = self.f.readline()

        # In non-standard mode a trailing CR terminates the line: complete it
        # with LF and hand it back.

        if self.non_standard_newline:
            if text.endswith("\r"):
                text += "\n"
            return text

        # Otherwise a trailing CR indicates a broken line (\r instead of \r\n
        # or \n): keep appending data until that is no longer the case or the
        # file is exhausted.

        while text.endswith("\r"):
            more = self.f.readline()
            if not more:
                break
            text += more

        return text

    def read_content_line(self):

        """
        Read one logical content line, which may span several physical lines
        of the file, and return it as a single string. An empty string is
        returned when no more content is available.
        """

        # Find the first physical line that is not blank.

        while True:
            physical = self.readline()
            if not physical:
                return ""
            first = physical.rstrip("\r\n")
            if first:
                break

        # Gather continuation lines: each starts with a space or tab which is
        # dropped, and each has its line ending removed.
        # See section 4.1 of the iCalendar specification.

        parts = [first]
        following = self.readline()

        while following[:1] in (" ", "\t"):
            parts.append(following[1:].rstrip("\r\n"))
            following = self.readline()

        # One line beyond the content line has been consumed: give it back.

        if following:
            self.pushback(following)

        return "".join(parts)

    def get_content_line(self):

        "Return a ContentLine object wrapping the next content line."

        return ContentLine(self.read_content_line())
1.162 +
class ContentLine:

    "A content line whose text can be consumed piece by piece."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):
        self.text = text

        # Position in the text from which the next search begins.

        self.start = 0

    def __repr__(self):
        return "ContentLine(%r)" % self.text

    def get_remaining(self):

        "Return the text which has not yet been consumed."

        return self.text[self.start:]

    def search(self, targets):

        """
        Look for one of the 'targets' from the current position onwards,
        ignoring any occurring inside double-quoted regions. Return a tuple
        (string, target) where 'string' is the text between the current
        position and the target found. The position is then moved past the
        target.

        Where no target is found, the rest of the text is returned with a
        target of None, and the position is moved to the end of the text.

        The 'targets' parameter must be a regular expression object or an
        object compatible with the API of such objects.
        """

        text = self.text
        begin = cursor = self.start
        end = len(text)
        quoted = False

        while cursor < end:
            found = targets.search(text, cursor)

            # Nothing further matches: give up.

            if found is None:
                break

            token = found.group()

            # A double quote opens or closes a quoted region.

            if token == '"':
                quoted = not quoted

            # Any other target outside a quoted region ends the search.

            elif not quoted:
                self.start = found.end()
                return text[begin:found.start()], token

            # Continue after whatever was matched.

            cursor = found.end()

        # The targets cannot be found in the remaining input.

        self.start = end
        return text[begin:], None
1.241 +
class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given reader 'f', which must provide a
        'line_number' attribute along with 'get_content_line' and 'close'
        methods.
        """

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, value). StopIteration is raised when no more
        content is available.
        """

        return self.parse_content_line()

    def __next__(self):

        "Support the iterator protocol of newer Python versions."

        return self.next()

    def decode_content(self, value):

        """
        Decode the given 'value', removing CR characters and replacing the
        quoted forms \\N and \\n with newline characters.
        """

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        StopIteration is raised where the content line is empty, and
        ValueError is raised with the line number and content line as
        arguments where the line has no value part.
        """

        f = self.f

        # Remember where the content line starts for error reporting.

        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        # An empty line without any separator indicates the end of the input.

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the parameter name.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            # Parameters without "=" are recorded with a value of None.

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value in the parameters dictionary.

            parameters[parameter_name] = parameter_value

        # A value separator must follow the name and parameters.

        if sep != ":":
            raise ValueError(line_number, line)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Quoted-printable values are returned as Unicode text decoded using the
        CHARSET parameter (or ISO-8859-1 if absent), base64 values are returned
        as the decoded byte string, and all other values are returned after
        only the replacement of quoted characters.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":

            # Decoding the resulting byte string with its decode method
            # avoids relying on the 'unicode' built-in.

            return quopri.decodestring(value).decode(charset or "iso-8859-1")

        elif encoding == "BASE64":

            # base64.decodestring has been removed from newer Python versions.

            return base64.b64decode(value)

        else:
            return value
1.347 +
class ParseError(Exception):

    "An error raised where BEGIN and END declarations are inconsistent."

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.

    Subclasses are expected to provide the startComponent, endComponent and
    handleProperty methods called by the parse method.
    """

    def __init__(self):

        "Initialise the parser."

        # The names of the components currently open, innermost last.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the reader 'f', using 'parser_cls' (or
        StreamParser if omitted) to obtain (name, parameters, value) items.

        ParseError is raised where an END declaration does not correspond to
        the most recent unmatched BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":

                # An END without any open component cannot be matched.

                if not self.names:
                    raise ParseError("END declaration (%r) without a matching BEGIN at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)
1.380 +
class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):
        ParserBase.__init__(self)

        # The stack of components currently being populated.

        self.components = []

    def startComponent(self, name, parameters):

        """
        Create a component from 'name' and 'parameters', attach it to the
        current component (if any) and make it the current component. The
        component starts out with an empty list of children.
        """

        opened = self.handleProperty(name, parameters)
        self.components.append(opened)
        return opened

    def endComponent(self, name):

        """
        Finish the component with the given 'name', removing it from the stack
        of active components and returning it. The last remaining component is
        returned but kept on the stack so that it can be inspected later.
        """

        stack = self.components

        if len(stack) > 1:
            return stack.pop()

        # Keep a sole remaining component in place.

        if stack:
            return stack[0]

        return None

    def handleProperty(self, name, parameters, value=None):

        """
        Make an item from 'name', 'parameters' and the optional 'value', add it
        to the children of the current component, and return it.
        """

        node = self.makeComponent(name, parameters, value)
        self.attachComponent(node)
        return node

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        "Add the given 'component' to the children of the current component."

        if not self.components:
            return

        _name, _parameters, children = self.components[-1]
        children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Return a tuple (name, parameters, content) built from the given 'name',
        'parameters' and optional 'value', where the content is a new empty
        list if 'value' is false.
        """

        content = value or []
        return name, parameters, content

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f' and return the first component on
        the component stack.
        """

        ParserBase.parse(self, f, parser_cls)
        root = self.components[0]
        return root
1.454 +
1.455 +# Writer classes.
1.456 +
class Writer:

    """
    A simple class wrapping a write operation, folding the written text into
    lines of limited length.
    """

    default_line_length = 76

    def __init__(self, write, line_length=None):

        """
        Initialise the object with the given 'write' operation. If 'line_length'
        is set, the length of written lines will conform to the specified value
        instead of the default value.

        ValueError is raised if the line length is less than 2, since each
        continuation line starts with a space and a shorter line would leave
        no room for any text.
        """

        line_length = line_length or self.default_line_length

        if line_length < 2:
            raise ValueError("line_length must be at least 2, not %r." % (line_length,))

        self._write = write
        self.line_length = line_length

        # The number of characters already written on the current line.

        self.char_offset = 0

    def write(self, text):

        """
        Write the 'text' using the write operation, starting a continuation
        line (CRLF followed by a space) whenever the current line is full.
        """

        write = self._write
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset

            # Fill the current line and continue on a new one.

            if remaining > space:
                write(text[i:i + space])
                write("\r\n ")

                # The leading space occupies the first position.

                self.char_offset = 1
                i += space
                remaining -= space

            # Otherwise, everything left fits on the current line.

            else:
                write(text[i:])
                self.char_offset += remaining
                remaining = 0

    def end_line(self):

        "End the current content line if anything has been written on it."

        if self.char_offset > 0:
            self.char_offset = 0
            self._write("\r\n")
1.506 +
class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the stream writer with the given 'f' stream object, which
        must provide 'write' and 'end_line' methods.
        """

        self.f = f

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written using only their names.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)

            # Parameters without values have no "=" part.

            if param_value is not None:
                f.write("=")
                f.write(param_value)

        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = quopri.encodestring(value.encode(charset or "iso-8859-1"))
        elif encoding == "BASE64":
            value = base64.encodestring(value)

        return self.encode_content(value)

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification,
        returning a new dictionary. Values of None, indicating parameters
        without values, are preserved.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting newline characters."

        return value.replace("\n", "\\n")
1.598 +
1.599 +# Utility functions.
1.600 +
def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' provides a 'read' attribute."

    return hasattr(stream_or_string, "read")
1.603 +
def get_input_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' itself if it is an input stream. Otherwise, treat
    it as a filename and open the file for reading using the given 'encoding'
    or the default encoding.
    """

    if not is_input_stream(stream_or_string):
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

    return stream_or_string
1.609 +
def get_output_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' itself if it provides a 'write' attribute.
    Otherwise, treat it as a filename and open the file for writing using the
    given 'encoding' or the default encoding.
    """

    if not hasattr(stream_or_string, "write"):
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

    return stream_or_string
1.615 +
1.616 +# Public functions.
1.617 +
def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource identified by 'stream_or_string' and return its root
    node. The resource is either a stream providing Unicode data (the codecs
    module can be used to open files or to wrap streams in order to provide
    Unicode data) or the name of a file to be parsed.

    The optional 'encoding' specifies the character encoding of a file opened
    by name.

    The optional 'non_standard_newline' can be set to a true value (it is false
    by default) in order to attempt to process files using CR as the end of
    line character.

    The optional 'parser_cls' is instantiated without arguments to do the
    parsing, with Parser being used if it is omitted.
    """

    source = Reader(get_input_stream(stream_or_string, encoding), non_standard_newline)

    try:
        return (parser_cls or Parser)().parse(source)

    # Streams supplied by the caller are left open: only a file opened here
    # from a filename is closed.

    finally:
        if not is_input_stream(stream_or_string):
            source.close()
1.651 +
def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Return an iterator over the resource identified by 'stream_or_string',
    providing event tuples of the form (name, parameters, value). The resource
    is either a stream providing Unicode data (the codecs module can be used to
    open files or to wrap streams in order to provide Unicode data) or the name
    of a file to be parsed.

    The optional 'encoding' specifies the character encoding of a file opened
    by name.

    The optional 'non_standard_newline' can be set to a true value (it is false
    by default) in order to attempt to process files using CR as the end of
    line character.

    The optional 'parser_cls' is instantiated with the reader to produce the
    iterator, with StreamParser being used if it is omitted.
    """

    reader = Reader(get_input_stream(stream_or_string, encoding), non_standard_newline)
    parser_factory = parser_cls or StreamParser
    return parser_factory(reader)
1.675 +
def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer which will either send data to the resource found through
    the use of 'stream_or_string' or using the given 'write' operation.

    The 'stream_or_string' parameter may be either a stream accepting Unicode
    data (the codecs module can be used to open files or to wrap streams in
    order to accept Unicode data) or a filename identifying a file to be
    written.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be written.

    The optional 'line_length' can be used to specify how long lines should be
    in the resulting data.

    The optional 'writer_cls' is instantiated with the line-folding Writer to
    produce the result, with StreamWriter being used if it is omitted.

    IOError is raised if neither 'stream_or_string' nor 'write' is given.

    Note that only the write method of the output stream is passed on: a file
    opened here from a filename is not closed by this function.
    """

    if stream_or_string:
        stream = get_output_stream(stream_or_string, encoding)
        _writer = Writer(stream.write, line_length)
    elif write:
        _writer = Writer(write, line_length)
    else:
        raise IOError("No stream, filename or write operation specified.")

    return (writer_cls or StreamWriter)(_writer)
1.703 +
1.704 +# vim: tabstop=4 expandtab shiftwidth=4