vContent (annotate vContent.py in 3fdf59812622)

vContent

Annotated vContent.py

5:3fdf59812622

2008-10-23

Paul Boddie

Added a vCalendarStreamParser class which decodes content, reducing the vCalendarParser class to something which only assembles the content. Fixed the decode_parameters method to actually return the decoded parameters. Added test files and new tests for stream parsing. Added iterparse functions and made the vCalendar.parse and vCalendar.iterparse functions use their vContent counterparts.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Parsing of vCard, vCalendar and iCalendar files.
paul@0	5
paul@0	6	Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU Lesser General Public License as published by the Free
paul@0	10	Software Foundation; either version 3 of the License, or (at your option) any
paul@0	11	later version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@0	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0	15	FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
paul@0	16	details.
paul@0	17
paul@0	18	You should have received a copy of the GNU Lesser General Public License along
paul@0	19	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	20
paul@0	21	--------
paul@0	22
paul@0	23	References:
paul@0	24
paul@0	25	RFC 2445: Internet Calendaring and Scheduling Core Object Specification
paul@0	26	(iCalendar)
paul@0	27	http://rfc.net/rfc2445.html
paul@0	28
paul@0	29	RFC 2425: A MIME Content-Type for Directory Information
paul@0	30	http://rfc.net/rfc2425.html
paul@0	31
paul@0	32	RFC 2426: vCard MIME Directory Profile
paul@0	33	http://rfc.net/rfc2426.html
paul@0	34	"""
paul@0	35
paul@4	36	try:
paul@4	37	set
paul@4	38	except NameError:
paul@4	39	from sets import Set as set
paul@4	40
paul@0	41	# Encoding-related imports.
paul@0	42
paul@0	43	import base64, quopri
paul@0	44
paul@4	45	# Tokenisation help.
paul@4	46
paul@4	47	import re
paul@4	48
paul@0	49	# Simple reader class.
paul@0	50
class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    # Characters ending a name or parameter value, plus the double quote which
    # delimits regions where the other characters have no special meaning.

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back text, used as a stack: the last item is read first.

        self.lines = []
        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file. The line
        number is decremented to compensate for the repeated read.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned when the underlying file is exhausted.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # With non-standard newlines, a trailing CR ends the line: complete it
        # with LF so that callers always see a conventional line ending.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Keep reading until the line no longer ends with a lone CR, but stop
        # at the end of the input: a file ending with CR would otherwise never
        # change the line and this loop would never terminate.

        while line.endswith("\r"):
            continuation = self.f.readline()
            if not continuation:
                break
            line += continuation

        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' (a compiled regular
        expression) is seen outside a double-quoted region. Return the string
        from the current position up to the target found, along with the
        target string, using a tuple of the form (string, target). The text
        following the target is pushed back for subsequent reading. If no
        target was found, return the entire string together with a target of
        None.
        """

        # Remember the entire text read and the search position in the current
        # line.

        line = self.readline()
        lines = [line]
        start = 0

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process each line, looking for the targets.

        while line != "":
            match = targets.search(line, start)

            # Where nothing matches, get the next line.

            if match is None:
                line = self.readline()
                lines.append(line)
                start = 0

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                start = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                start = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            text = "".join(lines)
            return text, None

        # Push back the text after the target.

        after_target = lines[-1][first_pos + len(first):]
        self.pushback(after_target)

        # Produce the lines until the matching line, together with the portion
        # of the matching line before the target.

        lines[-1] = lines[-1][:first_pos]
        text = "".join(lines)
        return text, first
paul@0	172
class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    # Quoted characters (see 4.3.11 in RFC 2445) and their decoded forms. The
    # escaped backslash must be recognised in the same pass as the others so
    # that "\\n" (an escaped backslash followed by "n") is not mistaken for a
    # newline.

    ESCAPED_CHAR = re.compile(r"\\([\\;,Nn])")
    ESCAPED_VALUES = {"\\" : "\\", ";" : ";", "," : ",", "N" : "\n", "n" : "\n"}

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'read_until', 'readline' and 'pushback' methods together with the
        'SEPARATORS', 'SEPARATORS_PLUS_EQUALS' and 'line_number' attributes.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when the input is
        exhausted.
        """

        return self.parse_content_line()

    # Support the iterator protocol regardless of which method name is used.

    __next__ = next

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and the decoded value for the current
        content line in the file being parsed. Raise StopIteration where no
        more content is available, or ValueError (with the current line
        number) where the name and parameters are not followed by a colon.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until(f.SEPARATORS)

        name = name.strip()

        # Only whitespace remained before the end of the input.

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(f.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until(f.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value in the parameters dictionary.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in [" ", "\t"]:
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        f.pushback(line)

        # Decode the value.

        value = self.decode("".join(value_lines), parameters)

        return name, parameters, value

    def decode(self, value, parameters):

        """
        Decode the 'value' using the given 'parameters'. Carriage returns are
        removed and quoted characters are replaced before any "ENCODING"
        parameter is applied. For "QUOTED-PRINTABLE", the decoded text is
        converted to Unicode using the "CHARSET" parameter (or ISO-8859-1 in
        its absence); for "BASE64", the decoded byte string is returned;
        otherwise, the unquoted text is returned.
        """

        # Imported here since the module does not import binascii itself.

        import binascii

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        # NOTE: Introducing newline conversions.
        # Replace quoted characters (see 4.3.11 in RFC 2445).

        value = self.ESCAPED_CHAR.sub(self._unescape, value.replace("\r", ""))

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":
            return binascii.a2b_base64(value)
        else:
            return value

    def _unescape(self, match):

        "Return the character represented by the escape sequence in 'match'."

        return self.ESCAPED_VALUES[match.group(1)]
paul@0	279
class ParseError(Exception):

    "An error raised where the structure of the parsed content is invalid."

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.
    Subclasses must provide the 'startComponent', 'endComponent' and
    'handleProperty' methods invoked by the 'parse' method.
    """

    def __init__(self):

        "Initialise the parser."

        # The values of the BEGIN declarations which are still open.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', using an instance of the optional
        'parser_cls' (or StreamParser if omitted) to obtain (name, parameters,
        value) tuples from 'f'. Raise ParseError where an END declaration does
        not match the most recent open BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)
paul@2	312
class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Open a component having the given 'name' and 'parameters'. The
        component is created with an empty list of children as its value,
        attached to any currently active component, made the active component
        and returned.
        """

        opened = self.handleProperty(name, parameters, [])
        self.components.append(opened)
        return opened

    def endComponent(self, name):

        """
        Close the component with the given 'name'. Nested components are
        removed from the active component stack and returned; the outermost
        component is returned but left on the stack. Where no component is
        active, None is returned.
        """

        stack = self.components

        if not stack:
            return None

        # Keep the outermost component so that it remains available.

        if len(stack) == 1:
            return stack[-1]

        return stack.pop()

    def handleProperty(self, name, parameters, value):

        """
        Make an object for the property with the given 'name', 'parameters'
        and 'value', attach it to the currently active component (if any) and
        return it.
        """

        made = self.makeComponent(name, parameters, value)
        self.attachComponent(made)
        return made

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Add the given 'component' to the children of the most recently opened
        component. Nothing is done where no component is active.
        """

        if not self.components:
            return

        parent_name, parent_parameters, parent_children = self.components[-1]
        parent_children.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Return the object representing a component or property: a tuple of the
        form (name, parameters, value).
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', optionally using 'parser_cls' to
        read the content, and return the first component on the component
        stack.
        """

        ParserBase.parse(self, f, parser_cls)
        root = self.components[0]
        return root
paul@0	381
paul@0	382	# Public functions.
paul@0	383
def parse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data available from the file object 'f', which should
    provide Unicode data. (The codecs module can be used to open files or to
    wrap streams in order to provide Unicode data.)

    A true value for the optional 'non_standard_newline' (false by default)
    requests that lines ending with CR alone be treated as complete lines.

    The optional 'parser_cls' is instantiated without arguments to perform the
    parsing; Parser is used where it is omitted.

    The result of the parser's 'parse' method - the root node of the imported
    resource - is returned.
    """

    source = Reader(f, non_standard_newline)

    if not parser_cls:
        parser_cls = Parser

    return parser_cls().parse(source)
paul@0	402
def iterparse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data available from the file object 'f', which should
    provide Unicode data. (The codecs module can be used to open files or to
    wrap streams in order to provide Unicode data.)

    A true value for the optional 'non_standard_newline' (false by default)
    requests that lines ending with CR alone be treated as complete lines.

    The optional 'parser_cls' is instantiated with the reader wrapping 'f';
    StreamParser is used where it is omitted.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value).
    """

    source = Reader(f, non_standard_newline)

    if parser_cls:
        events = parser_cls(source)
    else:
        events = StreamParser(source)

    return iter(events)
paul@5	421
paul@0	422	# vim: tabstop=4 expandtab shiftwidth=4