vContent (annotate vContent.py in f988b4dd5531)

vContent

Annotated vContent.py

4:f988b4dd5531

2008-10-18

Paul Boddie

Changed the Reader class to more properly handle quoting, updating the StreamParser class to use the regular expressions provided. Fixed the handleProperty docstring. Added a vCalendar module which has specialised knowledge about that format. Added test programs for the Reader and for each module's parse function.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Parsing of vCard, vCalendar and iCalendar files.
paul@0	5
paul@0	6	Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU Lesser General Public License as published by the Free
paul@0	10	Software Foundation; either version 3 of the License, or (at your option) any
paul@0	11	later version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@0	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0	15	FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
paul@0	16	details.
paul@0	17
paul@0	18	You should have received a copy of the GNU Lesser General Public License along
paul@0	19	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	20
paul@0	21	--------
paul@0	22
paul@0	23	References:
paul@0	24
paul@0	25	RFC 2445: Internet Calendaring and Scheduling Core Object Specification
paul@0	26	(iCalendar)
paul@0	27	http://rfc.net/rfc2445.html
paul@0	28
paul@0	29	RFC 2425: A MIME Content-Type for Directory Information
paul@0	30	http://rfc.net/rfc2425.html
paul@0	31
paul@0	32	RFC 2426: vCard MIME Directory Profile
paul@0	33	http://rfc.net/rfc2426.html
paul@0	34	"""
paul@0	35
paul@4	36	try:
paul@4	37	set
paul@4	38	except NameError:
paul@4	39	from sets import Set as set
paul@4	40
paul@0	41	# Encoding-related imports.
paul@0	42
paul@0	43	import base64, quopri
paul@0	44
paul@4	45	# Tokenisation help.
paul@4	46
paul@4	47	import re
paul@4	48
paul@0	49	# Simple reader class.
paul@0	50
class Reader:

    """
    A simple class wrapping a file, providing simple pushback capabilities.

    Pushed-back lines are kept on a stack and are handed out again by
    'readline', most recently pushed first, before the underlying file is
    consulted. The 'line_number' attribute is incremented by each 'readline'
    call and decremented by each 'pushback' call.
    """

    # Characters which end a property name or a parameter value. The double
    # quote is included so that 'read_until' can track quoted regions.

    SEPARATORS = re.compile('[;:"]')

    # As above, plus the "=" separating a parameter name from its value.

    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        Unless 'non_standard_newline' was set, a line ending with a bare CR is
        considered broken and is joined with whatever follows it in the file.
        With 'non_standard_newline' set, such a line is returned with LF
        appended. An empty string is returned at the end of the file.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).

        line = self.f.readline()

        while line.endswith("\r") and not self.non_standard_newline:
            continuation = self.f.readline()

            # At the end of the file nothing more can complete the line; stop
            # here, since the line would otherwise end with CR forever.

            if not continuation:
                break
            line += continuation

        if line.endswith("\r") and self.non_standard_newline:
            return line + "\n"
        else:
            return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen, where
        'targets' is a compiled regular expression such as SEPARATORS. Return
        the string from the current position up to the target found, along
        with the target string, using a tuple of the form (string, target). If
        no target was found, return the entire string together with a target
        of None.

        Targets appearing between a pair of double quotes are ignored; the
        quotes themselves remain part of the returned string. The text
        following the target on its line is pushed back for later reading.
        """

        # Remember the entire text read and the position in the current line
        # from which searching should continue.

        lines = []

        line = self.readline()
        lines.append(line)
        start = 0

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process each line, looking for the targets.

        while line != "":
            match = targets.search(line, start)

            # Where nothing matches, get the next line.

            if match is None:
                line = self.readline()
                lines.append(line)
                start = 0

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                start = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                start = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            text = "".join(lines)
            return text, None

        # Push back the text after the target.

        after_target = lines[-1][first_pos + len(first):]
        self.pushback(after_target)

        # Produce the lines until the matching line, together with the portion
        # of the matching line before the target.

        lines[-1] = lines[-1][:first_pos]
        text = "".join(lines)
        return text, first
paul@0	172
class StreamParser:

    """
    A stream parser for content in vCard/vCalendar/iCalendar-like formats.

    The parser is its own iterator, producing one (name, parameters, value)
    tuple for each content line read from the underlying reader.
    """

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'read_until', 'readline' and 'pushback' methods together with the
        'SEPARATORS', 'SEPARATORS_PLUS_EQUALS' and 'line_number' attributes.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when no more
        content is available.
        """

        return self.parse_content_line()

    # Support the iterator protocol under its newer method name as well.

    __next__ = next

    def parse_content_line(self):

        """
        Return the name, parameters and a list containing value information for
        the current content line in the file being parsed.

        The parameters are returned as a dictionary mapping each parameter name
        to its value, or to None where no value was given. StopIteration is
        raised at the end of the input. ValueError, carrying the reader's
        current line number, is raised where the name and parameters are not
        followed by a colon.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until(f.SEPARATORS)

        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(f.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until(f.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value under the parameter name.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in (" ", "\t"):
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        f.pushback(line)

        # Decode the value.

        value = self.decode("".join(value_lines), parameters)

        return name, parameters, value

    def decode(self, value, parameters):

        """
        Decode the 'value' using the given 'parameters'.

        Where the ENCODING parameter is "QUOTED-PRINTABLE", the decoded text is
        returned as Unicode, interpreted using the CHARSET parameter or
        "iso-8859-1" if no such parameter is given. Where it is "BASE64", the
        decoded binary data is returned. Otherwise, the value is returned with
        only the quoted characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        # NOTE: Introducing newline conversions.
        # Replace quoted characters (see 4.3.11 in RFC 2445).

        value = value.replace("\r", "")
        value = value.replace("\\N", "\n").replace("\\n", "\n")
        value = value.replace("\\,", ",").replace("\\;", ";")

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":
            return base64.b64decode(value)
        else:
            return value
paul@0	277
class ParseError(Exception):

    "An error raised where BEGIN and END declarations do not match."


class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.

    Subclasses are expected to provide the 'startComponent', 'endComponent'
    and 'handleProperty' methods invoked by the 'parse' method.
    """

    def __init__(self):

        "Initialise the parser."

        # The names given in BEGIN declarations which have not yet been
        # matched by END declarations, innermost last.

        self.names = []

    def parse(self, f):

        """
        Parse the contents of the file 'f', which is wrapped in a StreamParser
        and must therefore provide the interface of the Reader class.

        For each BEGIN content line, 'startComponent' is called with the value
        and parameters; for each END content line, 'endComponent' is called
        with the value; for every other content line, 'handleProperty' is
        called with the name, parameters and value.

        ParseError is raised where the value of an END declaration differs from
        that of the most recent unmatched BEGIN declaration. An END declaration
        with no open BEGIN declaration causes IndexError to be raised by the
        empty list of names.
        """

        parser = StreamParser(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError(
                        "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)
paul@2	310
class Parser(ParserBase):

    """
    A SAX-like parser for vCard/vCalendar/iCalendar-like formats.

    Components are represented by whatever 'makeComponent' produces, which is
    a (name, parameters, value) tuple in this class. The value of a component
    opened by a BEGIN declaration is the list of its children.
    """

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Open a component having the given 'name' and 'parameters', giving it
        an initially empty list of children, and make it the active component.
        The new component is returned.
        """

        opened = self.handleProperty(name, parameters, [])
        self.components.append(opened)
        return opened

    def endComponent(self, name):

        """
        Close the component with the given 'name', returning the component at
        the top of the active component stack. The outermost component is
        returned but left on the stack so that it remains available as the
        result of parsing. None is returned where the stack is empty.
        """

        stack = self.components

        if not stack:
            return None

        # Keep the last remaining component in place.

        if len(stack) == 1:
            return stack[-1]

        return stack.pop()

    def handleProperty(self, name, parameters, value):

        """
        Make a component from the given 'name', 'parameters' and 'value',
        attach it to the currently active component (if any), and return it.
        """

        made = self.makeComponent(name, parameters, value)
        self.attachComponent(made)
        return made

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Add the given 'component' to the children of the component at the top
        of the active component stack. Nothing is done where no component is
        active.
        """

        if not self.components:
            return

        parent_name, parent_parameters, siblings = self.components[-1]
        siblings.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Return a component object, here a tuple, holding the given 'name',
        'parameters' and 'value'.
        """

        return name, parameters, value

    # Public methods.

    def parse(self, f):

        """
        Parse the contents of the file 'f' and return the first component
        recorded on the component stack.
        """

        ParserBase.parse(self, f)
        root = self.components[0]
        return root
paul@0	379
paul@0	380	# Public functions.
paul@0	381
def parse(f, non_standard_newline=0):

    """
    Parse the resource data found through the use of the file object 'f', which
    should provide Unicode data. (The codecs module can be used to open files
    or to wrap streams in order to provide Unicode data.)

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The file object is wrapped in a Reader and handed to a new Parser; the
    result of that parser's 'parse' method, being the root node of the
    imported resource, is returned.
    """

    wrapped = Reader(f, non_standard_newline=non_standard_newline)
    return Parser().parse(wrapped)
paul@0	401
paul@0	402	# vim: tabstop=4 expandtab shiftwidth=4