vContent (annotate vContent.py in b5f2be07e2f8)

vContent

Annotated vContent.py

2:b5f2be07e2f8

2008-10-17

Paul Boddie

Separated parser functionality out into two distinct classes, adopting SAX-like conventions.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Parsing of vCard, vCalendar and iCalendar files.
paul@0	5
paul@0	6	Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU Lesser General Public License as published by the Free
paul@0	10	Software Foundation; either version 3 of the License, or (at your option) any
paul@0	11	later version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@0	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0	15	FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
paul@0	16	details.
paul@0	17
paul@0	18	You should have received a copy of the GNU Lesser General Public License along
paul@0	19	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	20
paul@0	21	--------
paul@0	22
paul@0	23	References:
paul@0	24
paul@0	25	RFC 2445: Internet Calendaring and Scheduling Core Object Specification
paul@0	26	(iCalendar)
paul@0	27	http://rfc.net/rfc2445.html
paul@0	28
paul@0	29	RFC 2425: A MIME Content-Type for Directory Information
paul@0	30	http://rfc.net/rfc2425.html
paul@0	31
paul@0	32	RFC 2426: vCard MIME Directory Profile
paul@0	33	http://rfc.net/rfc2426.html
paul@0	34	"""
paul@0	35
paul@0	36	# Encoding-related imports.
paul@0	37
paul@0	38	import base64, quopri
paul@0	39
paul@0	40	# Simple reader class.
paul@0	41
class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back lines, used as a stack: the most recently pushed line is
        # the next one returned by readline.

        self.lines = []

        # Count of lines handed out by readline, less those pushed back.

        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        Return the line read, or an empty string when the underlying file is
        exhausted and nothing has been pushed back.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # With non-standard newlines, a trailing CR terminates the line and is
        # completed with LF so that callers always see a newline.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Join such fragments with the text that follows, but stop at the end
        # of the file: otherwise a file ending with a lone CR would cause this
        # loop to spin forever on empty reads.

        while line.endswith("\r"):
            continuation = self.f.readline()
            if not continuation:
                break
            line += continuation

        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen. Return the
        string from the current position up to the target found, along with the
        target string, using a tuple of the form (string, target). If no target
        was found, return the entire string together with a target of None.

        The text following the target is pushed back line by line, each line
        being given a trailing newline, so that it is available to subsequent
        reads.
        """

        # Map positions in the accumulated text to the targets found there.

        found = {}

        # Remember the entire text read and the offset of the current line in
        # that text.

        chunks = []
        offset = 0

        line = self.readline()
        chunks.append(line)

        while not found and line != "":
            for target in targets:
                index = line.find(target)

                # Always choose the first matching target.

                if index != -1 and (offset + index) not in found:
                    found[offset + index] = target

            # One further line is always read here, even after a match; it is
            # pushed back below along with the rest of the matching line.

            offset += len(line)
            line = self.readline()
            chunks.append(line)

        text = "".join(chunks)

        if not found:
            return text, None

        min_index = min(found)
        target = found[min_index]

        # Skip the target.
        # Since the end of the buffer should always be a newline, ignore the
        # last (empty) element produced by splitting.

        remainder = text[min_index + len(target):].split("\n")
        if not remainder[-1]:
            del remainder[-1]

        # Push the last line first so that the first line is read next.

        for pending in reversed(remainder):
            self.pushback(pending + "\n")

        return text[:min_index], target
paul@0	143
class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f'. The parser calls the
        read_until, readline and pushback methods of 'f' and reads its
        line_number attribute when reporting errors.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values).
        """

        return self.parse_content_line()

    def __next__(self):

        "Provide the Python 3 spelling of the iterator method."

        return self.next()

    def parse_content_line(self):

        """
        Return the name, parameters and the decoded value for the current
        content line in the file being parsed.

        Raise StopIteration when no further content is available, and
        ValueError (carrying the reader's line number) when a content line has
        no ":" separating its name and parameters from its value.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until([";", ":"])

        name = name.strip()

        # Nothing but whitespace before the end of the input: no more content.

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(["=", ";", ":"])
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until([";", ":"])
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value against the name in the parameters dictionary.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in [" ", "\t"]:
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into the
        # file.

        f.pushback(line)

        # Decode the value.

        value = self.decode("".join(value_lines), parameters)

        return name, parameters, value

    def decode(self, value, parameters):

        """
        Decode the 'value' using the given 'parameters'.

        The ENCODING parameter selects quoted-printable or base64 decoding;
        for quoted-printable content, the CHARSET parameter selects the
        character set, with iso-8859-1 used if it is absent. Values without a
        recognised encoding are returned with only their escapes replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        # NOTE: Introducing newline conversions.
        # Replace quoted characters (see 4.3.11 in RFC 2445).
        # Escaped backslashes are split out first so that the character after
        # one (as in a backslash pair followed by "n") is not mistaken for
        # part of another escape sequence; each pair is then restored as a
        # single backslash.

        value = value.replace("\r", "")
        value = "\\".join([
            piece.replace("\\N", "\n").replace("\\n", "\n").replace("\\,", ",").replace("\\;", ";")
            for piece in value.split("\\\\")
            ])

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":

            # base64.decodestring was a thin wrapper around this function and
            # is no longer available in recent Python versions.

            import binascii
            return binascii.a2b_base64(value)
        else:
            return value
paul@0	248
class ParseError(Exception):

    "An error raised when BEGIN and END declarations are not properly paired."

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.

    Subclasses are expected to provide the startElement(name, parameters),
    endElement(name) and handleComponent(name, parameters, value) methods
    called by the parse method.
    """

    def __init__(self):

        "Initialise the parser."

        # Names given by the BEGIN declarations that are still open.

        self.names = []

    def parse(self, f):

        """
        Parse the contents of the file 'f', calling startElement for each BEGIN
        declaration, endElement for each END declaration, and handleComponent
        for every other content line.

        Raise ParseError if an END declaration does not name the most recently
        opened BEGIN declaration, or if there is no open declaration to end.
        """

        # Start afresh so that declarations left open by an earlier parse
        # cannot be matched by this one.

        self.names = []

        parser = StreamParser(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startElement(value, parameters)

            elif name == "END":
                if not self.names:
                    raise ParseError("END declaration (%r) without a matching BEGIN at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endElement(value)

            else:
                self.handleComponent(name, parameters, value)
paul@2	281
class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active elements."

        ParserBase.__init__(self)
        self.elements = [] # also known as components

    def startElement(self, name, parameters):

        """
        Add the element/component with the given 'name' and 'parameters',
        recording an empty list of children as part of the element's content.
        The new element is attached to the current element, if any, and then
        becomes the current element itself. Return the new element.
        """

        element = self.handleComponent(name, parameters, [])
        self.elements.append(element)
        return element

    def endElement(self, name):

        """
        End the element with the given 'name' by removing it from the active
        element stack, returning the element concerned. The outermost element
        is never removed so that it remains available as the result of
        parsing. The 'name' itself is not consulted.
        """

        if len(self.elements) > 1:
            return self.elements.pop()
        elif self.elements:
            return self.elements[-1]

    def handleComponent(self, name, parameters, value):

        """
        Record the component with the given 'name', 'parameters' and 'value' as
        part of the current element's children. Return the component made.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Attach the given 'component' to its parent, this being the element at
        the top of the active element stack. Without any active element, the
        component is not attached to anything.
        """

        if self.elements:
            element_name, element_parameters, element_children = self.elements[-1]
            element_children.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Make a component object from the given 'name', 'parameters' and 'value'.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f):

        """
        Parse the contents of the file 'f', returning the outermost element as
        a (name, parameters, children) tuple, or None if the file contained no
        elements at all.
        """

        # Discard elements from any earlier use of this parser: otherwise the
        # root of the earlier document would be returned again, with the new
        # content attached to it.

        self.elements = []

        ParserBase.parse(self, f)

        # Content without any BEGIN declaration yields no root element.

        if not self.elements:
            return None

        return self.elements[0]
paul@0	350
paul@0	351	# Public functions.
paul@0	352
def parse(f, non_standard_newline=0):

    """
    Parse the resource data found through the use of the file object 'f', which
    should provide Unicode data. (The codecs module can be used to open files
    or to wrap streams in order to provide Unicode data.)

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The file object is wrapped in a Reader and handed to a new Parser; the
    result of that parser's parse method, being the root node of the resource,
    is returned.
    """

    return Parser().parse(Reader(f, non_standard_newline=non_standard_newline))
paul@0	372
paul@0	373	# vim: tabstop=4 expandtab shiftwidth=4