vContent (annotate vContent.py in 97d39ea15ccf)

vContent

Annotated vContent.py

1:97d39ea15ccf

2008-10-17

Paul Boddie

Fixed value decoding, performing character substitutions before explicitly specified decoding operations.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Parsing of vCard, vCalendar and iCalendar files.
paul@0	5
paul@0	6	Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU Lesser General Public License as published by the Free
paul@0	10	Software Foundation; either version 3 of the License, or (at your option) any
paul@0	11	later version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@0	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0	15	FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
paul@0	16	details.
paul@0	17
paul@0	18	You should have received a copy of the GNU Lesser General Public License along
paul@0	19	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	20
paul@0	21	--------
paul@0	22
paul@0	23	References:
paul@0	24
paul@0	25	RFC 2445: Internet Calendaring and Scheduling Core Object Specification
paul@0	26	(iCalendar)
paul@0	27	http://rfc.net/rfc2445.html
paul@0	28
paul@0	29	RFC 2425: A MIME Content-Type for Directory Information
paul@0	30	http://rfc.net/rfc2425.html
paul@0	31
paul@0	32	RFC 2426: vCard MIME Directory Profile
paul@0	33	http://rfc.net/rfc2426.html
paul@0	34	"""
paul@0	35
paul@0	36	# Encoding-related imports.
paul@0	37
paul@0	38	import base64, quopri
paul@0	39
paul@0	40	# Simple reader class.
paul@0	41
class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back lines, held as a stack: the last line pushed is the next
        # line read.

        self.lines = []
        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file. The line
        number is decremented to compensate for the earlier read.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned at the end of the file.

        In the standard mode, a line ending with a lone CR is considered
        broken and is joined with the text following it. In the non-standard
        newline mode, such a line is considered complete and is returned with
        a newline appended.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Stop joining at the end of the file: otherwise, a stream ending with
        # a lone CR would never stop yielding empty strings and this loop
        # would never terminate.

        while line.endswith("\r"):
            continuation = self.f.readline()
            if not continuation:
                break
            line += continuation

        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen. Return the
        string from the current position up to the target found, along with the
        target string, using a tuple of the form (string, target). If no target
        was found, return the entire string together with a target of None.

        Any text read beyond the target is pushed back, line by line, so that
        it is available to subsequent read operations.
        """

        # Map positions in the accumulated text to the targets found there.

        found = {}

        # Remember the entire text read and the offset of the current line in
        # that text.

        chunks = []
        offset = 0

        line = self.readline()
        chunks.append(line)

        while not found and line != "":
            for target in targets:
                index = line.find(target)

                # Always choose the first matching target at a position.

                if index != -1 and (offset + index) not in found:
                    found[offset + index] = target

            # One more line is always read here, even after a match; it is
            # pushed back below along with the rest of the matching line.

            offset += len(line)
            line = self.readline()
            chunks.append(line)

        text = "".join(chunks)

        if not found:
            return text, None

        first = min(found)
        target = found[first]

        # Skip the target.
        # Since the end of the buffer should always be a newline, ignore the
        # last (empty) element produced by the split.

        remainder = text[first + len(target):].split("\n")
        if not remainder[-1]:
            del remainder[-1]

        # Push back in reverse so that the earliest line is read first.

        for pending in reversed(remainder):
            self.pushback(pending + "\n")

        return text[:first], target
paul@0	143
class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'read_until', 'readline' and 'pushback' methods together with a
        'line_number' attribute.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when no more
        content is available.
        """

        return self.parse_content_line()

    # Support the iterator protocol under its Python 3 name as well.

    __next__ = next

    def parse_content_line(self):

        """
        Return the name, parameters and decoded value for the current content
        line in the file being parsed. StopIteration is raised at the end of
        the content, and ValueError (carrying the line number) is raised if
        the line has no value separator.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until([";", ":"])
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(["=", ";", ":"])
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until([";", ":"])
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter; those without a value are stored as None.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in [" ", "\t"]:
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        f.pushback(line)

        # Decode the value.

        value = self.decode("".join(value_lines), parameters)

        return name, parameters, value

    def decode(self, value, parameters):

        """
        Decode the 'value' using the given 'parameters'. Quoted characters are
        always replaced first; then, if the ENCODING parameter is
        QUOTED-PRINTABLE, the value is decoded to text using the CHARSET
        parameter (or iso-8859-1 if absent), and if it is BASE64, the decoded
        binary data is returned. Otherwise, the value is returned as text.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        # NOTE: Introducing newline conversions.
        # Replace quoted characters (see 4.3.11 in RFC 2445).

        value = self._unescape(value.replace("\r", ""))

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":
            return base64.b64decode(value)
        else:
            return value

    def _unescape(self, value):

        """
        Return the 'value' with the quoted characters defined in section
        4.3.11 of RFC 2445 replaced: \\\\ becomes a backslash, \\N and \\n
        become a newline, \\, becomes a comma and \\; becomes a semicolon.
        """

        # Split at escaped backslashes first so that a backslash produced by
        # an escape can never combine with the character following it to form
        # another escape sequence.

        decoded = []
        for segment in value.split("\\\\"):
            segment = segment.replace("\\N", "\n").replace("\\n", "\n")
            segment = segment.replace("\\,", ",").replace("\\;", ";")
            decoded.append(segment)

        return "\\".join(decoded)
paul@0	248
class ParseError(Exception):

    "An error indicating that the structure of the parsed content is invalid."

class Parser:

    "A parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        # The stack of open elements (also known as components), each being a
        # tuple of the form (value, parameters, children).

        self.elements = []

        # The top-level content, and the list to which new content is added.

        self.document = []
        self.current = self.document

    def parse(self, f):

        """
        Parse the contents of the file 'f', which must be an object suitable
        for use with StreamParser and which provides a 'line_number' attribute.

        Return the document: a list of items where each element opened using
        BEGIN is a tuple of the form (value, parameters, children) and each
        other content line is a tuple of the form (name, parameters, value).

        ParseError is raised if an END declaration does not match the most
        recently opened BEGIN declaration, or if no element is open.
        """

        parser = StreamParser(f)

        for name, parameters, value in parser:

            # Add new elements/components to the current position in the
            # document, recording the element as the active element.

            if name == "BEGIN":
                children = []
                element = (value, parameters, children)
                self.elements.append(element)
                self.current.append(element)
                self.current = children

            # End elements by removing them from the active element stack and
            # making the next element's children the current position for new
            # content.

            elif name == "END":
                if not self.elements:
                    raise ParseError(
                        "END declaration (%r) without a matching BEGIN at line %d." % (
                        value, f.line_number))

                start_value, start_parameters, children = self.elements.pop()

                if start_value != value:
                    raise ParseError(
                        "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_value, value, f.line_number))

                if self.elements:
                    parent_value, parent_parameters, children = self.elements[-1]
                    self.current = children
                else:
                    self.current = self.document

            # Add all other content at the current position.

            else:
                self.current.append((name, parameters, value))

        return self.document
paul@0	299
paul@0	300	# Public functions.
paul@0	301
def parse(f, non_standard_newline=0):

    """
    Parse the resource data available from the file object 'f', which should
    provide Unicode data. (The codecs module can be used to open files or to
    wrap streams in order to provide Unicode data.)

    If the optional 'non_standard_newline' is set to a true value (unlike the
    default), an attempt is made to process files using CR as the end of line
    character.

    The document produced by parsing the resource is returned.
    """

    return Parser().parse(Reader(f, non_standard_newline=non_standard_newline))
paul@0	321
paul@0	322	# vim: tabstop=4 expandtab shiftwidth=4