vContent (annotate vContent.py in 18ef1a1eab60)

vContent

Annotated vContent.py

9:18ef1a1eab60

2008-11-03

Paul Boddie

Renamed the StreamWriter.write method to write_content_line. Added support for skipping blank lines when reading content. Added support for opening streams using filenames in the convenience methods, introducing close methods on certain classes in order to support the proper closure of streams after use.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Parsing of vCard, vCalendar and iCalendar files.
paul@0	5
paul@0	6	Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU Lesser General Public License as published by the Free
paul@0	10	Software Foundation; either version 3 of the License, or (at your option) any
paul@0	11	later version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@0	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0	15	FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
paul@0	16	details.
paul@0	17
paul@0	18	You should have received a copy of the GNU Lesser General Public License along
paul@0	19	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	20
paul@0	21	--------
paul@0	22
paul@0	23	References:
paul@0	24
paul@0	25	RFC 2445: Internet Calendaring and Scheduling Core Object Specification
paul@0	26	(iCalendar)
paul@0	27	http://rfc.net/rfc2445.html
paul@0	28
paul@0	29	RFC 2425: A MIME Content-Type for Directory Information
paul@0	30	http://rfc.net/rfc2425.html
paul@0	31
paul@0	32	RFC 2426: vCard MIME Directory Profile
paul@0	33	http://rfc.net/rfc2426.html
paul@0	34	"""
paul@0	35
paul@4	36	try:
paul@4	37	set
paul@4	38	except NameError:
paul@4	39	from sets import Set as set
paul@4	40
paul@0	41	# Encoding-related imports.
paul@0	42
paul@0	43	import base64, quopri
paul@9	44	import codecs
paul@0	45
paul@4	46	# Tokenisation help.
paul@4	47
paul@4	48	import re
paul@4	49
paul@9	50	# Configuration.
paul@9	51
paul@9	52	default_encoding = "utf-8"
paul@9	53
paul@7	54	# Reader and parser classes.
paul@0	55
class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []         # pushed-back lines, most recent last
        self.line_number = 1    # about to read line 1

    def close(self):

        "Close the reader by closing the underlying file."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file. The line
        number is decremented to compensate for the earlier read.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        In the default mode, a line ending with a bare CR is joined with the
        text following it until a proper line ending (or the end of the file)
        is found. In 'non_standard_newline' mode, a line ending with CR is
        returned with LF appended. An empty string is returned at the end of
        the file.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # The file may end immediately after a CR: without the end-of-file
        # test, the empty string returned by the file would never change the
        # line ending and this loop would not terminate.

        while line.endswith("\r"):
            more = self.f.readline()
            if not more:
                break
            line += more

        return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text. Blank lines preceding the content are skipped.
        An empty string is returned when no more content is available.
        """

        # Skip blank lines.

        line = self.readline()
        while line and not line.rstrip("\r\n"):
            line = self.readline()

        if not line:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line.rstrip("\r\n")]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file. Nothing is pushed back at the end of the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())
paul@8	152
class ContentLine:

    "A content line which can be searched from a moving start position."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with 'text', positioned at its start."

        self.text = text
        self.start = 0

    def get_remaining(self):

        "Return the text which has not yet been consumed by searching."

        return self.text[self.start:]

    def search(self, targets):

        """
        Look for one of the 'targets' (a compiled regular expression) from the
        current position, ignoring targets found inside double-quoted regions.
        Return a tuple of the form (string, target) where the string is the
        text from the current position up to the target. The position is then
        moved beyond the target.

        If no target was found, return the rest of the text together with a
        target of None, moving the position to the end of the text.
        """

        text = self.text
        origin = self.start
        end = len(text)

        cursor = origin
        quoted = False

        while cursor < end:
            found = targets.search(text, cursor)

            # Nothing more can match: give up and report the remainder.

            if found is None:
                break

            # A double quote opens or closes a quoted region.

            if found.group() == '"':
                quoted = not quoted

            # Any other target outside a quoted region ends the search.

            elif not quoted:
                self.start = found.end()
                return text[origin:found.start()], found.group()

            # Quotes and quoted targets are stepped over.

            cursor = found.end()

        self.start = end
        return text[origin:], None
paul@0	225
paul@0	226	class StreamParser:
paul@0	227
paul@0	228	"A stream parser for content in vCard/vCalendar/iCalendar-like formats."
paul@0	229
paul@0	230	def __init__(self, f):
paul@0	231
paul@0	232	"Initialise the parser for the given file 'f'."
paul@0	233
paul@0	234	self.f = f
paul@0	235
paul@9	236	def close(self):
paul@9	237
paul@9	238	"Close the reader."
paul@9	239
paul@9	240	self.f.close()
paul@9	241
paul@0	242	def __iter__(self):
paul@0	243
paul@0	244	"Return self as the iterator."
paul@0	245
paul@0	246	return self
paul@0	247
paul@0	248	def next(self):
paul@0	249
paul@0	250	"""
paul@0	251	Return the next content item in the file as a tuple of the form
paul@0	252	(name, parameters, values).
paul@0	253	"""
paul@0	254
paul@0	255	return self.parse_content_line()
paul@0	256
paul@7	257	def decode_content(self, value):
paul@7	258
paul@7	259	"Decode the given 'value', replacing quoted characters."
paul@7	260
paul@7	261	return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")
paul@7	262
paul@5	263	# Internal methods.
paul@5	264
paul@0	265	def parse_content_line(self):
paul@0	266
paul@0	267	"""
paul@7	268	Return the name, parameters and value information for the current
paul@7	269	content line in the file being parsed.
paul@0	270	"""
paul@0	271
paul@0	272	f = self.f
paul@8	273	line_number = f.line_number
paul@8	274	line = f.get_content_line()
paul@0	275
paul@8	276	# Read the property name.
paul@0	277
paul@8	278	name, sep = line.search(line.SEPARATORS)
paul@0	279	name = name.strip()
paul@0	280
paul@0	281	if not name and sep is None:
paul@0	282	raise StopIteration
paul@0	283
paul@8	284	# Read the parameters.
paul@8	285
paul@8	286	parameters = {}
paul@8	287
paul@0	288	while sep == ";":
paul@0	289
paul@0	290	# Find the actual modifier.
paul@0	291
paul@8	292	parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
paul@0	293	parameter_name = parameter_name.strip()
paul@0	294
paul@0	295	if sep == "=":
paul@8	296	parameter_value, sep = line.search(line.SEPARATORS)
paul@0	297	parameter_value = parameter_value.strip()
paul@0	298	else:
paul@0	299	parameter_value = None
paul@0	300
paul@0	301	# Append a key, value tuple to the parameters list.
paul@0	302
paul@0	303	parameters[parameter_name] = parameter_value
paul@0	304
paul@0	305	# Get the value content.
paul@0	306
paul@0	307	if sep != ":":
paul@8	308	raise ValueError, line_number
paul@0	309
paul@8	310	# Obtain and decode the value.
paul@0	311
paul@8	312	value = self.decode(name, parameters, line.get_remaining())
paul@0	313
paul@0	314	return name, parameters, value
paul@0	315
paul@7	316	def decode(self, name, parameters, value):
paul@1	317
paul@7	318	"Decode using 'name' and 'parameters' the given 'value'."
paul@0	319
paul@1	320	encoding = parameters.get("ENCODING")
paul@1	321	charset = parameters.get("CHARSET")
paul@0	322
paul@7	323	value = self.decode_content(value)
paul@0	324
paul@0	325	if encoding == "QUOTED-PRINTABLE":
paul@1	326	return unicode(quopri.decodestring(value), charset or "iso-8859-1")
paul@0	327	elif encoding == "BASE64":
paul@0	328	return base64.decodestring(value)
paul@0	329	else:
paul@1	330	return value
paul@0	331
class ParseError(Exception):

    "An error indicating that the structure of the parsed content is invalid."

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.

    Subclasses must provide the 'startComponent', 'endComponent' and
    'handleProperty' methods invoked by the 'parse' method.
    """

    def __init__(self):

        "Initialise the parser."

        self.names = []     # values of BEGIN declarations not yet ended

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', which must provide a 'line_number'
        attribute for error reporting. The optional 'parser_cls' is called
        with 'f' to obtain an iterator over (name, parameters, value) tuples;
        StreamParser is used if it is omitted.

        A ParseError is raised where an END declaration does not match the
        most recent unfinished BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":

                # NOTE: An END without any preceding BEGIN causes IndexError
                # NOTE: to be raised here.

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)
paul@2	364
class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Begin the component with the given 'name' and 'parameters', giving it
        an empty list of children as its value, and make it the active
        component. The new component is returned.
        """

        created = self.handleProperty(name, parameters, [])
        self.components.append(created)
        return created

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the stack
        of active components and returning it. The outermost component is
        returned but left on the stack so that it remains available as the
        result of parsing. None is returned if no component is active.
        """

        stack = self.components

        if not stack:
            return None
        if len(stack) == 1:
            return stack[-1]
        return stack.pop()

    def handleProperty(self, name, parameters, value):

        """
        Make an object for the property with the given 'name', 'parameters'
        and 'value', attach it to the active component (if any), and return
        it.
        """

        made = self.makeComponent(name, parameters, value)
        self.attachComponent(made)
        return made

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Attach the given 'component' to its parent, this being the active
        component. Nothing is done if no component is active.
        """

        if not self.components:
            return

        # Active components are unpacked as (name, parameters, children).

        parent_name, parent_parameters, siblings = self.components[-1]
        siblings.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Make a component object from the given 'name', 'parameters' and
        'value': here, a plain tuple of the three.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the outermost component
        found. The optional 'parser_cls' is passed on to ParserBase.parse.
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]
paul@0	433
paul@7	434	# Writer classes.
paul@7	435
class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, f, line_length=None):

        """
        Initialise the object with the file 'f'. If 'line_length' is set, the
        length of written lines will conform to the specified value instead of
        the default value.

        A ValueError is raised if the resulting line length is less than 2,
        since a folded line must have room for its leading space and at least
        one character of content.
        """

        self.f = f
        self.line_length = line_length or self.default_line_length

        # Without this check, the write method would never make progress on
        # continuation lines and would loop indefinitely.

        if self.line_length < 2:
            raise ValueError("line_length must be at least 2, not %r" % (self.line_length,))

        self.char_offset = 0    # characters already written on the current line

    def close(self):

        "Close the writer by closing the underlying file."

        self.f.close()

    def write(self, text):

        """
        Write the 'text' to the file, folding the output by inserting CR, LF
        and a space wherever the text would exceed the line length.
        """

        f = self.f
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset

            # Fill the current line and start a continuation line, whose
            # leading space counts towards its length.

            if remaining > space:
                f.write(text[i:i + space])
                f.write("\r\n ")
                self.char_offset = 1
                i += space
                remaining -= space

            # Everything left fits on the current line.

            else:
                f.write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line, doing nothing if the line is empty."

        if self.char_offset > 0:
            self.char_offset = 0
            self.f.write("\r\n")
paul@8	491
class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the writer for the given file 'f', which must provide
        'write', 'end_line' and 'close' methods.
        """

        self.f = f

    def close(self):

        "Close the writer by closing the underlying file."

        self.f.close()

    def write_content_line(self, name, parameters, value):

        """
        Write a content line for the given 'name', 'parameters' and 'value'
        information. Parameters whose value is None are written using only
        their names, this being the form in which the parser provides
        parameters given without a value.
        """

        f = self.f

        f.write(name)
        for parameter_name, parameter_value in parameters.items():
            f.write(";")
            f.write(parameter_name)

            # A valueless parameter has no "=" and no value to write.

            if parameter_value is not None:
                f.write("=")
                f.write(parameter_value)

        f.write(":")
        f.write(self.encode(name, parameters, value))
        f.end_line()

    def encode_content(self, value):

        "Encode the given 'value', quoting newline characters."

        return value.replace("\n", "\\n")

    # Internal methods.

    def encode(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value'. The ENCODING
        parameter selects quoted-printable or base64 encoding; the CHARSET
        parameter (or iso-8859-1 in its absence) is used to encode the value
        before quoted-printable encoding is applied. Newlines in the result
        are then quoted.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = quopri.encodestring(value.encode(charset or "iso-8859-1"))
        elif encoding == "BASE64":
            value = base64.encodestring(value)

        return self.encode_content(value)
paul@7	548
paul@9	549	# Utility functions.
paul@9	550
def is_input_stream(stream_or_string):

    """
    Return whether 'stream_or_string' is a stream suitable for reading, this
    being indicated by the presence of a 'read' attribute.
    """

    readable = hasattr(stream_or_string, "read")
    return readable
paul@9	553
def get_input_stream(stream_or_string):

    """
    Return 'stream_or_string' itself if it is an input stream. Otherwise,
    treat it as a filename and return a stream opened on the named file for
    reading using the default encoding.
    """

    if not is_input_stream(stream_or_string):
        return codecs.open(stream_or_string, encoding=default_encoding)
    return stream_or_string
paul@9	559
def get_output_stream(stream_or_string):

    """
    Return 'stream_or_string' itself if it provides a 'write' attribute.
    Otherwise, treat it as a filename and return a stream opened on the named
    file for writing using the default encoding.
    """

    if not hasattr(stream_or_string, "write"):
        return codecs.open(stream_or_string, "w", encoding=default_encoding)
    return stream_or_string
paul@9	565
paul@0	566	# Public functions.
paul@0	567
def parse(stream_or_string, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or
    a filename identifying a file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' is instantiated without arguments to perform the
    parsing, with Parser being used if it is omitted.

    As a result of parsing the resource, the root node of the imported resource
    is returned. A stream opened here for a filename is closed before
    returning; a stream supplied by the caller is left open.
    """

    reader = Reader(get_input_stream(stream_or_string), non_standard_newline)

    # Parse using the reader.

    try:
        parser_factory = parser_cls or Parser
        return parser_factory().parse(reader)

    # Close any opened streams.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()
paul@9	598
def iterparse(stream_or_string, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or
    a filename identifying a file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' is called with the reader to make the returned
    object, with StreamParser being used if it is omitted.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value). Nothing is closed by this
    function.
    """

    reader = Reader(get_input_stream(stream_or_string), non_standard_newline)
    parser_factory = parser_cls or StreamParser
    return parser_factory(reader)
paul@5	619
def iterwrite(stream_or_string, line_length=None, writer_cls=None):

    """
    Return a writer for the 'stream_or_string', which is either a stream
    providing a 'write' method or a filename identifying a file to be written.

    The optional 'line_length' is passed to the Writer wrapping the stream in
    order to control the folding of lines.

    The optional 'writer_cls' is called with that Writer to make the returned
    object, with StreamWriter being used if it is omitted. Nothing is closed
    by this function.
    """

    line_writer = Writer(get_output_stream(stream_or_string), line_length)
    writer_factory = writer_cls or StreamWriter
    return writer_factory(line_writer)
paul@8	625
paul@0	626	# vim: tabstop=4 expandtab shiftwidth=4