ConfluenceConverter (annotate convert.py in 4ecf97af8a76)

ConfluenceConverter

Annotated convert.py

53:4ecf97af8a76

2013-04-09

Paul Boddie

Handle page renaming to a reasonable extent, although comments and attachments may not appear on particular versions of renamed pages. Added an item about inter-space linking to the "to do" list.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@8	3	"""
paul@8	4	Confluence XML dump conversion to a MoinMoin-compatible representation.
paul@8	5
paul@33	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22	"""
paul@8	23
paul@40	24	from os import chdir, getcwd, listdir, mkdir, makedirs, walk
paul@40	25	from os.path import exists, extsep, join, split, splitext
paul@0	26	from zipfile import ZipFile
paul@0	27	from cStringIO import StringIO
paul@40	28	from MoinMoin import wikiutil
paul@0	29	import codecs
paul@0	30	import xmlread
paul@35	31	import wikiparser, xmlparser
paul@25	32	import sys
paul@0	33
# Maximum number of characters retained from a page title; longer titles are
# truncated in order to avoid filesystem issues.

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', whose name is used both as
        the output directory and as the prefix of page titles. If 'no_translate'
        is set to a true value, page bodies are written out verbatim instead of
        being translated.
        """

        # Properties and collections gathered for the object being parsed.

        self.content = {}

        # Elements gathered for the collection being parsed.

        self.elements = []

        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        Once the object has been processed, the content dictionary is reset so
        that the next object starts with no properties.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types; other object types are ignored.

        if objecttype in ("Page", "Comment", "BlogPost"):
            self._handle_page(objecttype, identifier, content, pages_dir, versions_dir)

        elif objecttype == "BodyContent":
            self._handle_body(content, versions_dir)

        elif objecttype == "Attachment":
            self._handle_attachment(identifier, content, pages_dir)

        self.content = {}

    def _handle_page(self, objecttype, identifier, content, pages_dir, versions_dir):

        """
        Record a revision of a page, comment or blog post having the given
        'identifier' and 'content' in the manifest of the page to which the
        revision belongs, also noting child page and comment relationships.
        """

        # The original version is used as a unifying identifier for all the
        # different revisions (each of which being defined by a Page
        # element). Although "original" implies the first identifier used,
        # it actually appears to be the latest and will have the highest
        # version number.

        pageid = content.get("originalVersion", identifier)
        versionfile = join(versions_dir, identifier)

        # Note page metadata, not necessarily in the correct order.
        # For comments, the title will need to be rewritten, since they
        # should be defined in terms of their owner page.

        # NOTE: This only makes the current title available to comments.

        mkdirs(join(pages_dir, pageid))

        # Limit the title to a "safe" number of characters in order to avoid
        # filesystem issues.

        title = content["title"][:MAX_TITLE_LENGTH]

        if title:
            title = "%s/%s" % (self.space, title)
            write(join(pages_dir, pageid, "pagetitle"), title)

        # See sort_manifest for access to this data.

        append(join(pages_dir, pageid, "manifest"),
            "%s\|AddRevision\|_\|%s\|%s\|%s\|%s\n" % ( # blank added for consistency with AddAttachment
            content["version"],
            versionfile,
            title, # comment titles will incorporate the comment's position
            content["lastModifierName"],
            content["versionComment"]
            ))

        # Add information to parent pages for child page lists.

        if "parent" in content:
            parentid = content["parent"]
            mkdirs(join(pages_dir, parentid))
            append(join(pages_dir, parentid, "children"), title + "\n")

        # Add creation details for comments to the owner page.
        # Since comments can be versioned, the date of the original version
        # is used, and only this "original" version has the owner property.

        if objecttype == "Comment" and "owner" in content:
            ownerid = content["owner"]
            mkdirs(join(pages_dir, ownerid))
            append(join(pages_dir, ownerid, "comments"), "%s\|%s\n" % (content["creationDate"], pageid))

        # Some metadata is not particularly relevant. For example,
        # ancestors, children, parent are navigation-related.

        # Other metadata could be added to the page content itself.
        # For example, labelling could be converted to categories.

    def _handle_body(self, content, versions_dir):

        """
        Write the body held by 'content' to the appropriate file in
        'versions_dir', translating it unless translation has been disabled for
        this handler.
        """

        body = content["body"] or "## Empty page."

        # NOTE: Very simple technique employed for guessing the format.

        # The handler's own setting is consulted here so that the class does
        # not depend on a global defined only when run as a program.

        if self.no_translate:
            fn = write
        elif body.startswith("<"):
            fn = xmltranslate
        else:
            fn = translate

        # Report the affected content for any kind of failure and then let the
        # failure propagate.

        try:
            fn(join(versions_dir, content["content"]), body)
        except:
            err = codecs.getwriter("utf-8")(sys.stderr)
            err.write("Error parsing %s\n" % content["content"])
            raise

    def _handle_attachment(self, identifier, content, pages_dir):

        """
        Record the attachment having the given 'identifier' and 'content' in the
        attachments manifest of the page to which it belongs.
        """

        pageid = content["content"]
        version = content["attachmentVersion"]
        attachid = content.get("originalVersion", identifier)

        # The page directory may not exist yet if the attachment is
        # encountered before any revision of its page.

        mkdirs(join(pages_dir, pageid))

        append(join(pages_dir, pageid, "attachments"),
            "%s\|AddAttachment\|%s\|%s\|%s\|%s\|%s\n" % (
            version,
            # Have to "taint" archive filenames, although Moin will
            # probably handle package script filename tainting.
            wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
            wikiutil.taintfilename(content["fileName"]),
            "", # pagename is substituted later
            content["lastModifierName"],
            content["comment"]
            ))

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, starting a new
        list of elements for any following collection.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection as (class, text) tuples."

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0	223
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is left untouched, but any other failure to create the directory
    is raised as OSError instead of being silently ignored.
    """

    from os.path import isdir

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the error if the directory is actually there.

        if not isdir(name):
            raise
paul@0	232
def append(filename, s):

    """
    Add the string 's' to the end of the file with the given 'filename',
    delegating to the write function in its appending mode.
    """

    write(filename, s, append=True)
paul@0	238
def write(filename, s, append=False):

    """
    Store the string 's' in the file with the given 'filename' using the UTF-8
    encoding. Any existing content is replaced unless the optional 'append'
    parameter is set to a true value, in which case 's' is added to the end of
    the file.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    out = codecs.open(filename, mode, encoding="utf-8")
    try:
        out.write(s)
    finally:
        out.close()
paul@0	251
def read(filename):

    """
    Return the complete contents of the file with the given 'filename', decoded
    from UTF-8, as a single string.
    """

    stream = codecs.open(filename, encoding="utf-8")
    try:
        text = stream.read()
    finally:
        stream.close()

    return text
paul@3	264
def translate(filename, body, fn=None):

    """
    Create the file with the given 'filename', starting it with a pragma line
    recording that filename and then filling it with a translation of the
    given 'body'. The translation is produced by calling 'fn' with the body and
    the open output stream; where 'fn' is omitted, the Wiki markup parser is
    used.
    """

    parse = fn or wikiparser.parse

    stream = codecs.open(filename, "w", encoding="utf-8")
    try:
        stream.write("#pragma page-filename %s\n" % filename)
        parse(body, stream)
    finally:
        stream.close()
paul@11	280
def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', treating the body as XML markup.
    """

    translate(filename, body, fn=xmlparser.parse)
paul@25	283
def sort_comments(pages_dir, pageid):

    """
    For the page identified by 'pageid' within 'pages_dir', put any recorded
    comments into chronological order and give each comment page a title made
    from the owner page's title and the comment's position in that order. The
    title is stored in a "pagetitle" file in the comment page's directory.
    Nothing is done for pages without a "comments" file.
    """

    comments_file = join(pages_dir, pageid, "comments")

    if not exists(comments_file):
        return

    owner_title = read(join(pages_dir, pageid, "pagetitle"))

    # Each record holds a creation date and a comment page identifier, so
    # sorting the records orders them by date.

    records = sorted(
        record.split("\|")
        for record in read(comments_file).split("\n")
        if record
        )

    # Write the sorted comments list for testing purposes.

    write(comments_file, "\n".join(["\|".join(record) for record in records]))

    # Make each comment a subpage of the owner page by writing its title to a
    # special file in the comment's page directory for later processing.

    for index, (_created, commentid) in enumerate(records):
        comment_title = "%s/%04d" % (owner_title, index)
        write(join(pages_dir, commentid, "pagetitle"), comment_title)
paul@31	316
def _sort_manifest(manifest, title):

    """
    Read the file with the given 'manifest' filename and return its entries
    ordered by their numeric version so that they will be added to MoinMoin in
    the correct order.

    Entries lacking their own title are given the supplied 'title' instead.
    This is typically needed for comments and is necessary for attachments.

    Each returned entry is a tuple of the form (action, archive filename,
    filename, title, username, comment), the version being omitted since the
    order of the entries now conveys it.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        rows = [raw.split("\|") for raw in f.readlines()]
    finally:
        f.close()

    rows.sort(key=lambda row: int(row[0]))

    # Reconstruct the entries, retaining any title already present.

    entries = []

    for _version, action, archive_filename, filename, old_title, username, comment in rows:
        entries.append((action, archive_filename, filename, old_title or title, username, comment))

    return entries
paul@40	358
def serialise_manifest(manifest):

    """
    Return a string containing the entries of the given 'manifest', each entry
    having its columns joined by the column separator. The archive filename
    column is dropped from "AddRevision" entries. No line terminator is added
    between entries: the last column of each entry is expected to provide it.
    """

    lines = []

    for entry in manifest:
        fields = list(entry)
        if fields[0] == "AddRevision":
            fields.pop(1)
        lines.append("\|".join(fields))

    return "".join(lines)
paul@40	375
def sort_manifest(pages_dir, pageid, output=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a
    sorted list, without duplicates, to the end of each revision's content.

    If a "comments" file exists and the page has a title, a section including
    the comment subpages will be added to the end of each revision's content.

    If an "attachments" file exists, its sorted entries will follow the
    revisions in the manifest.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    If 'no_translate' is set to a true value, the revision content is left
    untouched: neither child page lists nor comment sections are added.
    """

    page_dir = join(pages_dir, pageid)

    manifest = join(page_dir, "manifest")
    attachments = join(page_dir, "attachments")
    pagetitle = join(page_dir, "pagetitle")
    children = join(page_dir, "children")
    comments = join(page_dir, "comments")

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Prepare the text to be added to every revision once, since it does not
    # vary between revisions.

    additions = []

    if not no_translate:

        # Child page information. A title may have been recorded more than
        # once in the "children" file, so duplicates are discarded.

        if exists(children):
            child_page_names = set([x for x in read(children).split("\n") if x])
            child_pages = [" * [[%s]]" % name for name in sorted(child_page_names)]
            additions.append(child_page_section % "\n".join(child_pages))

        # Comments.

        if exists(comments) and title:
            additions.append(comment_section % title)

    # Add the prepared text to the content of each revision.

    for _action, _archive_filename, filename, _new_title, _username, _comment in result:
        for addition in additions:
            append(filename, addition)

    # Add the attachments to the manifest.

    if exists(attachments):
        result += _sort_manifest(attachments, title)

    # Serialise the manifest.

    s = serialise_manifest(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)
paul@3	444
# Template for child page information: a rule followed by the list of child
# pages substituted for the format specifier.

child_page_section = (
    "\n"
    "----\n"
    "\n"
    "%s\n"
    )

# Template for comments: a rule followed by an inclusion of all subpages of
# the page whose title is substituted for the format specifier.

comment_section = (
    "\n"
    "----\n"
    "\n"
    '<<Include("^%s/")>>\n'
    )
paul@32	460
paul@28	461	# Main program.
paul@28	462
if __name__ == "__main__":

    # Obtain the XML dump filename, the space name and the optional
    # attachments directory from the command line.

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]

        # An empty third argument, or the option itself appearing in its
        # place, means that no attachments directory has been given.

        if len(sys.argv) > 3 and sys.argv[3] and sys.argv[3] != "--no-translate":
            attachments = sys.argv[3]
        else:
            attachments = None
    except IndexError:
        print >>sys.stderr, """
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

com_entities.xml COM '' --no-translate
"""
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    # Refuse to overwrite the results of any previous conversion.

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Archives must be read as binary data.

    if is_zipfile:
        f = open(filename, "rb")
    else:
        f = open(filename)

    zf = None

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage\|1\n")

        for pageid in listdir(pages_dir):
            sort_manifest(pages_dir, pageid, output_manifest, no_translate)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:

                # Archive names are made relative to the directory containing
                # the attachments directory. A trailing separator leaves the
                # final path component empty, so split again in that case.

                parent_dir, attachments_name = split(attachments)
                if not attachments_name:
                    parent_dir, attachments_name = split(parent_dir)

                cwd = getcwd()

                # A bare directory name has no parent component: stay in the
                # current directory instead of changing to an empty path.

                chdir(parent_dir or ".")
                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_name in filenames:
                            archive_path = join(path, attachment_name)
                            # Have to "taint" archive filenames.
                            page_package.write(archive_path, wikiutil.taintfilename(archive_path))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member in zf.namelist():
                    if member.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member), zf.read(member))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        if zf is not None:
            zf.close()
        f.close()
paul@0	582
paul@0	583	# vim: tabstop=4 expandtab shiftwidth=4