ConfluenceConverter (annotate convert.py in 702a040785d7)

ConfluenceConverter

Annotated convert.py

20:702a040785d7

2012-06-05

Paul Boddie

Added some documentation; added command line argument handling in the converter.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@8	3	"""
paul@8	4	Confluence XML dump conversion to a MoinMoin-compatible representation.
paul@8	5
paul@8	6	Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22	"""
paul@8	23
paul@3	24	from os import listdir, mkdir, makedirs
paul@1	25	from os.path import exists, extsep, join, splitext
paul@0	26	from zipfile import ZipFile
paul@0	27	from cStringIO import StringIO
paul@0	28	import codecs
paul@0	29	import xmlread
paul@11	30	import parser
paul@0	31
class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The handler methods are intended to be registered as parser callbacks:
    property, collection and element callbacks accumulate the details of the
    object currently being read, and the object callback writes those details
    out beneath the 'space' directory before resetting the accumulated state.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler to write beneath the directory named 'space'. If
        'no_translate' is set to a true value, page bodies are written out
        unchanged instead of being passed to the translator.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type, using the "class" attribute of the
        innermost element as the type and the stripped 'text' as the object
        identifier. The accumulated content is discarded afterwards.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]
            if title:
                title = "%s/%s" % (self.space, title)

            # The leading version column is only used for sorting later on.
            # NOTE(review): the separator is written as backslash plus "|",
            # exactly as before - confirm that the backslash is intended.

            append(join(pages_dir, pageid, "manifest"), "%s\\|AddRevision\\|%s\\|%s\\|%s\\|%s\n" % (
                content["version"],
                versionfile,
                title or content["version"], # comment titles will incorporate the version
                content["lastModifierName"],
                content["versionComment"]))

            # Write comments as subpages.

            if "comments" in content:

                # Define a page directory for each comment, and write the page
                # title in a special file for later processing.

                for _comment, commentid in content["comments"]:
                    mkdirs(join(pages_dir, commentid))
                    append(join(pages_dir, commentid, "pagetitle"), title)

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # Use the setting given to this handler rather than relying on a
            # module-level name that only exists when run as a script.

            if self.no_translate:
                fn = write
            else:
                fn = translate

            fn(join(versions_dir, content["content"]), body)

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, storing the
        elements gathered so far under the collection's name and starting a new,
        empty list of elements.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection, each being recorded as a tuple
        of the element's "class" attribute and its stripped text.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0	145
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. A directory that
    already exists is left untouched; any other failure to create the directory
    is raised as an OSError instead of being silently ignored.
    """

    from os.path import isdir

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the case where the directory is already present.
        # Permission problems or a file occupying the path must be reported.

        if not isdir(name):
            raise
paul@0	154
def append(filename, s):

    "Add the string 's' to the end of the file with the given 'filename'."

    write(filename, s, append=True)
paul@0	160
def write(filename, s, append=False):

    """
    Store the string 's' in the file with the given 'filename' using the UTF-8
    encoding. Any existing content is replaced unless the optional 'append'
    parameter is set to a true value, in which case 's' is added to the end of
    the file.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    stream = codecs.open(filename, mode, encoding="utf-8")
    try:
        stream.write(s)
    finally:
        stream.close()
paul@0	173
def read(filename):

    """
    Return the complete contents of the file with the given 'filename' as a
    string, decoding the file as UTF-8.
    """

    stream = codecs.open(filename, encoding="utf-8")
    try:
        text = stream.read()
    finally:
        stream.close()

    return text
paul@3	186
def translate(filename, body):

    """
    Pass the given 'body' to the parser module, sending the translation it
    produces to a UTF-8 encoded file with the given 'filename'. The file is
    opened for writing, replacing any existing content, and is closed even if
    the translation fails.
    """

    target = codecs.open(filename, "w", encoding="utf-8")
    try:
        parser.parse(body, target)
    finally:
        target.close()
paul@11	199
def sort_manifest(filename, pagetitle, output=None):

    """
    Sort the manifest given in 'filename' according to revision. If a
    'pagetitle' file exists, the title column in the manifest will be augmented
    with the contents of that file. If 'output' is given, the manifest details
    will be appended to the file having that filename instead of being rewritten
    to the original manifest file.

    The leading revision column of each manifest line is only used for ordering
    and is omitted from the details written out. A ValueError is raised if that
    column is not an integer.
    """

    # NOTE(review): the separator is backslash plus "|", matching what is
    # written to the manifests - confirm that the backslash is intended.

    separator = "\\|"

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Only hold the manifest open for as long as it takes to read it.

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        lines = [x.split(separator) for x in f.readlines()]
    finally:
        f.close()

    # Order by revision number. A key function converts each revision once and,
    # the sort being stable, lines with equal revisions keep their file order.

    lines.sort(key=lambda x: int(x[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []
    for x in lines:
        if title is not None:
            x[3] = "%s/%s" % (title, x[3])
        result.append(separator.join(x[1:]))

    s = "".join(result)

    if output is None:
        write(filename, s)
    else:
        append(output, s)
paul@3	236
# Command line usage: convert.py <XML file or zip archive> <workspace> [--no-translate]
#
# The dump is parsed into a new directory named after the workspace, the
# per-page manifests are combined into a single MOIN_PACKAGE manifest, and the
# page revisions plus that manifest are stored in <workspace>.zip.

if __name__ == "__main__":
    import sys

    # Obtain the mandatory arguments.

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    # This name is kept at module level since it may be consulted from there.

    no_translate = "--no-translate" in sys.argv

    # Refuse to overwrite the results of an earlier conversion.

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. An archive is binary data and must not be opened in
    # text mode; a plain XML file is opened exactly as before.

    if is_zipfile:
        f = open(filename, "rb")
    else:
        f = open(filename)

    # Parse the data, closing the input file even if the archive is unreadable.

    try:
        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    # NOTE(review): the header separator is backslash plus "|", exactly as
    # before and as expected by sort_manifest - confirm that the backslash is
    # intended.

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage\\|1\n")

    for pageid in listdir(pages_dir):
        manifest = join(pages_dir, pageid, "manifest")
        pagetitle = join(pages_dir, pageid, "pagetitle")
        sort_manifest(manifest, pagetitle, output_manifest)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions. These are stored under their paths
        # relative to the current directory, as recorded in the manifests.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()
paul@3	322
paul@0	323	# vim: tabstop=4 expandtab shiftwidth=4