ConfluenceConverter (annotate convert.py in 7fdb737bed89)

ConfluenceConverter

Annotated convert.py

33:7fdb737bed89

2013-02-17

Paul Boddie

Propagated the "no translate" option to functionality wanting to add extra content to pages.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@8	3	"""
paul@8	4	Confluence XML dump conversion to a MoinMoin-compatible representation.
paul@8	5
paul@33	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22	"""
paul@8	23
paul@3	24	from os import listdir, mkdir, makedirs
paul@1	25	from os.path import exists, extsep, join, splitext
paul@0	26	from zipfile import ZipFile
paul@0	27	from cStringIO import StringIO
paul@0	28	import codecs
paul@0	29	import xmlread
paul@11	30	import parser
paul@25	31	import sys
paul@0	32
paul@23	33	MAX_TITLE_LENGTH = 120
paul@23	34
class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', which names both the
        output directory and the prefix applied to page titles. If the optional
        'no_translate' parameter is set to a true value, page bodies are written
        out unchanged instead of being translated.
        """

        # Properties and collections gathered for the object being parsed.

        self.content = {}

        # Elements gathered for the collection being parsed.

        self.elements = []

        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type, using the properties collected in the
        current content dictionary. Pages, comments and blog posts have their
        metadata recorded in the "pages" directory; body content is written to
        the "versions" directory. The content dictionary is reset afterwards.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = title[:MAX_TITLE_LENGTH]

            if title:
                title = "%s/%s" % (self.space, title)
                write(join(pages_dir, pageid, "pagetitle"), title)

            # See sort_manifest for access to this data.

            append(join(pages_dir, pageid, "manifest"),
                "%s\|AddRevision\|%s\|%s\|%s\|%s\n" % (
                content["version"],
                versionfile,
                title, # comment titles will incorporate the comment's position
                content["lastModifierName"],
                content["versionComment"]
                ))

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Add creation details for comments to the owner page.
            # Since comments can be versioned, the date of the original version
            # is used, and only this "original" version has the owner property.

            if objecttype == "Comment" and "owner" in content:
                ownerid = content["owner"]
                mkdirs(join(pages_dir, ownerid))
                append(join(pages_dir, ownerid, "comments"), "%s\|%s\n" % (content["creationDate"], pageid))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # NOTE: Very simple technique employed for guessing the format.

            # The instance's own setting is consulted here: relying on a
            # module-level 'no_translate' name only works when this file is run
            # as a script.

            if self.no_translate:
                fn = write
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            # Show the offending body before propagating any failure.

            try:
                fn(join(versions_dir, content["content"]), body)
            except:
                print >>sys.stderr, "Error parsing..."
                print >>sys.stderr, body
                raise

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0	180
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. Where the directory
    already exists, nothing is done. Any other failure, such as a lack of
    permissions or a file occupying the path, is raised as OSError instead of
    being silently ignored.
    """

    # Imported here so that no additional module-level import is required.

    from os.path import isdir

    try:
        makedirs(name)
    except OSError:

        # Only the presence of the directory makes the failure acceptable.

        if not isdir(name):
            raise
paul@0	189
def append(filename, s):

    "Add the string 's' to the end of the file having the given 'filename'."

    write(filename, s, append=True)
paul@0	195
def write(filename, s, append=False):

    """
    Store the string 's' in the file with the given 'filename' using the UTF-8
    encoding. Any existing content is replaced unless the optional 'append'
    parameter is set to a true value, in which case 's' is added to the end of
    the file.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    stream = codecs.open(filename, mode, encoding="utf-8")
    try:
        stream.write(s)
    finally:
        stream.close()
paul@0	208
def read(filename):

    """
    Return the complete contents of the file with the given 'filename' as a
    string, decoding the file as UTF-8.
    """

    stream = codecs.open(filename, encoding="utf-8")
    try:
        text = stream.read()
    finally:
        stream.close()

    return text
paul@3	221
def translate(filename, body, fn=None):

    """
    Translate the given 'body', sending the result to the file with the given
    'filename', which is written using the UTF-8 encoding. The optional 'fn'
    is called with the body and the open output file in order to perform the
    translation; where omitted, parser.parse is used.
    """

    if not fn:
        fn = parser.parse

    stream = codecs.open(filename, "w", encoding="utf-8")
    try:
        fn(body, stream)
    finally:
        stream.close()
paul@11	236
def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', performed by parser.xmlparse instead of the default parser.parse.
    """

    translate(filename, body, fn=parser.xmlparse)
paul@25	239
def sort_comments(pages_dir, pageid):

    """
    Process any "comments" file recorded for the page identified by 'pageid'
    within 'pages_dir'. The entries in the file are sorted (and thus ordered by
    their leading date field), the sorted list is written back, and each
    comment page is given a "pagetitle" file combining the owner page's title
    with the comment's zero-padded position in the sorted sequence. Pages
    without a "comments" file are left alone.
    """

    comments_file = join(pages_dir, pageid, "comments")

    if not exists(comments_file):
        return

    owner_title = read(join(pages_dir, pageid, "pagetitle"))

    # Collect the fields of each non-empty line and order the entries.

    entries = []

    for line in read(comments_file).split("\n"):
        if line:
            entries.append(line.split("\|"))

    entries.sort()

    # Write the sorted comments list for testing purposes.

    serialised = []

    for entry in entries:
        serialised.append("\|".join(entry))

    write(comments_file, "\n".join(serialised))

    # Define comments as subpages of this page: each comment's title is made
    # from this page's title and the comment's position in the collection, and
    # is written to a special file in the comment's page directory for later
    # processing.

    for position, entry in enumerate(entries):
        _created, commentid = entry
        comment_title = "%s/%04d" % (owner_title, position)
        write(join(pages_dir, commentid, "pagetitle"), comment_title)
paul@31	272
def sort_manifest(pages_dir, pageid, output=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page within
    'pages_dir' along with any page title information written to a "pagetitle"
    file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order. The version column is dropped from the
    resulting entries.

    If a "pagetitle" file exists, the title column in the manifest will be
    replaced with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If a "comments" file exists and the page has a title, a section including
    the comment pages will be added to the end of each revision's content.

    If 'no_translate' is set to a true value, no such extra content is added to
    the revisions.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.
    """

    manifest = join(pages_dir, pageid, "manifest")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Read the manifest, releasing the file before any further work is done.

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("\|") for x in f.readlines()]
    finally:
        f.close()

    # Order the entries by their numeric version.

    lines.sort(key=lambda x: int(x[0]))

    # Prepare the extra content once, since it is the same for every revision.

    extra_sections = []

    if not no_translate:

        # Child page information.

        if exists(children):
            child_page_names = [x for x in read(children).split("\n") if x]
            child_page_names.sort()
            child_pages = [" * [[%s]]" % x for x in child_page_names]
            extra_sections.append(child_page_section % "\n".join(child_pages))

        # Comments.

        if exists(comments) and title:
            extra_sections.append(comment_section % title)

    # Reconstruct the lines, optionally changing the titles.

    result = []

    for line in lines:
        _version, _addrevision, filename, old_title, username, comment = line

        # Replace title information with the information already present.

        if title is not None:
            new_title = title
        else:
            new_title = old_title

        # The version is omitted now that the manifest is ordered.

        line = _addrevision, filename, new_title, username, comment
        result.append("\|".join(line))

        # Add the extra content to the revision.

        for section in extra_sections:
            append(filename, section)

    # Each entry retains the line ending read from the manifest.

    s = "".join(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)
paul@3	354
# Template for child page information, appended to page revisions by
# sort_manifest. The %s placeholder receives the list of child page links, one
# per line, which follows a "----" separator line.

child_page_section = """
----

%s
"""
paul@28	362
# Template for comments, appended to page revisions by sort_manifest. The %s
# placeholder receives the page's title, producing an Include macro whose
# pattern covers the pages named beneath that title.

comment_section = """
----

<<Include("^%s/")>>
"""
paul@32	370
# Main program: convert the XML dump named by the first argument into a page
# package for the workspace named by the second argument. The presence of
# --no-translate amongst the arguments causes page content to be written out
# without translation.

if __name__ == "__main__":
    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    # Refuse to overwrite the results of an earlier conversion.

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Archives must be read in binary mode so that their
    # contents are not altered on platforms distinguishing text from binary
    # files.

    f = open(filename, is_zipfile and "rb" or "r")

    # Parse the data, closing the dump even if the archive cannot be read.

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    # Comments are given their titles first so that the manifests can use them.

    for pageid in listdir(pages_dir):
        sort_comments(pages_dir, pageid)

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage\|1\n")

    for pageid in listdir(pages_dir):
        sort_manifest(pages_dir, pageid, output_manifest, no_translate)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()
paul@3	459
paul@0	460	# vim: tabstop=4 expandtab shiftwidth=4