ConfluenceConverter (annotate xmlparser.py in 3255ed8e2426)

ConfluenceConverter

Annotated xmlparser.py

106:3255ed8e2426

2013-07-18

Paul Boddie

Added libxml2dom dependency note and updated the "to do" list.

paul@35	1	#!/usr/bin/env python
paul@35	2
paul@35	3	"""
paul@35	4	Confluence Wiki XML/XHTML syntax parsing.
paul@35	5
paul@35	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@35	7
paul@35	8	This software is free software; you can redistribute it and/or
paul@35	9	modify it under the terms of the GNU General Public License as
paul@35	10	published by the Free Software Foundation; either version 2 of
paul@35	11	the License, or (at your option) any later version.
paul@35	12
paul@35	13	This software is distributed in the hope that it will be useful,
paul@35	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@35	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@35	16	GNU General Public License for more details.
paul@35	17
paul@35	18	You should have received a copy of the GNU General Public
paul@35	19	License along with this library; see the file LICENCE.txt
paul@35	20	If not, write to the Free Software Foundation, Inc.,
paul@35	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@35	22	"""
paul@35	23
paul@35	24	try:
paul@35	25	from cStringIO import StringIO
paul@35	26	except ImportError:
paul@35	27	from StringIO import StringIO
paul@35	28
paul@51	29	from MoinMoin import wikiutil
paul@35	30	from common import *
paul@35	31	from xmlread import Parser
paul@35	32	import re
paul@35	33	import sys
paul@35	34	import operator
paul@35	35	import htmlentitydefs
paul@41	36	import codecs
paul@35	37
paul@35	38	# XML dialect syntax parsing.
paul@35	39
# XML dialect syntax parsing.

# Conversion templates applied to the text content of completed elements.
# Most templates take a single substitution (the element's converted content).
# The "ac:link" and "ac:image" templates take four values (link type prefix,
# target, anchor suffix, label) and the "a" template takes two (target, label).

tags = {
    # XHTML tag     MoinMoin syntax
    "strong"        : "'''%s'''",
    "em"            : "''%s''",
    "u"             : "__%s__",
    "del"           : "--(%s)--",
    "sup"           : "^%s^",
    "sub"           : ",,%s,,",
    "code"          : "`%s`",
    "tbody"         : "%s",
    "tr"            : "%s",
    "th"            : "'''%s'''",
    "td"            : "%s",
    "blockquote"    : " %s",
    "small"         : "~-%s-~",
    "big"           : "~+%s+~",
    "p"             : "%s",
    "ol"            : "%s",
    "ul"            : "%s",

    # MoinMoin separates a link or transclusion target from its label with a
    # bare pipe character.

    "ac:link"       : "[[%s%s%s|%s]]",
    "ac:image"      : "{{%s%s%s|%s}}",
    "a"             : "[[%s|%s]]",
    }

# Merge in the block type translations, letting them override any entry above.
# NOTE(review): 'blocktypes' is not defined in this file; presumably it is
# provided by the wildcard import from the common module.

tags.update(blocktypes.items())
paul@35	66
# Empty elements replaced by fixed text outside preformatted regions.

simple_tags = {
    # XHTML tag     MoinMoin syntax
    "br": "<<BR>>",
}

# The same elements as replaced inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag     MoinMoin syntax
    "br": "\n",
}

# Templates applied to list items according to the enclosing list element.

list_tags = {
    # XHTML list tag    MoinMoin list item syntax
    "ol": "1. %s",
    "ul": "* %s",
}

# Groups of element names consulted by the parser.

# Elements whose content is not whitespace-normalised.
preformatted_tags = ["pre", "ac:plain-text-body"]

# Elements whose nested occurrences pass their content through unchanged.
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]

# Elements emitted as nested regions alongside the preformatted ones.
formatted_tags = ["ac:rich-text-body", "table"]

# Elements whose text is prefixed with the current indentation.
indented_tags = ["li", "p"] + preformatted_tags + formatted_tags

# Elements that always start on a new line when text precedes them.
block_tags = indented_tags + blocktypes.keys() + list_tags.keys()

# Elements exempt from the newline that otherwise follows a pending block.
span_override_tags = ["ac:link"]
paul@56	90
# Attributes read from each Confluence resource element to build a link target.

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page": ("ri:space-key", "ri:content-title"),
    "ri:attachment": ("ri:filename",),
    "ri:user": ("ri:username",),
}

# Path components inserted at the front of the target for each attribute
# present, keeping the resulting link relative.

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key": "..",
    "ri:content-title": "..",
}

# Names whose values may serve as the label of a link.

link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment": "attachment:",
    "ri:user": "",
}
paul@51	113
# Styles given to "#!wiki" regions produced for rich text macro bodies.

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note": "caution",
    "warning": "warning",
    "info": "important",
    "tip": "tip",
    "excerpt": "",
}

# For macros taking arguments: the Confluence parameter name and the name used
# for the corresponding MoinMoin macro argument.

macroargs = {
    # Confluence macro      Confluence and MoinMoin macro arguments
    "color": ("color", "col"),
}

# Templates for macros converted directly; these are filled in from a
# dictionary of the macro's parameters together with "content" and "args".

macrotypes = {
    # Confluence macro      MoinMoin syntax
    "anchor": "<<Anchor(%(anchor)s)>>",
    "color": "<<Color2(%(content)s, %(args)s)>>",
    "toc": "<<TableOfContents>>",
}

# Pattern matching each run of whitespace to be normalised.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35	137
class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    Completed elements are converted to MoinMoin markup and appended to the
    text collected for their parent element; text produced at the top level of
    the document is written to the 'out' stream given on construction.

    NOTE(review): the 'text', 'elements' and 'attributes' stacks used here are
    not defined in this class; presumably they are maintained by the Parser
    base class.
    """

    def __init__(self, out):

        "Initialise the parser, writing converted text to 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information: stacks of macro names and parameter dictionaries,
        # plus anchors withheld until the enclosing heading is completed.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        "Update the nesting state for the opened element with the given 'name'."

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Each heading starts with no held anchors.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        "Complete the element with the given 'name' and unwind its state."

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Record the given text 'content', normalising whitespace unless the text
        appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)

                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name == "ac:link-body":
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name of the enclosing element.

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences leave the text as it is.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string applied to the
                # content below, so literal percent signs must be doubled.

                details = details.replace("%", "%%")

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""
            if anchor:
                anchor = "#%s" % anchor
            label = self.label or text.strip() or self.target
            text = conversion % (prefix, self.target, anchor, label)
            self.target = self.target_type = self.label = None

        elif name == "a":
            text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macros[-1])
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])
                if argnames:
                    confargname, moinargname = argnames

                    # NOTE(review): a macro lacking the expected parameter
                    # raises KeyError here.

                    parameters["args"] = quote_macro_argument(
                        "%s=%s" % (moinargname, self.macro_parameters[-1][confargname]))

                text = conversion % parameters

                # Anchors are withheld where macros are forbidden and are
                # emitted ahead of the enclosing heading instead.

                if self.macros[-1] == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows: every cell after the first in a
        # row is preceded by a column separator and every row after the first
        # in a table is preceded by a row separator.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        "Return whether a heading or 'a' element is among the open elements."

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing whitespace within the element with the given
        'name': nothing within structural elements and a space elsewhere.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)
paul@35	513
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is embedded in the body of an XHTML document, encoded as UTF-8
    and then processed by a ConfluenceXMLParser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    content = s.replace("]] >", "]]>")

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % content

    stream = StringIO(document.encode("utf-8"))

    # Release the in-memory stream whether or not parsing succeeds.

    try:
        ConfluenceXMLParser(out).parse(stream)
    finally:
        stream.close()
paul@35	537
if __name__ == "__main__":

    # Convert UTF-8 content from standard input, writing UTF-8 to standard
    # output.

    source = codecs.getreader("utf-8")(sys.stdin).read()
    parse(source, codecs.getwriter("utf-8")(sys.stdout))
paul@35	542
paul@35	543	# vim: tabstop=4 expandtab shiftwidth=4