ConfluenceConverter (annotate xmlparser.py in 90c4ddc8afb6)

ConfluenceConverter

Annotated xmlparser.py

145:90c4ddc8afb6

2017-06-16

Paul Boddie

Added some resources describing Confluence storage and format representations.

paul@35	1	#!/usr/bin/env python
paul@35	2
paul@35	3	"""
paul@35	4	Confluence Wiki XML/XHTML syntax parsing.
paul@35	5
paul@144	6	Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>
paul@35	7
paul@35	8	This software is free software; you can redistribute it and/or
paul@35	9	modify it under the terms of the GNU General Public License as
paul@35	10	published by the Free Software Foundation; either version 2 of
paul@35	11	the License, or (at your option) any later version.
paul@35	12
paul@35	13	This software is distributed in the hope that it will be useful,
paul@35	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@35	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@35	16	GNU General Public License for more details.
paul@35	17
paul@35	18	You should have received a copy of the GNU General Public
paul@35	19	License along with this library; see the file LICENCE.txt
paul@35	20	If not, write to the Free Software Foundation, Inc.,
paul@35	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@35	22	"""
paul@35	23
paul@35	24	try:
paul@35	25	from cStringIO import StringIO
paul@35	26	except ImportError:
paul@35	27	from StringIO import StringIO
paul@35	28
paul@51	29	from MoinMoin import wikiutil
paul@35	30	from common import *
paul@35	31	from xmlread import Parser
paul@35	32	import re
paul@35	33	import sys
paul@35	34	import operator
paul@35	35	import htmlentitydefs
paul@41	36	import codecs
paul@35	37
paul@35	38	# XML dialect syntax parsing.
paul@35	39
# Conversion templates for elements whose (already converted) content is
# substituted into MoinMoin markup. The link and image templates take several
# values: see their use in ConfluenceXMLParser.handleElement.

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",

    # NOTE: The target/label separator is a plain "|": a "\|" sequence is not
    # NOTE: a valid Python escape and would put a literal backslash into the
    # NOTE: generated markup.

    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Add the block type translations provided by the common module, replacing
# any entries above that use the same tag names.

tags.update(blocktypes)

# Elements without content that are replaced by fixed text.

simple_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# The equivalent replacements used inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "\n",
    }

list_tags = {
    # XHTML list tag        MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

# The key collections are converted to lists explicitly so that the
# concatenation does not depend on keys() returning a list.

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + list(blocktypes) + list(list_tags)
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

macroargs = {
    # Confluence macro      Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro      MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "toc"                   : "<<TableOfContents>>",
    }

# Runs of whitespace are collapsed outside preformatted regions.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35	137
class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    Converted text for each completed element is appended to the text nodes
    of its parent element, or written to the 'out' stream when there is no
    parent element.

    NOTE(review): The element, attribute and text stacks ('self.elements',
    'self.attributes', 'self.text') are assumed to be maintained by the Parser
    base class, which is not visible here; confirm against xmlread.
    """

    def __init__(self, out, is_comment_page=False):

        """
        Initialise the parser to write converted content to 'out'. If
        'is_comment_page' is true, an extra relative prefix is inserted into
        page link targets (see handleElement).
        """

        Parser.__init__(self)
        self.out = out
        self.is_comment_page = is_comment_page

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information: parallel stacks of macro names and parameter
        # dictionaries, plus anchors held back while inside headings/links.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the element with
        the given 'name' and 'attrs' before and after delegating to the base
        class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Complete the element with the given 'name', undoing the state changes
        made by startElement.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        # NOTE(review): The base class presumably invokes handleElement here,
        # NOTE(review): while the nesting state below is still in effect.

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless
        inside a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)

                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                        if self.is_comment_page:
                            target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name in ("ac:link-body", "ac:plain-text-link-body"):
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name of the enclosing
        # element (the second-to-last set of attributes).

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences of the same tag do not
        # add further markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if "\n" in text and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""
            anchor_suffix = ("#%s" % anchor) if anchor else ""
            label = self.label or text.strip() or self.target
            text = conversion % (prefix, self.target, anchor_suffix, label)
            self.target = self.target_type = self.label = None

        elif name == "a":
            text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name in ("ac:macro", "ac:structured-macro"):
            conversion = macrotypes.get(self.macros[-1])

            # Produce the converted macro.

            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])
                if argnames:
                    confargname, moinargname = argnames

                    # NOTE(review): A macro lacking the expected Confluence
                    # NOTE(review): argument raises KeyError here; confirm
                    # NOTE(review): whether such input can occur.

                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[-1][confargname]))

                # Obtain the Moin macro with parameters substituted.

                text = conversion % parameters

                # Hold back anchors where macros are not permitted so that
                # they can be emitted ahead of an enclosing heading.

                if self.macros[-1] == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

            # Warn about macros that are not converted.

            elif self.macros[-1] not in macro_rich_text_styles:
                print >>sys.stderr, "No conversion possible for macro", self.macros[-1]
                print >>sys.stderr, "Macro has arguments", self.macro_parameters[-1]
                print >>sys.stderr

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):

            # Separate all but the first cell in a row using "||". A "\|"
            # sequence here would emit literal backslashes.

            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether the parser is currently inside any preformatted element."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any currently open element is a heading or an 'a'
        element, in which case macros are held back instead of being emitted
        in place.
        """

        return any((tag in headings or tag == "a") for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace directly within the
        element with the given 'name': nothing for structural elements, a
        single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the enclosing element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)
paul@35	529
def parse(s, out, is_comment_page=False):

    """
    Translate the Confluence content in the Unicode string 's', writing the
    result to 'out'. The content is wrapped in an XHTML document before being
    parsed. The 'is_comment_page' flag is passed on to the parser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    fragment = s.replace("]] >", "]]>")

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % fragment

    # The parser reads UTF-8 encoded bytes from a file-like object.

    stream = StringIO(document.encode("utf-8"))
    try:
        ConfluenceXMLParser(out, is_comment_page).parse(stream)
    finally:
        stream.close()
paul@35	553
if __name__ == "__main__":

    # Convert UTF-8 content from standard input, writing the translation as
    # UTF-8 to standard output.

    writer = codecs.getwriter("utf-8")(sys.stdout)
    source = codecs.getreader("utf-8")(sys.stdin).read()
    parse(source, writer)
paul@35	558
paul@35	559	# vim: tabstop=4 expandtab shiftwidth=4