ConfluenceConverter (annotate xmlparser.py in 72d0d89748c7)

ConfluenceConverter

Annotated xmlparser.py

147:72d0d89748c7

2017-06-16

Paul Boddie

Introduced elementary support for layout tags, employing special sections, and task list tags, converting task lists to tables.

paul@35	1	#!/usr/bin/env python
paul@35	2
paul@35	3	"""
paul@35	4	Confluence Wiki XML/XHTML syntax parsing.
paul@35	5
paul@144	6	Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>
paul@35	7
paul@35	8	This software is free software; you can redistribute it and/or
paul@35	9	modify it under the terms of the GNU General Public License as
paul@35	10	published by the Free Software Foundation; either version 2 of
paul@35	11	the License, or (at your option) any later version.
paul@35	12
paul@35	13	This software is distributed in the hope that it will be useful,
paul@35	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@35	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@35	16	GNU General Public License for more details.
paul@35	17
paul@35	18	You should have received a copy of the GNU General Public
paul@35	19	License along with this library; see the file LICENCE.txt
paul@35	20	If not, write to the Free Software Foundation, Inc.,
paul@35	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@35	22	"""
paul@35	23
paul@35	24	try:
paul@35	25	from cStringIO import StringIO
paul@35	26	except ImportError:
paul@35	27	from StringIO import StringIO
paul@35	28
paul@51	29	from MoinMoin import wikiutil
paul@35	30	from common import *
paul@35	31	from xmlread import Parser
paul@35	32	import re
paul@35	33	import sys
paul@35	34	import operator
paul@35	35	import htmlentitydefs
paul@41	36	import codecs
paul@35	37
paul@35	38	# XML dialect syntax parsing.
paul@35	39
# Format strings used to translate XHTML/Confluence elements into MoinMoin
# markup. Most entries take a single substitution (the element's converted
# content); the link-like entries take several:
#
#   "ac:link", "ac:image"   (prefix, target, anchor, label)
#   "a"                     (target, label)
#
# The separator between a link target and its label is a plain "|": a
# backslash before it would not be a Python escape sequence and would be
# emitted literally into the output.

tags = {
    # XHTML tag     MoinMoin syntax
    "strong"        : "'''%s'''",
    "em"            : "''%s''",
    "u"             : "__%s__",
    "del"           : "--(%s)--",
    "sup"           : "^%s^",
    "sub"           : ",,%s,,",
    "code"          : "`%s`",
    "tbody"         : "%s",
    "tr"            : "%s",
    "th"            : "'''%s'''",
    "td"            : "%s",
    "blockquote"    : " %s",
    "small"         : "~-%s-~",
    "big"           : "~+%s+~",
    "p"             : "%s",
    "ol"            : "%s",
    "ul"            : "%s",
    "ac:link"       : "[[%s%s%s|%s]]",
    "ac:image"      : "{{%s%s%s|%s}}",
    "a"             : "[[%s|%s]]",
    }
paul@35	63
# Add the translations from 'blocktypes' (provided by the 'common' module) to
# the tag table; an entry for a tag already present in 'tags' is overwritten.

for tag, translation in blocktypes.items():
    tags[tag] = translation
paul@35	66
# Elements replaced by fixed text outside preformatted regions.

simple_tags = {"br" : "<<BR>>"}

# Elements replaced by fixed text inside preformatted regions.

simple_preformatted_tags = {"br" : "\n"}

# List item formats, selected using the kind of list enclosing each item.

list_tags = {
    "ol" : "1. %s",
    "ul" : "* %s",
    }
paul@35	82
# Categories of element names consulted by the parser.

formatted_tags = ["ac:rich-text-body"]
layout_tags = ["ac:layout", "ac:layout-section", "ac:layout-cell"]
preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]

# Task lists are converted using the same machinery as tables.

table_tags = ["ac:task-list", "table"]
table_cell_tags = ["ac:task-body", "ac:task-status", "td", "th"]
table_row_tags = ["ac:task", "tr"]

# Broader categories built from the lists above.

hierarchical_tags = (
    formatted_tags + preformatted_tags + layout_tags + table_tags
    )
indented_tags = ["li", "p"] + hierarchical_tags
block_tags = indented_tags + blocktypes.keys() + list_tags.keys()

# Elements that do not take part in the propagation of block state.

span_override_tags = ["ac:link"]
paul@56	95
# For each Confluence resource element, the attributes that are consulted, in
# order, to build a link target.

link_target_tags = {
    "ri:page"       : ("ri:space-key", "ri:content-title"),
    "ri:attachment" : ("ri:filename",),
    "ri:user"       : ("ri:username",),
    }

# Attributes whose presence causes a prefix to be put in front of the target
# in order to make the resulting link relative.

link_target_prefixes = {
    "ri:space-key"      : "..",
    "ri:content-title"  : "..",
    }

# Names checked when looking for a link label among target attributes.

link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.

# MoinMoin link prefixes for particular kinds of Confluence resource.

link_target_types = {
    "ri:attachment" : "attachment:",
    "ri:user"       : "",
    }
paul@51	118
# MoinMoin admonition styles used for the rich text bodies of Confluence
# macros. An empty style produces a plain "#!wiki" section.

macro_rich_text_styles = {
    "note"      : "caution",
    "warning"   : "warning",
    "info"      : "important",
    "tip"       : "tip",
    "excerpt"   : "",
    }

# For each Confluence macro, pairs of (Confluence argument name, MoinMoin
# argument name) used to build the "args" value substituted into the
# corresponding macro format below.

macroargs = {
    "attachments"   : [("page", "pagename")],
    "color"         : [("color", "col")],
    }

# MoinMoin macro formats, filled in using a mapping of macro parameters.

macrotypes = {
    "anchor"            : "<<Anchor(%(anchor)s)>>",
    "attachments"       : "<<AttachList(%(args)s)>>",
    "color"             : "<<Color2(%(content)s, %(args)s)>>",
    "recently-updated"  : "<<RecentChanges>>",
    "toc"               : "<<TableOfContents>>",
    }
paul@71	142
# Runs of whitespace are collapsed when normalising character data.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35	145
class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out, is_comment_page=False):

        """
        Initialise the parser so that converted top-level text is written to
        'out'. If 'is_comment_page' is set, any prefix added to a link target
        is added twice.
        """

        Parser.__init__(self)
        self.out = out
        self.is_comment_page = is_comment_page

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information: 'macros' and 'macro_parameters' are stacks that
        # are pushed and popped together as macro elements open and close.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the opening of the
        element with the given 'name' and 'attrs'.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in hierarchical_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Each heading starts with no held anchors.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Complete the element with the given 'name', undoing the state changes
        made when it was opened.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in hierarchical_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        # Once the outermost region has been closed, the maximum depth is
        # forgotten.

        if name in hierarchical_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Pass the given 'content' on to the base class, normalising whitespace
        first unless a preformatted element is open.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name in table_tags:
            self.table_rows = 0
        elif name in table_row_tags:
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)

                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                        if self.is_comment_page:
                            target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name in ("ac:link-body", "ac:plain-text-link-body"):
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # A default parameter is recorded under the "ac:name" attribute of the
        # enclosing element.

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences pass their content
        # through unchanged.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted and other hierarchical sections.

        elif name in hierarchical_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name not in table_tags and self.macros and \
                self.macros[-1] in macro_rich_text_styles:

                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string that is applied
                # to the section text below, so any percent sign in them (in
                # a title, for example) must be doubled to remain literal.

                details = details.replace("%", "%%")

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            # Tables employ specially-marked sections.

            elif name in table_tags:
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            # Layout tags may be nested and their markers are placed on separate
            # lines in the output. They also employ specially-marked sections.

            elif name in layout_tags:
                section_name = name.split(":", 1)[-1]
                conversion = "%s#!%s\n%%s\n%s" % (opening, section_name, closing)

            else:
                # Preformatted sections containing newlines must contain an
                # initial newline.

                if "\n" in text and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""
            if anchor:
                anchor = "#%s" % anchor
            label = self.label or text.strip() or self.target
            text = conversion % (prefix, self.target, anchor, label)
            self.target = self.target_type = self.label = None

        elif name == "a":
            text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name in ("ac:macro", "ac:structured-macro"):
            conversion = macrotypes.get(self.macros[-1])

            # Produce the converted macro.

            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])

                # Convert Confluence arguments to Moin arguments. Unlike the
                # wiki markup parser, multiple arguments are supported.

                if argnames:
                    all_args = []
                    for confargname, moinargname in argnames:
                        argvalue = self.macro_parameters[-1].get(confargname)
                        if argvalue:
                            all_args.append(quote_macro_argument("%s=%s" % (moinargname, argvalue)))
                    parameters["args"] = ", ".join(all_args)

                # Obtain the Moin macro with parameters substituted.

                text = conversion % parameters

                # Anchors found where macros are forbidden are held back and
                # emitted in front of the enclosing heading instead.

                if self.macros[-1] == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

            # Warn about macros that are not converted.

            elif self.macros[-1] not in macro_rich_text_styles:
                print >>sys.stderr, "No conversion possible for macro", self.macros[-1]
                print >>sys.stderr, "Macro has arguments", self.macro_parameters[-1]
                print >>sys.stderr

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows: every cell after the first in a
        # row is preceded by a column separator, and every row after the first
        # in a table by a row separator.

        if name in table_cell_tags:
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name in table_row_tags:
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or \
                    (self.have_block and name not in span_override_tags):

                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any currently open element is a heading or an "a"
        element.
        """

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace found directly inside the
        element with the given 'name': nothing for purely structural elements,
        a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)
paul@35	554
def parse(s, out, is_comment_page=False):

    """
    Convert the Confluence page content given by the string 's', writing the
    translation to 'out'. The 'is_comment_page' flag is passed on to the
    parser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    content = s.replace("]] >", "]]>")

    # Wrap the content to make a complete XHTML document.

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % content

    # The parser is given the document as UTF-8 encoded bytes.

    stream = StringIO(document.encode("utf-8"))
    try:
        ConfluenceXMLParser(out, is_comment_page).parse(stream)
    finally:
        stream.close()
paul@35	578
if __name__ == "__main__":

    # Convert UTF-8 encoded content from standard input, writing the UTF-8
    # encoded translation to standard output.

    reader = codecs.getreader("utf-8")(sys.stdin)
    content = reader.read()
    writer = codecs.getwriter("utf-8")(sys.stdout)
    parse(content, writer)
paul@35	583
paul@35	584	# vim: tabstop=4 expandtab shiftwidth=4