paul@6 | 1 | #!/usr/bin/env python |
paul@6 | 2 | |
paul@7 | 3 | """ |
paul@7 | 4 | Confluence Wiki syntax parsing. |
paul@7 | 5 | |
paul@8 | 6 | Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk> |
paul@8 | 7 | |
paul@8 | 8 | This software is free software; you can redistribute it and/or |
paul@8 | 9 | modify it under the terms of the GNU General Public License as |
paul@8 | 10 | published by the Free Software Foundation; either version 2 of |
paul@8 | 11 | the License, or (at your option) any later version. |
paul@8 | 12 | |
paul@8 | 13 | This software is distributed in the hope that it will be useful, |
paul@8 | 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
paul@8 | 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
paul@8 | 16 | GNU General Public License for more details. |
paul@8 | 17 | |
paul@8 | 18 | You should have received a copy of the GNU General Public |
paul@8 | 19 | License along with this library; see the file LICENCE.txt |
paul@8 | 20 | If not, write to the Free Software Foundation, Inc., |
paul@8 | 21 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
paul@8 | 22 | |
paul@8 | 23 | -------- |
paul@8 | 24 | |
paul@8 | 25 | The basic procedure is as follows: |
paul@8 | 26 | |
paul@7 | 27 | 1. Wiki pages are first split up into regions. |
paul@7 | 28 | 2. Then, within these regions, the text is split into blocks. |
paul@7 | 29 | 1. First, lists are identified. |
paul@7 | 30 | 2. Additionally, other block-like elements are identified. |
paul@7 | 31 | 3. Each block is then parsed. |
paul@7 | 32 | """ |
paul@7 | 33 | |
paul@25 | 34 | try: |
paul@25 | 35 | from cStringIO import StringIO |
paul@25 | 36 | except ImportError: |
paul@25 | 37 | from StringIO import StringIO |
paul@25 | 38 | |
paul@25 | 39 | from xmlread import Parser |
paul@6 | 40 | import re |
paul@25 | 41 | import sys |
paul@26 | 42 | import operator |
paul@27 | 43 | import htmlentitydefs |
paul@6 | 44 | |
paul@19 | 45 | URL_SCHEMES = ("http", "https", "ftp", "mailto") |
paul@19 | 46 | |
paul@6 | 47 | # Section extraction. |
paul@6 | 48 | |
sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split 's' into a list of regions, returning each region as a (type, text)
    tuple. Text appearing outside any recognised section is reported with a
    type of None.
    """

    regions = []
    pos = 0

    for m in sections_regexp.finditer(s):
        begin, finish = m.span()

        # Plain text preceding the section, then the section itself.

        regions.append((None, s[pos:begin]))
        regions.append(get_section_details(m.group()))
        pos = finish

    # Any trailing plain text.

    regions.append((None, s[pos:]))
    return regions
paul@6 | 68 | |
paul@7 | 69 | # Section inspection. |
paul@7 | 70 | |
section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Inspect the section 's', returning a ((type, options), text) tuple for a
    well-formed section, or (None, s) if 's' is not a recognisable section.
    """

    m = section_regexp.match(s)
    if m is None:
        return None, s
    return (m.group("sectiontype"), m.group("options")), m.group("section")
paul@6 | 83 | |
paul@14 | 84 | # Heading, table and list extraction. |
paul@7 | 85 | |
list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

# Lists, tables and heading/blockquote lines combined as alternatives.

blockelement_regexp = re.compile(
    "|".join(["(%s)" % list_regexp_str,
              "(%s)" % table_regexp_str,
              "(%s)" % blocktext_regexp_str]),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Return a list of (type, text) tuples for the headings, tables and lists
    found in 's', with any other text reported with a type of None.
    """

    blocks = []
    pos = 0

    for m in blockelement_regexp.finditer(s):
        begin, finish = m.span()

        # Classify the matched element.

        if m.group("listtype"):
            matchtype = "list"
        elif m.group("celltype"):
            matchtype = "table"
        else:
            matchtype = m.group("type")

        # Plain text preceding the element, then the element itself. Headings
        # and blockquotes provide just their text; lists and tables provide
        # the whole matched region.

        blocks.append((None, s[pos:begin]))
        blocks.append((matchtype, m.group("text") or m.group()))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks
paul@7 | 115 | |
paul@7 | 116 | # Block extraction. |
paul@7 | 117 | |
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split 's' on runs of blank lines, returning a list of the remaining
    non-blank blocks.
    """

    blocks = []
    for candidate in block_regexp.split(s):
        if candidate.strip():
            blocks.append(candidate)
    return blocks
paul@7 | 129 | |
paul@7 | 130 | # Block inspection. |
paul@7 | 131 | |
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Heading, list and table blocks pass straight through.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))
            continue

        # Other regions are split into paragraph-like subblocks.

        for subblock in get_basic_blocks(blocktext):
            blocks.append((None, subblock))

    return blocks
paul@7 | 155 | |
paul@14 | 156 | # List item inspection. |
paul@14 | 157 | |
listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return (marker, text) tuples describing each item of the list in 'text'.
    """

    return [(m.group("marker"), m.group("text"))
            for m in listitem_regexp.finditer(text)]
paul@14 | 171 | |
paul@14 | 172 | # Table row inspection. |
paul@14 | 173 | |
# Inline content patterns: monospace spans, links and images.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

# Table content additionally recognises cell separators ("|" and "||").

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)
paul@16 | 195 | |
def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.
    Monospace spans, links and images are converted to Moin syntax; any other
    match is returned untranslated.
    """

    # Monospace spans: {{text}} becomes {{{text}}}.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links: [label|target|title] with label and title optional.

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            # Retain any extra separators within the title instead of
            # failing on links with more than two "|" characters.
            label, target = parts[0], parts[1]
            title = "|".join(parts[2:])

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Make the link tidier: where neither label nor title was given, the
        # target alone suffices. (This test must precede the defaulting of
        # the label, or it can never succeed.)

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Default the label to the target where a title demands one.

        label = label or target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images: !location|options! with optional options.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Anything else passes through untranslated.

    else:
        return match.group()
paul@16 | 271 | |
def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        pos = 0

        for m in table_content_regexp.finditer(line):
            begin, finish = m.span()
            columns[-1] += line[pos:begin]

            # A separator starts a new column; the first one seen determines
            # the row's type. Other matched content (monospace, links,
            # images) stays within the current cell.

            if m.group("celltype"):
                if cellsep is None:
                    cellsep = m.group("celltype")
                columns.append("")
            else:
                columns[-1] += m.group()

            pos = finish

        columns[-1] += line[pos:]

        # Only lines containing separators yield rows; the empty leading and
        # trailing columns are discarded.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows
paul@14 | 301 | |
def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    # Content in unformatted sections passes through untranslated.

    verbatim = sectiontype in ("code", "noformat")

    output = []
    pos = 0

    for m in content_regexp.finditer(text):
        begin, finish = m.span()
        output.append(text[pos:begin])

        if verbatim:
            output.append(m.group())
        else:
            output.append(translate_content_match(m))

        pos = finish

    output.append(text[pos:])
    return "".join(output)
paul@16 | 328 | |
paul@15 | 329 | # Translation helpers. |
paul@14 | 330 | |
# Templates for heading and blockquote blocks, keyed by the Confluence block
# type prefix (e.g. "h1." lines).

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }
paul@11 | 340 | |
# Confluence list marker characters and their Moin bullet equivalents.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # Indentation grows with the marker length; the final character selects
    # the bullet type.

    depth = len(marker)
    bullet = markers[marker[-1]]
    return "%s%s" % (" " * depth, bullet)

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Header cells ("||") have their content emphasised in the Moin output.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    extra = cellextra[cellsep]
    return "%s%s%s" % (extra, translate_content(text), extra)
paul@14 | 374 | |
# Moin parser specifications for Confluence section types; an empty value
# produces a plain {{{ ... }}} region.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }
paul@15 | 384 | |
# XML dialect syntax parsing.

# Templates for inline and block XHTML/Confluence elements.

tags = {
    # XHTML tag       MoinMoin syntax
    "strong"              : "'''%s'''",
    "em"                  : "''%s''",
    "u"                   : "__%s__",
    "del"                 : "--(%s)--",
    "sup"                 : "^%s^",
    "sub"                 : ",,%s,,",
    "code"                : "`%s`",
    "pre"                 : "{{{%s}}}",
    "blockquote"          : "  %s",
    "small"               : "~-%s-~",
    "big"                 : "~+%s+~",
    "p"                   : "\n%s\n",
    "ol"                  : "\n%s",
    "ul"                  : "\n%s",
    "ac:plain-text-body"  : "{{{%s}}}",
    "ac:link"             : "[[%s%s|%s]]",
    }

# Heading and blockquote elements reuse the wiki syntax templates, padded
# with surrounding blank lines.

for tag, translation in blocktypes.items():
    tags[tag] = "\n%s\n" % translation

simple_tags = {
    # XHTML tag       MoinMoin syntax
    "br"                  : "<<BR>>",
    }

list_tags = {
    # XHTML list tag  MoinMoin list item syntax
    "ol"                  : "1. %s\n",
    "ul"                  : "* %s\n",
    }

# Elements whose text is indented according to the list nesting level.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element  Attribute providing the target
    "ri:page"             : "ri:content-title",
    "ri:attachment"       : "ri:filename",
    }

macro_rich_text_styles = {
    # Confluence style    MoinMoin admonition style
    "note"                : "caution",
    "warning"             : "warning",
    "info"                : "important",
    "tip"                 : "tip",
    }

# Whitespace normalisation: runs of whitespace anywhere, and trailing runs
# of two or more whitespace characters respectively.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

normalise_end_regexp_str = r"\s\s+$"
normalise_end_regexp = re.compile(normalise_end_regexp_str)
paul@25 | 442 | |
class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        "Initialise the parser, writing translated output to 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and preformatted states. Each state is a nesting count
        # so that nested preformatted regions are tracked correctly.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        "Track list nesting and preformatted regions for element 'name'."

        if list_tags.has_key(name):
            self.indent += 1
        elif self.states.has_key(name):
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        "Unwind the state recorded by startElement for element 'name'."

        Parser.endElement(self, name)
        if list_tags.has_key(name):
            self.indent -= 1
        elif self.states.has_key(name):
            self.states[name] -= 1

    def characters(self, content):

        """
        Receive character data 'content', normalising whitespace outside
        preformatted regions.
        """

        if not self.is_preformatted():
            # NOTE(review): assumes self.elements is a stack of open element
            # names maintained by Parser - confirm against xmlread.
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        "Convert the HTML entity 'name' to a character, where known."

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the completed element 'name', translating its accumulated
        text and appending the result to the parent element's text nodes, or
        emitting it directly for the outermost element.
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            else:
                prefix = "../"

            # The element text provides the label, defaulting to the target.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            preceding = "".join(self.text[-2])

            # Tidy trailing whitespace in the parent's text before appending.

            if not self.is_preformatted():
                preceding = self.normalise_end(preceding, self.elements[-2])

            self.text[-2] = [preceding]
            self.text[-2].append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted region is currently open."

        return reduce(operator.or_, self.states.values(), False)

    def get_replacement(self, name, end=False):

        """
        Return the replacement for whitespace within element 'name', where
        'end' indicates whether trailing whitespace is being replaced.
        """

        if list_tags.has_key(name):
            if end:
                return "\n"
            else:
                return ""
        elif name == "body":
            return "\n\n"
        else:
            return " "

    def normalise(self, text, name):

        "Collapse whitespace runs in 'text' occurring within element 'name'."

        return normalise_regexp.sub(self.get_replacement(name), text)

    def normalise_end(self, text, name):

        "Collapse trailing whitespace in 'text' within element 'name'."

        return normalise_end_regexp.sub(self.get_replacement(name, True), text)
paul@25 | 598 | |
def xmlparse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    # Wrap the fragment in an XHTML envelope so that it forms a complete,
    # well-formed document for the XML parser.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    # Parse from an in-memory UTF-8 stream, closing it even on failure.

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()
paul@25 | 622 | |
paul@15 | 623 | # General parsing. |
paul@15 | 624 | |
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    Each region is either plain wiki text (translated block by block) or a
    section such as {code} or {note} (translated to a Moin parser region).
    """

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

            # Separate this region from the next with a blank line.

            print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                # Trailing commas suppress the newline so that the section
                # content follows the opening braces on the same line.

                print >>out, "{{{",
            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out
paul@11 | 678 | |
if __name__ == "__main__":

    # Translate Confluence wiki syntax on standard input to MoinMoin syntax
    # on standard output.

    s = sys.stdin.read()
    parse(s, sys.stdout)
paul@6 | 682 | |
paul@6 | 683 | # vim: tabstop=4 expandtab shiftwidth=4 |