ConfluenceConverter (annotate wikiparser.py in c3d772d8cbad)

ConfluenceConverter

Annotated wikiparser.py

123:c3d772d8cbad

2013-11-02

Paul Boddie

Added revision and attachment timestamping, sorting edits by such time details. Added a merge script to combine page packages for a single coherent import.

paul@6	1	#!/usr/bin/env python
paul@6	2
paul@7	3	"""
paul@7	4	Confluence Wiki syntax parsing.
paul@7	5
paul@34	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22
paul@8	23	--------
paul@8	24
paul@8	25	The basic procedure is as follows:
paul@8	26
paul@7	27	1. Wiki pages are first split up into regions.
paul@7	28	2. Then, within these regions, the text is split into blocks.
paul@7	29	1. First, lists are identified.
paul@7	30	2. Additionally, other block-like elements are identified.
paul@78	31	3. Each block is then split into regions.
paul@7	32	"""
paul@7	33
paul@35	34	from common import *
paul@6	35	import re
paul@25	36	import sys
paul@41	37	import codecs
paul@77	38	import operator
paul@19	39
paul@6	40	# Section extraction.
paul@6	41
# Each alternative recognises one kind of region boundary: a {macro} or
# {macro:options} marker, the start of a table row, the end of a table row, or
# a complete list item line.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}" \
                      r"|" \
                      r"^(?P<rowstart>[|]{1,2})" \
                      r"|" \
                      r"(?P<rowend>[|]{1,2}(\n|$))" \
                      r"|" \
                      r"^(?P<listitem>\s*[*#-]+\s+.*?([^|](\n|$)|(?=[|](\n|$))))"

sections_regexp = re.compile(sections_regexp_str, re.MULTILINE)
paul@6	51
def get_regions(s):

    """
    Return a list of regions from 's'. Each region is described by the result
    of get_section_details: a tuple of the form (type, text) where the type is
    None for text outside any section, or (sectiontype, options) otherwise.
    """

    last = 0            # end position of the previous marker
    regions = [""]      # region strings; the last one is being accumulated
    depth = 0           # number of currently open sections/rows
    had_row = False     # whether the previous marker was a table row marker
    had_item = False    # whether the previous marker was a list item

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")
        is_item = match.group("listitem")

        # The start of a region is either indicated by a marker with options or
        # by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match to
            # the current "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # A new row may either continue a table region or start a new
                # table region.

                elif is_row:
                    if had_row and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()
                    else:
                        regions.append(s[start:end])

                # A list item may either continue a list region or start a new
                # list region.

                elif is_item:

                    # If continuing a list, merge the list regions and start a
                    # new potentially separate region.

                    if had_item and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions[-1] = ""

                    # If not continuing a list, make a region for a new list
                    # and start a new potentially separate region.

                    else:
                        regions.append(s[start:end])
                        regions.append("")

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section or is_row:
                depth += 1

        # The end of a region is indicated by a marker with no options or the
        # end of a row. A region is always active here (depth is non-zero), so
        # the marker and the preceding text always belong to it.

        else:
            regions[-1] += s[last:end]

            if is_section or is_row:

                # Closing the outermost region terminates it: subsequent text
                # is collected in a fresh region.

                if depth == 1:
                    regions.append("")

                depth -= 1

        had_row = is_row
        had_item = is_item
        last = end

    # Any remaining text belongs to the region still being accumulated.

    regions[-1] += s[last:]

    return [get_section_details(region) for region in regions if region]
paul@75	171
def is_section_marker(sectiontype):

    """
    Return whether 'sectiontype' names a marker that opens or closes a section:
    any key of 'sectiontypes', or the "color" macro which also wraps content.
    """

    # dict.has_key is not available in Python 3; "in" works everywhere.

    return sectiontype in sectiontypes or sectiontype == "color"
paul@6	174
paul@7	175	# Section inspection.
paul@7	176
# A section is "{type}" or "{type:options}", then the content, then a closing
# "{type}" marker naming the same type.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text). For a
    recognised section the type is a (sectiontype, options) tuple, where the
    options are None if absent, and the text is the section content without
    its markers. Otherwise the result is (None, s).
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s
paul@6	189
paul@14	190	# Heading, table and list extraction.
paul@7	191
# Lists are runs of lines starting with the same kind of marker, tables are
# runs of rows delimited by "|" or "||", and block text is a heading ("h1." and
# so on) or blockquote ("bq.") line.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples. The type is "list", "table", the heading or
    blockquote type (such as "h1" or "bq"), or None for the text found between
    such elements.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))

        # Headings and blockquotes provide their text without the prefix; lists
        # and tables are provided in full for later inspection.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks
paul@7	221
paul@7	222	# Block extraction.
paul@7	223
# One or more blank (whitespace-only) lines separate basic blocks.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines. Blocks consisting only of whitespace are
    omitted from the result.
    """

    return [block for block in block_regexp.split(s) if block.strip()]
paul@7	235
paul@7	236	# Block inspection.
paul@7	237
def get_blocks(s):

    """
    Return blocks from the given string 's' as a list of (type, text) tuples,
    inspecting the block elements and splitting the remaining text into basic
    blocks having a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

    return blocks
paul@7	261
paul@14	262	# List item inspection.
paul@14	263
# A list item line is a marker (optionally space-indented, made of "-", "#"
# and "*" characters), some whitespace, and then the item text.

listitem_regexp_str = r"^(?P<marker> *[-#*]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]
paul@14	277
paul@36	278	# Content inspection.
paul@14	279
monospace_regexp_str = r"{{(?P<monotext>.*?)}}"

# An opening bracket preceded by a backslash is an escaped bracket, not a link.
# The brackets are written as "\[" and "\]" since "[[]" is reported as a
# possible nested set by newer versions of the re module.

link_regexp_str = r"(?<!\\)\[(?P<linktext>.*?)\]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?)(?::(?P<options>.*?))?}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# The order of the alternatives matters: earlier patterns take precedence where
# several could match at the same position.

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + macro_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Within tables, cell separators are only recognised outside other content so
# that, for example, the "|" inside a link does not split a cell.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text',
    where the cellsep is the first separator found in a row ("|" or "||") and
    the columns are the strings found between the separators of that row.
    """

    rows = []

    # Rows are terminated by a separator followed by a newline.

    for row in text.split("|\n"):
        if not row:
            break

        # Restore the separator removed by the split operation.

        row += "|"
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            # A separator starts a new column; any other content is kept
            # intact as part of the current column.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        # Discard the text before the first separator and after the last.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows
paul@14	361
paul@70	362	# Notation conversion.
paul@70	363
# Pairs of (Confluence notation, Moin notation) applied in order to plain text.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    (r"\[", "<<Verbatim([)>>"),
    (r"\]", "<<Verbatim(])>>"),
    (r"\*", "*"),
    ]

# The equivalent pairs for text inside preformatted sections, where Moin
# macros such as <<BR>> would not be interpreted.

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

# Confluence list marker characters and their Moin equivalents.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Confluence cell separators and the text joining cells in Moin table regions.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Markup placed around cell text: header cells ("||") are emphasised.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Confluence section types and the Moin region header used for each.

sectiontypes = {
    "code" : "",
    "excerpt" : "#!wiki",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

# Section types whose content is treated as preformatted text. None is the
# type recorded for anonymous sections such as monospaced text.

preformatted_sectiontypes = (None, "noformat")

# Names given to the otherwise anonymous options of macros.

macroargs = {
    "color" : "col",
    }

# Confluence macros and templates for the equivalent Moin macros.

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    "toc" : "<<TableOfContents>>",
    }
paul@15	423
class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):

        # The current and the greatest section nesting levels, used to choose
        # the number of braces delimiting nested Moin regions.

        self.max_level = self.level = 0

        # Whether a heading is being translated, and the anchors found within
        # it that must be emitted before the heading instead.

        self.in_heading = False
        self.held_anchors = []

        # The name of any macro whose content is being translated.

        self.macro = None

        # The types of the sections currently entered, outermost first.

        self.sections = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # The length of the marker indicates the depth of the list item and the
        # last character indicates the kind of list involved.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        if match.group("monotext"):

            # Monospaced text is an anonymous section in its own right.

            self.enter_section()
            self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):

            # Links have the form target, label|target or label|target|title.
            # Any further separators are left within the title.

            parts = match.group("linktext").split("|", 2)

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them.

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    rest = get_page_title(rest)
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

            # Make the link tidier by making a label if none was given.

            if not label:
                label = target

            target = get_page_title(target)

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        elif match.group("macro"):
            macro_name = match.group("macro")

            if macro_name in macrotypes:
                argname = macroargs.get(macro_name)
                if argname:
                    args = "%s=" % argname
                else:
                    args = ""
                args += match.group("options") or ""

                # Macros requiring content cannot be produced from a lone
                # marker, so such markers are discarded like unknown macros.

                try:
                    result = macrotypes[macro_name] % {
                        "args" : quote_macro_argument(args)
                        }
                except KeyError:
                    return ""

                if not self.forbids_macros():
                    return result

                # Anchors are held for emission outside the current context.

                if macro_name == "anchor":
                    self.held_anchors.append(result)

            # Unknown or forbidden macros produce no output here.

            return ""

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. If
        'preformatted' is set to a true value, the notation appropriate to
        preformatted sections is produced.
        """

        if preformatted:
            mapping = preformatted_notation_mapping
        else:
            mapping = notation_mapping

        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text'. The translation is modified
        according to the sections currently entered: notation is converted as
        for preformatted text where appropriate, and inline markup is left
        untouched directly within "code" and "noformat" sections.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        "Return whether any of the entered sections is a preformatted section."

        return any([x in preformatted_sectiontypes for x in self.sections])

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            text = self.parse_text(blocktext)

            # Anchors found within the block text are placed before it.

            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section("table")

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):

                # Rows are separated from each other within the table region.

                if not first:
                    table_parts.append("==")
                else:
                    first = False

                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of strings forming the translation.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections, but "noformat" content is only
        # subject to notation conversion and retains its surrounding spaces.

        if sectiontype == "noformat":
            section_content = self.translate_content(text.strip("\n"))
        else:
            section_content = self.parse_text(text.strip())

        # Nest the section appropriately. This is done after translating the
        # content so that the nesting of any inner sections is known.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        "Record the entry into a section of the given 'sectiontype'."

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        "Record the departure from the most recently entered section."

        self.level -= 1

        # Outside all sections, the nesting depth record is reset.

        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return an (opening, closing) tuple of brace sequences for the current
        section, using more braces the more deeply other sections have been
        nested inside it.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        """
        Parse the content in the string 's', returning the translation. If
        'top' is set to a true value, the text is always processed as a
        collection of blocks.
        """

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for region_type, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if region_type is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = region_type

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif sectiontype in macrotypes:

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype
                        argname = macroargs.get(sectiontype)
                        if argname:
                            args = "%s=" % argname
                        else:
                            args = ""

                        # Sections need not provide any options.

                        args += options or ""

                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : quote_macro_argument(args)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        """
        Return a true value if macros cannot be produced at present: within
        headings and within the content of other macros.
        """

        return self.in_heading or self.macro
paul@71	806
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out', which
    must provide a write method accepting the translated text.
    """

    parser = ConfluenceParser()
    out.write(parser.parse_text(s, top=True))
paul@11	813
if __name__ == "__main__":

    # Translate UTF-8 encoded text from standard input to standard output.
    # Where the standard streams are text streams wrapping byte streams, the
    # underlying byte streams are used so that the codecs see encoded data.

    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)

    s = codecs.getreader("utf-8")(stdin).read()
    out = codecs.getwriter("utf-8")(stdout)
    parse(s, out)
paul@6	818
paul@6	819	# vim: tabstop=4 expandtab shiftwidth=4