ConfluenceConverter (annotate wikiparser.py in 3f0fbef87393)

ConfluenceConverter

Annotated wikiparser.py

86:3f0fbef87393

2013-06-10

Paul Boddie

Supported table recognition in region extraction in order to handle sections within tables, where the appearance of sections would break up tables around those sections.

paul@6	1	#!/usr/bin/env python
paul@6	2
paul@7	3	"""
paul@7	4	Confluence Wiki syntax parsing.
paul@7	5
paul@34	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22
paul@8	23	--------
paul@8	24
paul@8	25	The basic procedure is as follows:
paul@8	26
paul@7	27	1. Wiki pages are first split up into regions.
paul@7	28	2. Then, within these regions, the text is split into blocks.
paul@7	29	1. First, lists are identified.
paul@7	30	2. Additionally, other block-like elements are identified.
paul@78	31	3. Each block is then split into regions.
paul@7	32	"""
paul@7	33
paul@35	34	from common import *
paul@6	35	import re
paul@25	36	import sys
paul@41	37	import codecs
paul@77	38	import operator
paul@19	39
paul@6	40	# Section extraction.
paul@6	41
# Markers that delimit regions. A match is one of:
#
#  * a "{type}" or "{type:options}" marker not preceded by another "{"
#    (groups 'type' and 'options', the latter including its leading colon);
#  * a run of one or two "|" at the start of a line (group 'rowstart');
#  * a run of one or two "|" followed by a newline or the end of the text
#    (group 'rowend').
#
# NOTE(review): the alternation bars were escaped as "\|" in the listing this
# was recovered from, which would have made them literal pipes; they are
# restored to plain "|" here.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}|^(?P<rowstart>[|]{1,2})|(?P<rowend>[|]{1,2})(\n|$)"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)
paul@6	44
def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text), as produced by get_section_details for each
    non-empty piece of text collected here.

    The text is scanned for the markers described by 'sections_regexp'. A
    nesting depth is maintained so that markers appearing inside an active
    region are kept as part of that region's text. Consecutive table rows are
    merged into a single region.
    """

    last = 0
    regions = [""]
    depth = 0
    had_row = False

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")

        # Only section markers and row markers affect the nesting depth.

        changes_depth = is_section or is_row

        # The start of a region is either indicated by a marker with options or
        # by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match to
            # the current "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # A new row may either continue a table region or start a new
                # table region. A row continues the previous table only if the
                # previous match was a row marker ending exactly where this
                # one starts.

                elif is_row:
                    if last != start or not had_row:
                        regions.append(s[start:end])
                    else:
                        # Fold the (empty) trailing region and this marker
                        # back into the table region preceding it.
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if changes_depth:
                depth += 1

        # The end of a region is indicated by a marker with no options. This
        # branch is only reached with a region active (depth is non-zero).

        else:
            # The end marker and preceding text always belong to the current
            # region.

            regions[-1] += s[last:end]

            # Terminate the region when its outermost level is being closed,
            # starting a fresh "null" region for any following text.

            if depth == 1 and changes_depth:
                regions.append("")

            if changes_depth:
                depth -= 1

        had_row = is_row
        last = end

    # Where a region is still active, terminate it.

    regions[-1] += s[last:]

    return [get_section_details(region) for region in regions if region]
paul@75	141
def is_section_marker(sectiontype):

    """
    Return whether 'sectiontype' names a marker that opens or closes a region:
    either a key of 'sectiontypes' or the "color" macro.
    """

    # The "in" operator is used instead of dict.has_key, which no longer
    # exists in Python 3.

    return sectiontype in sectiontypes or sectiontype == "color"
paul@6	144
paul@7	145	# Section inspection.
paul@7	146
# A complete section: "{type}" or "{type:options}", the section body, and a
# closing "{type}" naming the same type.
#
# NOTE(review): the "*" quantifiers after the 'sectiontype' and 'options'
# patterns were missing from the listing this was recovered from (leaving
# "[^\n:]?" and ".?"), which would only admit one-character names. They are
# restored here; confirm against the repository.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    Where 's' starts with a complete section, the type is a tuple of the form
    (sectiontype, options), with options being None if absent, and the text is
    the section body without its markers. Otherwise, the type is None and the
    text is 's' itself.
    """

    match = section_regexp.match(s)
    if match is None:
        return None, s

    return (match.group("sectiontype"), match.group("options")), match.group("section")
paul@6	159
paul@14	160	# Heading, table and list extraction.
paul@7	161
# NOTE(review): the listing this was recovered from had lost "*" characters in
# pairs and escaped every "|" as "\|". The list pattern below restores the
# quantifiers and the "*" bullet in its character classes; confirm against the
# repository.

# A list: a first item introduced by "*", "#" or "-" markers, followed by any
# further lines whose markers start with the same character.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"

# A table: one or more rows, each made of cells delimited by the same "|" or
# "||" separator, where cell text may span lines but not blank lines.

table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"

# A heading ("h1." and so on) or blockquote ("bq.") occupying a single line.

blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.

    Return a list of (type, text) tuples covering all of 's' in order. The
    text between recognised elements is given with a type of None, such an
    entry always preceding each element and following the last one. Lists have
    the type "list" and tables the type "table", both with their complete
    matched text. Headings and blockquotes have the type given by their marker
    (such as "h1" or "bq") with only the text following the marker.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))

        # Only headings and blockquotes define the 'text' group; other
        # elements, and those with empty text, retain everything matched.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks
paul@7	191
paul@7	192	# Block extraction.
paul@7	193
# One or more consecutive lines containing nothing but whitespace.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split the given string 's' wherever blank lines occur, returning a list of
    the pieces found between them. The blank lines themselves are discarded,
    as are any pieces consisting only of whitespace.
    """

    blocks = []

    for piece in block_regexp.split(s):
        if piece.strip():
            blocks.append(piece)

    return blocks
paul@7	205
paul@7	206	# Block inspection.
paul@7	207
def get_blocks(s):

    """
    Return a list of (type, text) blocks for the given string 's'.

    Headings, lists and tables found by get_block_elements are kept as they
    are. The untyped text around them is divided further by get_basic_blocks,
    each resulting piece being given with a type of None.
    """

    found = []

    for kind, content in get_block_elements(s):

        # Untyped text may still hold several paragraphs.

        if kind is None:
            found.extend([(None, piece) for piece in get_basic_blocks(content)])

        # Headings, lists and tables are collected unchanged.

        else:
            found.append((kind, content))

    return found
paul@7	231
paul@14	232	# List item inspection.
paul@14	233
# A single list item: optional leading spaces and a run of "-", "*" or "#"
# markers, then whitespace and the item text up to the end of the line.
#
# NOTE(review): the listing this was recovered from showed the marker pattern
# as " [-#]+", which would demand exactly one leading space and so reject
# items starting in the first column. The form below assumes two "*"
# characters were lost from that line; confirm against the repository.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text', one for
    each line recognised as a list item. The marker includes any leading
    spaces found before the marker characters.
    """

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]
paul@14	247
paul@36	248	# Content inspection.
paul@14	249
# NOTE(review): the listing this was recovered from had escaped every "|" as
# "\|" and had lost "*" characters from the macro and bold patterns. Both are
# restored below; confirm against the repository.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"

# The opening bracket is written as "\[" instead of "[[]" since the latter
# provokes a "possible nested set" FutureWarning in Python 3.7 and later.

link_regexp_str = r"\[(?P<linktext>.*?)\]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters. The bracketed forms, such as "{_}", are accepted anywhere.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# All inline content patterns, each in its own group, tried in this order.

content_regexp_str = "|".join([
    "(" + pattern + ")" for pattern in (
        monospace_regexp_str,
        link_regexp_str,
        image_regexp_str,
        macro_regexp_str,
        italic_regexp_str,
        bold_regexp_str,
        del_regexp_str,
        underline_regexp_str,
        sub_regexp_str,
        )
    ])

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content is tested before cell separators so that a "|" inside a
# link or image is consumed as part of that content.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)
paul@16	297
def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Each row yields the first cell separator found in it ("|" or "||") and a
    list of the cell texts found between its separators. Separators appearing
    within links, images and other inline content are treated as part of that
    content. Rows in which no separator is found are omitted.
    """

    rows = []

    # Rows end with a separator followed by a newline. Splitting on that pair
    # removes one "|" from the end of each row, so it is put back below.

    for row in text.split("|\n"):

        # NOTE(review): an empty piece ends processing altogether, not just
        # the piece in question; this is expected after the final row.

        if not row:
            break

        row += "|"
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            # A separator starts a new column, the first separator seen
            # determining the kind of row.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Inline content is kept verbatim within the current column.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        # Discard the text before the first separator and after the last.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows
paul@14	331
paul@70	332	# Notation conversion.
paul@70	333
# Replacements applied to plain text, in order: escaped characters lose their
# backslash, and forced line breaks (a double backslash followed by a newline
# or space) become Moin line break macros.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

# The same replacements for preformatted text, where forced line breaks
# become actual newlines.

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

# Moin list markers for the final character of a Confluence list marker.
#
# NOTE(review): the first entry appeared as an empty key and value in the
# listing this was recovered from; it is assumed that a pair of "*"
# characters was lost. Confirm against the repository.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Text used to join the translated cells of a row, keyed by the Confluence
# cell separator.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Markup placed around each cell: heading cells ("||") are emboldened.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Recognised section types and the Moin parser line opening each of them.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

# Section types whose text is translated using the preformatted notation
# mapping. None is the type recorded for sections entered without a type.

preformatted_sectiontypes = (None, "noformat")

# Argument names prefixed to the options of particular macros.

macroargs = {
    "color" : "col",
    }

# Moin macro templates for the supported Confluence macros.

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    }
paul@15	388
class ConfluenceParser:

    "A parser for Confluence markup, producing Moin markup."

    def __init__(self):

        "Initialise the parser with no active sections, headings or macros."

        # The current section nesting level and the deepest level reached
        # within the outermost active section.

        self.max_level = self.level = 0

        # Whether a heading is being translated.

        self.in_heading = False

        # Anchors withheld from a heading, emitted before it instead.

        self.held_anchors = []

        # The name of any section-like macro being translated.

        self.macro = None

        # The types of the active sections, outermost first.

        self.sections = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # Moin lists are indented according to their nesting, so each marker
        # character contributes a space of indentation.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def format_macro_args(self, macro_name, options):

        """
        Return the quoted argument text for the macro with the given
        'macro_name' and 'options', prefixing any argument name registered for
        the macro in 'macroargs'. Absent options are treated as empty.
        """

        argname = macroargs.get(macro_name)

        if argname:
            prefix = "%s=" % argname
        else:
            prefix = ""

        return quote_macro_argument(prefix + (options or ""))

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        if match.group("monotext"):

            # Entering and leaving a section registers an extra level of
            # nesting, making enclosing sections use longer brace sequences
            # than the triple braces used here.

            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):

            # Links have the form "target", "label|target" or
            # "label|target|title". At most two splits are performed so that
            # any further separators remain in the title instead of causing
            # an unpacking error.

            parts = match.group("linktext").split("|", 2)

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them.

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    rest = get_page_title(rest)
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

                # Make the link tidier by using the original target as the
                # label if none was given.

                if not label:
                    label = target

                target = get_page_title(target)

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        elif match.group("macro"):
            macro_name = match.group("macro")

            if macro_name in macrotypes:
                result = macrotypes[macro_name] % {
                    "args" : self.format_macro_args(macro_name, match.group("options"))
                    }

                if not self.forbids_macros():
                    return result

                # Anchors that cannot appear here are held for later output.

                if macro_name == "anchor":
                    self.held_anchors.append(result)

            # Unsupported and forbidden macros produce no output.

            return ""

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        # Matches with empty content are treated as plain text.

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. If
        'preformatted' is set to a true value, the notation mapping for
        preformatted text is used.
        """

        if preformatted:
            mapping = preformatted_notation_mapping
        else:
            mapping = notation_mapping

        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text', translating inline content
        and converting the notation of the plain text around it. Within "code"
        and "noformat" sections, inline content is reproduced unchanged.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        "Return whether any active section has a preformatted section type."

        return any([x in preformatted_sectiontypes for x in self.sections])

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            text = self.parse_text(blocktext)

            # Emit any anchors withheld from the block text before the block.

            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):

                # Rows after the first are introduced by a row separator.

                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of strings making up the translation.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections. The content is translated
        # first so that the nesting it requires is known when the delimiters
        # for this section are chosen.

        section_content = self.parse_text(text.strip())

        # Nest the section appropriately.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        "Record the start of a section having the given 'sectiontype'."

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        "Record the end of the most recently entered section."

        self.level -= 1

        # Forget the nesting depth once the outermost section is left.

        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return a tuple (opening, closing) of brace sequences for the current
        section. Three braces are used for the most deeply nested sections,
        with one more for each level of nesting found within a section.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        """
        Parse the content in the string 's', returning the translation. If
        'top' is set to a true value, 's' is always divided into blocks, even
        where it consists of a single plain region.
        """

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for regiontype, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if regiontype is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = regiontype

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif sectiontype in macrotypes:

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype

                        # Options may be absent, as with "{color}", and are
                        # then treated as empty by format_macro_args.

                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : self.format_macro_args(sectiontype, options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        """
        Return a true value if macros must not be produced at this point:
        within headings and within other macros.
        """

        return self.in_heading or self.macro
paul@71	770
def parse(s, out):

    """
    Translate the Confluence markup in the string 's' using a new parser,
    writing the resulting translation to 'out'.
    """

    translation = ConfluenceParser().parse_text(s, top=True)
    out.write(translation)
paul@11	777
if __name__ == "__main__":

    # Translate UTF-8 encoded markup read from standard input, writing the
    # UTF-8 encoded translation to standard output.

    markup = codecs.getreader("utf-8")(sys.stdin).read()
    writer = codecs.getwriter("utf-8")(sys.stdout)
    parse(markup, writer)
paul@6	782
paul@6	783	# vim: tabstop=4 expandtab shiftwidth=4