MoinLight (annotate moinformat.py in 23b993064a50)

MoinLight

Annotated moinformat.py

3:23b993064a50

2017-04-27

Paul Boddie

Added document node prettyprinting.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Moin wiki format parser.
paul@0	5
paul@0	6	Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU General Public License as published by the Free Software
paul@0	10	Foundation; either version 3 of the License, or (at your option) any later
paul@0	11	version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@0	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0	15	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
paul@0	16	details.
paul@0	17
paul@0	18	You should have received a copy of the GNU General Public License along with
paul@0	19	this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	20	"""
paul@0	21
paul@0	22	from cgi import escape
paul@0	23	import re
paul@0	24
paul@0	25	# Regular expressions.
paul@0	26
# Each entry maps a pattern name to a (regular expression, flags) pair. The
# "marker" and "header" expressions expose their significant text as group 1;
# the "break" expression has no group.

syntax = {
    # Page regions:
    "marker" : (r"^\s*([{]{3,}|[}]{3,})", re.MULTILINE | re.DOTALL),   # {{{... or }}}...

    # Region contents:
    "header" : (r"#!(.*?)\n", 0),                                      # #! char-excl-nl
    "break"  : (r"^\s*?\n", re.MULTILINE),                             # blank line
    }

# Define patterns for the regular expressions.

# Compile every expression once, always adding re.UNICODE to its own flags.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)
paul@0	41
paul@0	42
paul@0	43
paul@0	44	# Document nodes.
paul@0	45
class Container:

    "A container of document nodes."

    def __init__(self, nodes):

        "Initialise the container with the given list of 'nodes'."

        self.nodes = nodes

    def append(self, node):

        "Add 'node' to the end of the contained nodes."

        self.nodes.append(node)

    def normalise(self):

        "Combine adjacent text nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if text is None:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text is not None:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text is not None:
            self.append(text)

    def __str__(self):

        "Return the prettyprinted form of this node."

        return self.prettyprint()

    def prettyprint(self, indent=""):

        """
        Return a multi-line description of this node and its children, each
        level being prefixed with 'indent' plus one space per nesting level.

        This generic form is used by any container not providing its own
        description, so that converting a container to a string always
        yields a string.
        """

        l = ["%s%s" % (indent, self.__class__.__name__)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + " "))
        return "\n".join(l)
paul@3	92
class Region(Container):

    "A region of the page."

    # Region types whose content is parsed as wiki markup.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):

        """
        Initialise the region with the given 'nodes', the marker 'level' and
        the region 'type'.
        """

        Container.__init__(self, nodes)
        self.level = level
        self.type = type

    def have_start(self, s):

        "Return whether 's' opens a new region inside this region."

        return self.is_transparent() and s.startswith("{")

    def have_end(self, s):

        """
        Return a true value if 's' is a closing marker whose length equals
        the level of this region.
        """

        return self.level and s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        """
        Return whether this region has no level or has one of the
        transparent region types.
        """

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def prettyprint(self, indent=""):

        "Return a description of this region and its nodes, using 'indent'."

        heading = "%sRegion: level=%d type=%s" % (indent, self.level, self.type)
        children = [node.prettyprint(indent + " ") for node in self.nodes]
        return "\n".join([heading] + children)

    def to_string(self, out):

        "Serialise this region and its nodes using the serialiser 'out'."

        out.start_region(self.level, self.type)
        for child in self.nodes:
            child.to_string(out)
        out.end_region(self.level, self.type)
paul@0	127
class Block(Container):

    "A block in the page."

    def __init__(self, nodes, final=True):

        """
        Initialise the block with the given 'nodes'. The 'final' flag is
        passed to the serialiser when the block is started and ended.
        """

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):

        "Return a description of this block and its nodes, using 'indent'."

        heading = "%sBlock: final=%s" % (indent, self.final)
        children = [node.prettyprint(indent + " ") for node in self.nodes]
        return "\n".join([heading] + children)

    def to_string(self, out):

        "Serialise this block and its nodes using the serialiser 'out'."

        out.start_block(self.final)
        for child in self.nodes:
            child.to_string(out)
        out.end_block(self.final)
paul@0	150
class Text:

    "A text node."

    def __init__(self, s):

        "Initialise the node with the string 's'."

        self.s = s

    def merge(self, text):

        "Append the string held by the 'text' node to this node's string."

        self.s = self.s + text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def prettyprint(self, indent=""):

        "Return a one-line description of this node prefixed by 'indent'."

        return "%sText: %r" % (indent, self.s)

    def to_string(self, out):

        "Pass this node's string to the serialiser 'out'."

        out.text(self.s)
paul@0	169
paul@0	170
paul@0	171
paul@0	172	# Serialisation.
paul@0	173
class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        """
        Initialise the serialiser with 'out', a callable which is given each
        fragment of output by the serialiser methods.
        """

        self.out = out
paul@0	180
class MoinSerialiser(Serialiser):

    "Serialisation of the page."

    def start_region(self, level, type):

        """
        Emit the opening marker for a region of the given 'level', followed
        by a header line for any 'type'. Nothing is emitted without a level.
        """

        if not level:
            return

        self.out("{" * level)           # marker
        if type:
            self.out("#!%s\n" % type)   # header

    def end_region(self, level, type):

        "Emit the closing marker for a region of the given 'level'."

        if level:
            self.out("}" * level)       # marker

    def start_block(self, final):

        "Emit nothing at the start of a block."

        pass

    def end_block(self, final):

        "Emit a newline after any block that is not 'final'."

        if final:
            return
        self.out("\n")

    def text(self, s):

        "Emit the string 's' unchanged."

        self.out(s)
paul@0	206
class HTMLSerialiser(Serialiser):

    "Serialisation of the page."

    def start_region(self, level, type):

        """
        Emit an opening span element whose class attribute records the
        'level' and 'type' of the region, where given.
        """

        l = []
        out = l.append
        if level:
            out("level-%d" % level) # marker

        # NOTE: Encode type details for CSS.

        if type:

            # The attribute below is delimited by apostrophes. Any apostrophe
            # left unescaped by the escape function would terminate the
            # attribute early, so encode it explicitly. Where the function
            # has already encoded it, the replacement has no effect.

            encoded = escape(type, True).replace("'", "&#x27;")
            out("type-%s" % encoded) # header

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):

        "Emit the closing span tag for a region."

        self.out("</span>")

    def start_block(self, final):

        "Emit the opening paragraph tag for a block."

        self.out("<p>")

    def end_block(self, final):

        "Emit the closing paragraph tag for a block."

        self.out("</p>")

    def text(self, s):

        "Emit the string 's' with markup characters escaped."

        self.out(escape(s))
paul@0	235
paul@0	236
paul@0	237
paul@2	238	# Tokenising functions.
paul@2	239
class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream to read from the start of the string 's'."

        self.s = s
        self.pos = 0

        # The most recent match object and the name of the pattern producing
        # it, as set by read_until.

        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None
        self.matching = None

        # Discard any match found by an earlier search but never read.
        # Otherwise, an unsuccessful search would leave that match in place
        # for read_match to consume as if it belonged to this search.

        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is not None:
            return self.s[self.pos:first]
        if remaining:
            return self.s[self.pos:]
        return None

    def read_match(self):

        """
        Return the matched text, updating the position in the stream. Where
        no match is available, move to the end of the stream and return None.
        """

        match = self.match

        if match is None:
            self.pos = len(self.s)
            return None

        self.pos = match.end()
        self.match = None

        # Patterns defining a group provide their text via that group. Other
        # patterns provide the entire matched text, since requesting a group
        # from them would raise IndexError.

        if match.re.groups:
            return match.group(1)
        return match.group(0)
paul@2	292
paul@2	293
paul@2	294
paul@0	295	# Parser functions.
paul@0	296
def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the region representing the whole page.
    """

    # Define a region for the page and populate it from the token stream.

    page = Region([])
    parse_region(TokenStream(s), page)
    return page
paul@0	310
def parse_region(items, region):

    "Parse the data provided by 'items' to populate 'region'."

    # Parse section headers, these potentially setting the region type.

    parse_region_header(items, region)

    # Choose the parser according to whether the region is transparent.

    handler = parse_region_wiki if region.is_transparent() else parse_region_opaque
    handler(items, region)
paul@2	323
def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    # Process exposed text and sections.

    block = Block([])
    region.append(block)

    while True:

        # Obtain text before any marker or the end of the input.

        match_text = items.read_until(["break", "marker"])
        if match_text:
            block.append(Text(match_text))

        # Obtain any feature.

        feature = items.read_match()

        # End of input.

        if not items.matching:
            break

        # Start a section if an appropriate marker is given.

        if region.have_start(feature):

            # Define the section and parse it.

            _region = Region([], len(feature))
            region.append(_region)
            parse_region(items, _region)

            # Start a new block after the section.

            block = Block([])
            region.append(block)

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        elif region.have_end(feature):
            break

        # Start a new block if a paragraph break is found.

        elif items.matching == "break":
            block.final = False
            block = Block([])
            region.append(block)

        # Add any inappropriate marker to the text.

        else:
            block.append(Text(feature))

    # Text is held by the blocks, not directly by the region, so each block
    # must be normalised for adjacent text nodes to be combined.

    for node in region.nodes:
        if isinstance(node, Block):
            node.normalise()

    region.normalise()
paul@0	385
def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    while True:

        # Keep any text found before the next marker or the end of input.

        preceding = items.read_until(["marker"])
        if preceding:
            region.append(Text(preceding))

        # Stop at the end of the input or at the end marker corresponding to
        # this region.

        marker = items.read_match()
        if not marker or region.have_end(marker):
            break

        # Any other marker is merely part of the region's text.

        region.append(Text(marker))

    region.normalise()
paul@2	422
def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # A header only applies if it occurs at the current position, this being
    # indicated by an empty string preceding it. None means no header.

    preceding = items.read_until(["header"], False)

    if preceding == "":
        region.type = items.read_match()
paul@0	431
paul@1	432
paul@1	433
paul@1	434	# Top-level functions.
paul@0	435
parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    """
    Return a string produced by serialising 'doc' using an instance of the
    given 'serialiser' class.
    """

    fragments = []
    doc.to_string(serialiser(fragments.append))
    return "".join(fragments)
paul@0	442
paul@0	443	# vim: tabstop=4 expandtab shiftwidth=4