MoinLight (annotate moinformat.py in 7336ce637f89)

MoinLight

Annotated moinformat.py

6:7336ce637f89

2017-04-27

Paul Boddie

Tidied and consolidated somewhat.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Moin wiki format parser.
paul@0	5
paul@0	6	Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU General Public License as published by the Free Software
paul@0	10	Foundation; either version 3 of the License, or (at your option) any later
paul@0	11	version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@0	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0	15	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
paul@0	16	details.
paul@0	17
paul@0	18	You should have received a copy of the GNU General Public License along with
paul@0	19	this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	20	"""
paul@0	21
paul@0	22	from cgi import escape
paul@0	23	import re
paul@0	24
paul@0	25	# Regular expressions.
paul@0	26
# Each entry maps a pattern name to a (regular expression, extra flags) pair.
# Every expression exposes the interesting part of the match as group 1.

syntax = {
    # Page regions:
    "regionstart"   : (r"^\s*([{]{3,})", re.MULTILINE | re.DOTALL),   # {{{...
    "regionend"     : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL),   # }}}...
    "header"        : (r"#!(.*?)\n", 0),                              # #! char-excl-nl

    # Region contents:
    "break"         : (r"^(\s*?)\n", re.MULTILINE),                   # blank line
    }

# Define patterns for the regular expressions. All patterns are compiled with
# re.UNICODE in addition to their own flags. A generator expression is used so
# that no loop variables are left behind in the module namespace.

patterns = dict(
    (pattern_name, re.compile(expression, re.UNICODE | extra_flags))
    for pattern_name, (expression, extra_flags) in syntax.items()
    )
paul@0	42
paul@0	43
paul@0	44
paul@0	45	# Document nodes.
paul@0	46
class Container:

    "A container of document nodes."

    def __init__(self, nodes):

        "Initialise the container with the given list of 'nodes'."

        self.nodes = nodes

    def append(self, node):

        "Add 'node' to the end of the contained nodes."

        self.nodes.append(node)

    def normalise(self):

        "Combine adjacent text nodes."

        original = self.nodes
        self.nodes = []

        # The text node currently collecting adjacent text, if any.

        pending = None

        for node in original:

            # Text either opens a new run or is merged into the open one.

            if isinstance(node, Text):
                if pending is None:
                    pending = node
                else:
                    pending.merge(node)
                continue

            # Any other node ends the run: emit the collected text first.

            if pending is not None:
                self.append(pending)
                pending = None

            self.append(node)

        # Emit a run that reached the end of the node list.

        if pending is not None:
            self.append(pending)

    def __str__(self):

        "Return the result of prettyprinting this node."

        return self.prettyprint()

    def prettyprint(self, indent=""):

        "Placeholder producing nothing; subclasses provide the actual output."

        return None
paul@3	93
class Region(Container):

    "A region of the page."

    # Region types whose content is parsed as wiki markup.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):

        """
        Initialise the region with the given 'nodes', the 'level' (the number of
        brace characters in its markers, zero meaning no markers) and the
        region 'type' taken from any header.
        """

        Container.__init__(self, nodes)
        self.level = level
        self.type = type

    def have_end(self, s):

        "Return a true value if 's' is the end marker matching this region."

        # A region without a level has no markers and thus no end marker.

        if not self.level:
            return self.level

        return s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        "Return whether the region content is to be parsed as wiki markup."

        if not self.level:
            return True

        return self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def prettyprint(self, indent=""):

        "Return a description of the region and its nodes using 'indent'."

        lines = ["%sRegion: level=%d type=%s" % (indent, self.level, self.type)]
        lines.extend([node.prettyprint(indent + " ") for node in self.nodes])
        return "\n".join(lines)

    def to_string(self, out):

        "Serialise the region and its nodes using the serialiser 'out'."

        level, region_type = self.level, self.type

        out.start_region(level, region_type)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(level, region_type)
paul@0	125
class Block(Container):

    "A block in the page."

    def __init__(self, nodes, final=True):

        """
        Initialise the block with the given 'nodes'. The 'final' flag is cleared
        by the parser when the block is ended by a paragraph break.
        """

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):

        "Return a description of the block and its nodes using 'indent'."

        heading = "%sBlock: final=%s" % (indent, self.final)
        children = [node.prettyprint(indent + " ") for node in self.nodes]
        return "\n".join([heading] + children)

    def to_string(self, out):

        "Serialise the block and its nodes using the serialiser 'out'."

        final = self.final

        out.start_block(final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(final)
paul@0	148
class Text:

    "A text node."

    def __init__(self, s):

        "Initialise the node with the string 's'."

        self.s = s

    def merge(self, text):

        "Append the content of the 'text' node to this node."

        self.s = self.s + text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def prettyprint(self, indent=""):

        "Return a description of the text prefixed by 'indent'."

        return "%sText: %r" % (indent, self.s)

    def to_string(self, out):

        "Pass the text to the serialiser 'out'."

        out.text(self.s)
paul@0	167
paul@0	168
paul@0	169
paul@0	170	# Serialisation.
paul@0	171
class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        "Initialise the serialiser with 'out', a callable accepting output text."

        self.out = out
paul@0	178
class MoinSerialiser(Serialiser):

    "Serialisation of the page."

    def start_region(self, level, type):

        "Emit the opening marker and any header for a region."

        # Regions without a level have neither markers nor a header.

        if not level:
            return

        self.out("{" * level)               # marker

        if type:
            self.out("#!%s\n" % type)       # header

    def end_region(self, level, type):

        "Emit the closing marker for a region having a 'level'."

        if level:
            self.out("}" * level)           # marker

    def start_block(self, final):

        "Emit nothing at the start of a block."

        pass

    def end_block(self, final):

        "Emit a newline after any block that is not 'final'."

        if final:
            return

        self.out("\n")

    def text(self, s):

        "Emit the text 's' unchanged."

        self.out(s)
paul@0	204
class HTMLSerialiser(Serialiser):

    "Serialisation of the page."

    def start_region(self, level, type):

        """
        Emit an opening span element whose class attribute records the region
        'level' and 'type' where these are given.
        """

        classes = []

        if level:
            classes.append("level-%d" % level)      # marker

        # NOTE: Encode type details for CSS.

        if type:

            # The attribute below is delimited by single quotes, but the
            # escape function with its quote flag set is only relied upon to
            # handle double quotes, so single quotes are encoded explicitly to
            # prevent the type from terminating the attribute value.

            encoded = escape(type, True).replace("'", "&#x27;")
            classes.append("type-%s" % encoded)     # header

        self.out("<span class='%s'>" % " ".join(classes))

    def end_region(self, level, type):

        "Emit the closing span tag for a region."

        self.out("</span>")

    def start_block(self, final):

        "Emit the opening paragraph tag for a block."

        self.out("<p>")

    def end_block(self, final):

        "Emit the closing paragraph tag for a block."

        self.out("</p>")

    def text(self, s):

        "Emit the text 's' with markup characters escaped."

        self.out(escape(s))
paul@0	233
paul@0	234
paul@0	235
paul@2	236	# Tokenising functions.
paul@2	237
class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream to read from the start of the string 's'."

        self.s = s
        self.pos = 0

        # The match object and pattern name found by the last read_until call.

        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The position in the stream is not changed. The name of the matching
        pattern is stored in the 'matching' attribute and the match itself in
        the 'match' attribute, both being None if nothing matched.
        """

        first = None

        # Discard the details of any previous search. Without resetting the
        # match, a failed search would leave the previous match in place,
        # causing read_match to return it again and to move the position back
        # to the end of that match instead of to the end of the input.

        self.matching = None
        self.match = None

        # Find the first matching pattern. Where patterns match at the same
        # position, the pattern appearing earliest in 'pattern_names' wins.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match is None:
                continue

            start = match.start()
            if self.matching is None or start < first:
                first = start
                self.matching = pattern_name
                self.match = match

        if self.matching is not None:
            return self.s[self.pos:first]

        if remaining:
            return self.s[self.pos:]

        return None

    def read_match(self):

        """
        Return the matched text, updating the position in the stream. If the
        last search found nothing, move to the end of the input and return
        None.
        """

        if self.match is None:
            self.pos = len(self.s)
            return None

        # Continue after the whole match but only return its first group.

        self.pos = self.match.end()
        return self.match.group(1)
paul@2	289
paul@2	290
paul@2	291
paul@0	292	# Parser functions.
paul@0	293
def parse_page(s):

    """
    Parse page text 's', returning the region representing the whole page.
    Pages consist of regions delimited by markers.
    """

    stream = TokenStream(s)
    return parse_region(stream)
paul@1	301
def parse_region(items, level=0):

    """
    Parse the data provided by 'items' to populate a region at the given
    'level', returning the region.
    """

    region = Region([], level)

    # Parse section headers.

    parse_region_header(items, region)

    # Choose the parser according to whether the content is wiki markup.

    if region.is_transparent():
        populate = parse_region_wiki
    else:
        populate = parse_region_opaque

    populate(items, region)
    return region
paul@6	321
def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    preceding = items.read_until(["header"], False)

    # Only an empty string indicates a header at the current position: None
    # means no header, and any other text means that the header found appears
    # later in the input.

    if preceding == "":
        region.type = items.read_match()
paul@6	330
def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    markers = ["break", "regionstart", "regionend"]

    # Exposed text is collected in blocks, the first being opened immediately.

    block = new_block(region)

    while True:

        # Obtain text before any marker or the end of the input.

        text = items.read_until(markers)
        if text:
            block.append(Text(text))

        # Obtain any feature, remembering what kind of feature was found
        # before any nested parsing performs further searches.

        feature = items.read_match()
        kind = items.matching

        # End of input.

        if not kind:
            break

        # A start marker introduces a nested section, after which text is
        # collected in a new block.

        if kind == "regionstart":
            region.append(parse_region(items, len(feature)))
            block = new_block(region)
            continue

        # The end marker corresponding to this section closes it.

        if kind == "regionend" and region.have_end(feature):
            break

        # A paragraph break ends the current block and starts a new one.

        if kind == "break":
            block.final = False
            block = new_block(region)

        # Any inappropriate marker is added to the text.

        else:
            block.append(Text(feature))

    region.normalise()
paul@0	384
def parse_region_opaque(items, region):

    """
    Parse the data provided by 'items' to populate an opaque 'region'. All
    content up to the end marker corresponding to the region, or up to the end
    of the input, is added to the region as text.
    """

    # Process exposed text and the section end.

    while True:

        # Obtain text before any marker or the end of the input.

        preceding = items.read_until(["regionend"])
        if preceding:
            region.append(Text(preceding))

        # Obtain any marker.

        marker = items.read_match()

        # End of input. This is detected using the search result, as in the
        # wiki region parser, and not using the value returned for the marker:
        # that value may be a true value even when the search found nothing,
        # which would otherwise keep this loop running forever on a region
        # lacking an end marker.

        if not items.matching:
            break

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        if region.have_end(marker):
            break

        # Add any inappropriate marker to the text.

        region.append(Text(marker))

    region.normalise()
paul@2	421
def new_block(region):

    "Add a new, empty block to 'region' and return the block."

    fresh = Block([])
    region.append(fresh)
    return fresh
paul@0	429
paul@1	430
paul@1	431
paul@1	432	# Top-level functions.
paul@0	433
parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    """
    Return a string representation of 'doc' produced by an instance of the
    given 'serialiser' class.
    """

    fragments = []

    # The serialiser emits its output by calling the list's append method.

    doc.to_string(serialiser(fragments.append))
    return "".join(fragments)
paul@0	440
paul@0	441	# vim: tabstop=4 expandtab shiftwidth=4