MoinLight (annotate moinformat.py in 3c29a8a62635)

MoinLight

Annotated moinformat.py

10:3c29a8a62635

2017-04-28

Paul Boddie

Capture indents used when starting regions/sections.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Moin wiki format parser.
paul@0	5
paul@0	6	Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU General Public License as published by the Free Software
paul@0	10	Foundation; either version 3 of the License, or (at your option) any later
paul@0	11	version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@0	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0	15	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
paul@0	16	details.
paul@0	17
paul@0	18	You should have received a copy of the GNU General Public License along with
paul@0	19	this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	20	"""
paul@0	21
paul@0	22	from cgi import escape
paul@0	23	import re
paul@0	24
paul@0	25	# Regular expressions.
paul@0	26
# Each entry maps a pattern name to a (regular expression, flags) pair.
# Where an expression defines groups, group 1 holds the text reported for a
# match; nested groups expose parts of it (for example, indent and marker).

syntax = {
    # Page regions:
    "regionstart" : (r"((^\s*)([{]{3,}))", re.MULTILINE | re.DOTALL),   # {{{...
    "regionend"   : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL),       # }}}...
    "header"      : (r"#!(.*?)\n", 0),                                   # #! char-excl-nl

    # Region contents:
    "break"       : (r"^(\s*?)\n", re.MULTILINE),                        # blank line
    "listitem"    : (r"^((\s+)([*]|\d+[.]))", re.MULTILINE),             # indent (list-item or number-item)

    # List contents:
    "listitemend" : (r"^", re.MULTILINE),                                # next line
    }

# Define patterns for the regular expressions: every expression is compiled
# once, with Unicode matching enabled in addition to its own flags.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)
paul@0	46
paul@0	47
paul@0	48
paul@0	49	# Document nodes.
paul@0	50
class Container:

    """
    A container of document nodes, holding its children in the 'nodes' list.
    """

    def __init__(self, nodes):
        # The given list is stored directly, not copied.
        self.nodes = nodes

    def append(self, node):
        "Add 'node' after the existing child nodes."
        self.nodes.append(node)

    # Text is added in the same way as any other node by default. Note that
    # this alias refers to the method defined above, not to any override.
    append_text = append

    def empty(self):
        "Return whether the container has no child nodes."
        return not self.nodes

    def normalise(self):

        """
        Combine adjacent text nodes. The child list is rebuilt through
        'append', so any overriding 'append' method is applied to each node.
        """

        pending = self.nodes
        self.nodes = []
        open_text = None

        for current in pending:

            # Start a run of text or fold this node into the open run.

            if isinstance(current, Text):
                if open_text:
                    open_text.merge(current)
                else:
                    open_text = current
                continue

            # A non-text node closes any open run before being added itself.

            if open_text:
                self.append(open_text)
                open_text = None
            self.append(current)

        # Flush a run of text left open at the end.

        if open_text:
            self.append(open_text)

    def __str__(self):
        return self.prettyprint()

    def prettyprint(self, indent=""):
        "Produce no description here: subclasses supply the actual output."
        pass
paul@3	102
class Region(Container):

    """
    A region of the page, recording the 'level' (number of braces in its
    markers, zero when there are none), the 'indent' preceding its opening
    marker and the 'type' given by any region header.
    """

    # Types of marked region whose content is still treated as wiki content.
    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, indent=0, type=None):
        Container.__init__(self, nodes)
        self.type = type
        self.indent = indent
        self.level = level

    def append(self, node):

        """
        Add 'node' to the region, replacing the final child node instead of
        following it where that child is empty.
        """

        nodes = self.nodes
        if nodes:
            previous = nodes[-1]
            if previous and previous.empty():
                nodes[-1] = node
                return
        nodes.append(node)

    def append_text(self, s):

        """
        Add the text node 's': inside the final child node for a transparent
        region, or directly as a child node for an opaque region.
        """

        if not self.is_transparent():
            self.append(s)
        else:
            self.nodes[-1].append(s)

    def have_end(self, s):

        """
        Return a true value if 's' is an end marker closing this region: the
        region must have a level, and 's' must start with a closing brace and
        be exactly as long as that level.
        """

        if not self.level:
            return self.level
        if not s.startswith("}"):
            return False
        return self.level == len(s)

    def is_transparent(self):

        """
        Return whether the region has no level or has one of the transparent
        region types.
        """

        if not self.level:
            return True
        return self.type in self.transparent_region_types

    def __repr__(self):
        details = (self.nodes, self.level, self.indent, self.type)
        return "Region(%r, %r, %r, %r)" % details

    def prettyprint(self, indent=""):

        """
        Return a multi-line description of the region and its child nodes,
        each line starting with 'indent' and children indented one space more.
        """

        lines = ["%sRegion: level=%d indent=%d type=%s" % (indent, self.level, self.indent, self.type)]
        child_indent = indent + " "
        lines.extend([node.prettyprint(child_indent) for node in self.nodes])
        return "\n".join(lines)

    def to_string(self, out):

        "Serialise the region and its child nodes using the serialiser 'out'."

        details = (self.level, self.indent, self.type)
        out.start_region(*details)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(*details)
paul@0	148
class Block(Container):

    """
    A block in the page. The 'final' flag is true unless the block has been
    marked as being followed by another block in a sequence.
    """

    def __init__(self, nodes, final=True):
        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):

        """
        Return a multi-line description of the block and its child nodes,
        each line starting with 'indent' and children indented one space more.
        """

        child_indent = indent + " "
        lines = ["%sBlock: final=%s" % (indent, self.final)]
        lines.extend([node.prettyprint(child_indent) for node in self.nodes])
        return "\n".join(lines)

    def to_string(self, out):

        "Serialise the block and its child nodes using the serialiser 'out'."

        final = self.final
        out.start_block(final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(final)
paul@0	171
class ListItem(Container):

    "A list item, holding the nodes found after the list item marker."

    def __repr__(self):
        return "ListItem(%r)" % self.nodes

    def prettyprint(self, indent=""):

        """
        Return a multi-line description of the item and its child nodes, each
        line starting with 'indent' and children indented one space more.
        """

        child_indent = indent + " "
        lines = ["%sListItem:" % indent]
        lines.extend([node.prettyprint(child_indent) for node in self.nodes])
        return "\n".join(lines)

    def to_string(self, out):

        "Serialise the item and its child nodes using the serialiser 'out'."

        out.start_listitem()
        for child in self.nodes:
            child.to_string(out)
        out.end_listitem()
paul@9	190
paul@9	191
class Text:

    "A text node wrapping the string 's'."

    def __init__(self, s):
        self.s = s

    def empty(self):
        "Return whether the node holds no text."
        return not self.s

    def merge(self, text):
        "Extend this node with the content of the 'text' node."
        self.s = self.s + text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def prettyprint(self, indent=""):
        "Return a one-line description of the node, preceded by 'indent'."
        description = (indent, self.s)
        return "%sText: %r" % description

    def to_string(self, out):
        "Pass the text to the serialiser 'out'."
        out.text(self.s)
paul@0	213
paul@0	214
paul@0	215
paul@0	216	# Serialisation.
paul@0	217
class Serialiser:

    """
    General serialisation support. The 'out' callable supplied on creation is
    kept for subclasses to call with each piece of output they produce.
    """

    def __init__(self, out):
        self.out = out
paul@0	224
class MoinSerialiser(Serialiser):

    "Serialisation of the page as Moin wiki markup."

    def start_region(self, level, indent, type):

        """
        Write the opening marker of a region: 'indent' spaces, 'level' opening
        braces and, where a 'type' is given, a header line naming it. Regions
        without a level produce no output.
        """

        if not level:
            return
        self.out(" " * indent + "{" * level)
        if type:
            self.out("#!%s\n" % type)

    def end_region(self, level, indent, type):

        "Write 'level' closing braces. Regions without a level produce nothing."

        if not level:
            return
        self.out("}" * level)

    def start_block(self, final):
        "Write nothing: blocks have no opening markup."
        pass

    def end_block(self, final):
        "Write a newline after any block that is not 'final'."
        if final:
            return
        self.out("\n")

    def start_listitem(self):
        "Write a list item marker."
        self.out(" *")

    def end_listitem(self):
        "Write nothing: list items have no closing markup."
        pass

    def text(self, s):
        "Write the text 's' unchanged."
        self.out(s)
paul@0	256
class HTMLSerialiser(Serialiser):

    "Serialisation of the page as HTML."

    def start_region(self, level, indent, type):

        """
        Open a span element for a region, describing any non-zero 'level' and
        'indent' and any 'type' using class names.
        """

        classes = []

        if level:
            classes.append("level-%d" % level)

        if indent:
            classes.append("indent-%d" % indent)

        # NOTE: Encode type details for CSS.

        if type:

            # The attribute below is delimited by single quotes, which the
            # escape function is not guaranteed to encode, so they are encoded
            # explicitly to keep the type inside the attribute value. This is
            # done after escaping so that the ampersand is not encoded again.

            encoded = escape(type, True).replace("'", "&#x27;")
            classes.append("type-%s" % encoded)

        self.out("<span class='%s'>" % " ".join(classes))

    def end_region(self, level, indent, type):
        "Close the span element opened for a region."
        self.out("</span>")

    def start_block(self, final):
        "Open a paragraph for a block."
        self.out("<p>")

    def end_block(self, final):
        "Close the paragraph opened for a block."
        self.out("</p>")

    def start_listitem(self):
        "Open a list item element."
        self.out("<li>")

    def end_listitem(self):
        "Close a list item element."
        self.out("</li>")

    def text(self, s):

        """
        Write the text 's' with markup characters encoded. Quotes need no
        encoding in element content, and this is requested explicitly so that
        the output does not depend on the escape function's default.
        """

        self.out(escape(s, False))
paul@0	294
paul@0	295
paul@0	296
paul@2	297	# Tokenising functions.
paul@2	298
class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):
        self.s = s              # the complete text being tokenised
        self.pos = 0            # offset from which the next search starts
        self.match = None       # match object from the most recent search
        self.matching = None    # name of the pattern that produced 'match'

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        Where several patterns match at the same position, the one appearing
        earliest in 'pattern_names' is chosen. The position in the stream is
        not changed: 'read_match' consumes the match found here.
        """

        first = None
        self.matching = None

        # Forget any match from a previous search: otherwise, a search finding
        # nothing would leave 'read_match' to return that earlier match and to
        # move the position back to where it ended.

        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is not None:
            return self.s[self.pos:first]
        if remaining:
            return self.s[self.pos:]
        return None

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group. Where the
        most recent search found no match, the position is moved to the end of
        the text and None is returned.
        """

        if self.match is None:
            self.pos = len(self.s)
            return None

        # Continue after the match, even if the requested group is absent.

        self.pos = self.match.end()
        try:
            return self.match.group(group)
        except IndexError:
            return ""
paul@2	357
paul@2	358
paul@2	359
paul@0	360	# Parser functions.
paul@0	361
def parse_page(s):

    """
    Parse page text 's', returning the region representing the whole page.
    Pages consist of regions delimited by markers.
    """

    stream = TokenStream(s)
    return parse_region(stream)
paul@1	369
def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent', returning the region.
    """

    region = Region([], level, indent)

    # Any header is read first since it sets the region type, which helps to
    # decide how the body is parsed.

    parse_region_header(items, region)

    # Transparent regions hold wiki content; others hold opaque content.

    parse_body = region.is_transparent() and parse_region_wiki or parse_region_opaque
    parse_body(items, region)

    return region
paul@6	391
def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    The region is left unchanged unless a header starts at the current position.
    """

    preceding = items.read_until(["header"], False)

    # None means no header; text means that a header only occurs later on.

    if preceding != "":
        return

    region.type = items.read_match()
paul@6	400
def parse_region_wiki(items, region):

    """
    Parse the data provided by 'items' to populate a wiki 'region', starting
    a block to receive any initial text.
    """

    wiki_patterns = ["break", "listitem", "regionstart", "regionend"]
    new_block(region)
    parse_region_details(items, region, wiki_patterns)
paul@0	407
def parse_region_opaque(items, region):

    """
    Parse the data provided by 'items' to populate an opaque 'region', looking
    only for markers that might end the region.
    """

    opaque_patterns = ["regionend"]
    parse_region_details(items, region, opaque_patterns)
paul@1	413
def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'. Parsing
    stops at the end of the input or when a handler raises StopIteration,
    after which adjacent text nodes in the region are combined.
    """

    try:
        while True:

            # Keep any text found before the next marker or the end of input.

            text = items.read_until(pattern_names)
            if text:
                region.append_text(Text(text))

            # Nothing matched: the input is exhausted.

            if not items.matching:
                break

            # Consume the feature and find out how to deal with it.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Features without a handler are kept as text.

            if not handler:
                region.append_text(Text(feature))
                continue

            handler(items, region)

    except StopIteration:
        pass

    region.normalise()
paul@0	448
def end_region(items, region):

    """
    End the parsing of 'region' by raising StopIteration. Neither argument is
    examined.
    """

    raise StopIteration()
paul@7	454
def parse_break(items, region):

    """
    Handle a paragraph break within 'region': the last node is marked as not
    being the final one in a sequence and a new block is started.
    """

    region.nodes[-1].final = False
    new_block(region)
paul@2	464
def parse_listitem_end(items, region):

    """
    Handle the end of a list item by raising StopIteration. Neither argument
    is examined.
    """

    raise StopIteration()
paul@9	470
def parse_listitem(items, region):

    """
    Handle a list item marker within 'region': the content up to the list item
    end is collected in a new item, which is added to the region and followed
    by a new block.
    """

    list_item = ListItem([])
    parse_region_details(items, list_item, ["listitemend"])

    region.append(list_item)
    new_block(region)
paul@9	479
def parse_section(items, region):

    """
    Handle the start of a new section within 'region', taking the indent and
    level from the lengths of groups 2 and 3 of the start marker match.
    """

    indent_text = items.read_match(2)
    marker_text = items.read_match(3)

    # Parse the section and start a new block after the section.

    section = parse_region(items, len(marker_text), len(indent_text))
    region.append(section)
    new_block(region)
paul@2	490
def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region'. StopIteration is raised
    if the end marker closes the region; otherwise, the marker is added to the
    region as text.
    """

    marker = items.read_match()

    if not region.have_end(marker):
        region.append_text(Text(marker))
        return

    raise StopIteration
paul@2	500
paul@8	501	# Pattern handlers.
paul@2	502
# Each handler is called with the token stream and the region being populated,
# and may raise StopIteration to end the parsing of that region.
# NOTE(review): parse_region_details leaves its loop before looking up a
# handler when nothing matched, so the None entry appears not to be used by it.

handlers = {
    None : end_region,                      # no pattern matched
    "break" : parse_break,                  # blank line
    "listitemend" : parse_listitem_end,     # start of the line after an item
    "listitem" : parse_listitem,            # list item marker
    "regionstart" : parse_section,          # {{{...
    "regionend" : parse_section_end,        # }}}...
    }
paul@2	511
def new_block(region):

    "Start a new block in 'region' by appending an empty block to it."

    region.append(Block([]))
paul@0	518
paul@1	519
paul@1	520
paul@1	521	# Top-level functions.
paul@0	522
# Make the page parser available under the shorter name 'parse'.
parse = parse_page
paul@0	524
def serialise(doc, serialiser=MoinSerialiser):

    """
    Return the string produced by serialising 'doc' with an instance of the
    given 'serialiser' class, which is created with a callable collecting each
    piece of output.
    """

    fragments = []
    writer = serialiser(fragments.append)
    doc.to_string(writer)
    return "".join(fragments)
paul@0	529
paul@0	530	# vim: tabstop=4 expandtab shiftwidth=4