MoinLight (annotate moinformat/parsers/common.py in 518c6bf3b8ca)

MoinLight

Annotated moinformat/parsers/common.py

43:518c6bf3b8ca

2018-06-01

Paul Boddie

Introduced initial support for list creation, also making new_block a method, and adding some control over whitespace accumulation.

paul@32	1	#!/usr/bin/env python
paul@32	2
paul@32	3	"""
paul@32	4	Moin wiki parsing functionality.
paul@32	5
paul@32	6	Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
paul@32	7
paul@32	8	This program is free software; you can redistribute it and/or modify it under
paul@32	9	the terms of the GNU General Public License as published by the Free Software
paul@32	10	Foundation; either version 3 of the License, or (at your option) any later
paul@32	11	version.
paul@32	12
paul@32	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@32	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@32	15	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
paul@32	16	details.
paul@32	17
paul@32	18	You should have received a copy of the GNU General Public License along with
paul@32	19	this program. If not, see <http://www.gnu.org/licenses/>.
paul@32	20	"""
paul@32	21
paul@43	22	from collections import defaultdict
paul@32	23	from moinformat.tree import Block, Region, Text
paul@33	24	import re
paul@33	25
paul@33	26	# Pattern management.
paul@33	27
# Character class matching whitespace other than newlines. It is substituted
# for the \N shorthand used in syntax definitions.

ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines.

    Return a new dictionary mapping each name in 'syntax' to a regular
    expression object compiled with the UNICODE and MULTILINE flags.
    """

    # This docstring is a raw string because a bare \N is a malformed escape
    # in Python 3 string literals.

    flags = re.UNICODE | re.MULTILINE

    return {
        name: re.compile(value.replace(r"\N", ws_excl_nl), flags)
        for name, value in syntax.items()
    }
paul@33	43
def get_subset(d, keys):

    """
    Build and return a new dictionary holding only the entries of 'd' named by
    'keys'. A key absent from 'd' causes a KeyError.
    """

    return {key: d[key] for key in keys}
paul@36	52
paul@36	53
paul@32	54
paul@32	55	# Tokenising functions.
paul@32	56
class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's', starting at position 'pos'."

        self.s = s
        self.pos = pos

        # The most recent match object and the name of the pattern producing
        # it. Both are None when the last search found nothing.

        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', a mapping from pattern
        names to compiled regular expressions. Return the text preceding any
        match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.

        The stream position is not advanced by this method: the match details
        are recorded so that read_match can consume the matched text.
        """

        first = None

        # Discard the details of any earlier search so that a failed search
        # cannot leave a stale match behind for read_match to consume.

        self.matching = None
        self.match = None

        # Find the first matching pattern: the one starting earliest.

        for pattern_name, pattern in patterns.items():
            match = pattern.search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        If the pattern does not define 'group', an empty string is returned. If
        there is no current match, the position is moved to the end of the
        string and None is returned.
        """

        if self.match:

            # Move past the whole match regardless of the group requested.

            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None
paul@32	121
paul@32	122
paul@32	123
paul@32	124	# Parser abstractions.
paul@32	125
class ParserBase:

    """
    Common parsing methods.

    The methods here read 'self.patterns' (used by get_patterns) and
    'self.handlers' (used by parse_region_details), neither of which is defined
    in this class; presumably subclasses provide them as mappings keyed by
    pattern name.
    """

    # Names of the patterns used to parse region content. When left as None (or
    # empty), parse_region_content only starts a block and parses no details.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

        # Regions waiting to be added, keyed by the node they must follow.

        self.queued = defaultdict(list)

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        The parser is created by calling the entry in the formats mapping with
        that same mapping.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # 'remaining' must be forwarded: callers such as parse_region_header
        # rely on None being returned to tell "no match" from "match here".

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def read_match(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.read_match(group)

    def read_matching(self):

        "Return the name of the matching pattern."

        return self.items.matching

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region produced for the whole text.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent'. Return the new region.
        """

        region = Region([], level, indent)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # An empty string means a header starts at the current position. None
        # means no header was found, and other text means one occurs later.

        if self.read_until(["header"], False) == "":
            region.type = self.read_match()

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Text found before a match, and matched features having no handler, are
        appended to 'region' as text unless 'strict' is set, in which case
        either of them ends the parsing. Handlers are called with this parser
        and 'region', and may end the parsing by calling end_region. The region
        is normalised before returning.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.read_matching():
                    break

                # Obtain any feature. This also advances the input position
                # beyond the matched text.

                feature = self.read_match()
                handler = self.handlers.get(self.read_matching())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Raised by end_region to leave the parsing loop.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node', then any regions queued to follow it."

        region.add(node)
        self.unqueue_region(region, node)

    def append_node(self, region, node):

        """
        Append to 'region' the given 'node', then add any regions queued to
        follow it.
        """

        region.append(node)
        self.unqueue_region(region, node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_region(self, region, current):

        "Queue 'region' for appending after the 'current' region is ended."

        self.queued[current].append(region)

    def unqueue_region(self, region, ended):

        "Unqueue any queued region, adding it to 'region' after 'ended'."

        # Use get to avoid creating an empty entry in the defaultdict.

        nodes = self.queued.get(ended)

        # The most recently queued node is added first, each followed at once
        # by anything queued to follow it.

        while nodes:
            node = nodes.pop()
            region.add(node)
            self.unqueue_region(region, node)

        # Membership testing replaces dict.has_key, which Python 3 lacks.

        if ended in self.queued:
            del self.queued[ended]

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))
paul@43	367
paul@32	368	# vim: tabstop=4 expandtab shiftwidth=4