MoinLight (annotate moinformat/parsers/common.py in 6299a99d64bc)

MoinLight

Annotated moinformat/parsers/common.py

45:6299a99d64bc

2018-07-13

Paul Boddie

Fixed list structuring by employing queued match details from the token stream.

paul@32	1	#!/usr/bin/env python
paul@32	2
paul@32	3	"""
paul@32	4	Moin wiki parsing functionality.
paul@32	5
paul@45	6	Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>
paul@32	7
paul@32	8	This program is free software; you can redistribute it and/or modify it under
paul@32	9	the terms of the GNU General Public License as published by the Free Software
paul@32	10	Foundation; either version 3 of the License, or (at your option) any later
paul@32	11	version.
paul@32	12
paul@32	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@32	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@32	15	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
paul@32	16	details.
paul@32	17
paul@32	18	You should have received a copy of the GNU General Public License along with
paul@32	19	this program. If not, see <http://www.gnu.org/licenses/>.
paul@32	20	"""
paul@32	21
paul@43	22	from collections import defaultdict
paul@32	23	from moinformat.tree import Block, Region, Text
paul@33	24	import re
paul@33	25
paul@33	26	# Pattern management.
paul@33	27
# Pattern matching any whitespace character other than a newline. It is
# substituted for the \N shorthand found in syntax definitions.

ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines.

    Return a new mapping from each name in 'syntax' to a regular expression
    object compiled with the UNICODE and MULTILINE flags.
    """

    # NOTE: The docstring above is a raw string because a bare \N in a normal
    # string literal is an invalid escape sequence in Python 3.

    patterns = {}
    for name, value in syntax.items():

        # Expand the \N shorthand before compiling the expression.

        value = value.replace(r"\N", ws_excl_nl)
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns
paul@33	43
def get_subset(d, keys):

    """
    Return a new mapping containing only the entries of 'd' selected by 'keys'.
    A key that is absent from 'd' causes a KeyError to be raised.
    """

    return {name: d[name] for name in keys}
paul@36	52
paul@36	53
paul@32	54
paul@32	55	# Tokenising functions.
paul@32	56
class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream maintains a position in the string. Each search performed by
    read_until records the name of the winning pattern ('matching'), the match
    object itself ('match') and the position at which that match starts
    ('match_start').
    """

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's', starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.match_start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', never moving before the start of the
        string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until presents it
        again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', a mapping from pattern
        names to compiled regular expressions. Return the text preceding any
        match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.

        If a match has been queued using queue_match, that match is presented
        again without searching, and the pattern name and match start recorded
        by the previous search are retained.
        """

        if self.queued is not None:

            # Replay the queued match, leaving 'matching' and 'match_start' as
            # they were set by the search that produced it.

            self.match = self.queued
            self.queued = None
        else:

            # Discard all details of any earlier search so that a failed
            # search cannot leave a stale match behind for read_match.

            self.match = None
            self.match_start = None
            self.matching = None

            # Find the first matching pattern. Where several patterns match at
            # the same position, the one encountered first is retained.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match is None:
                    continue

                start = match.start()
                if self.matching is None or start < self.match_start:
                    self.match_start = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.match_start]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        If the match does not define the indicated group, return an empty
        string. If there is no current match, move to the end of the string
        and return None.
        """

        if self.match:

            # Continue reading immediately after the matched text.

            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None
paul@32	139
paul@32	140
paul@32	141
paul@32	142	# Parser abstractions.
paul@32	143
class ParserBase:

    """
    Common parsing methods.

    The methods here refer to two attributes that are not defined in this
    class and must be supplied elsewhere (presumably by subclasses): 'patterns',
    a mapping from pattern names to compiled regular expressions, and
    'handlers', a mapping from pattern names to callables invoked with the
    parser and the current region.
    """

    # Names of the patterns used to parse region content, or None if the region
    # content is not to be searched by parse_region_content.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        A parser is obtained by calling the entry found in the formats mapping
        with that same mapping.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Forward 'remaining' so that callers can distinguish the absence of a
        # match (None) from a match at the current position (empty string).

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def read_match(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.read_match(group)

    def read_matching(self):

        "Return the name of the matching pattern."

        return self.items.matching

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region produced for the text.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent'. Return the new region.
        """

        region = Region([], level, indent)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The region
        is left untouched if no header occurs at the current position.
        """

        # An empty result means that a header starts at the current position,
        # whereas None means that no header was found at all.

        if self.read_until(["header"], False) == "":
            region.type = self.read_match()

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until a handler ends
        the region by raising StopIteration (see end_region), or, when 'strict'
        is set, until text or an unhandled feature is encountered.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.read_matching():
                    break

                # Obtain any feature. Reading the match also advances the
                # stream beyond the matched text.

                feature = self.read_match()
                handler = self.handlers.get(self.read_matching())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region using this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop. This is
        done by raising StopIteration, which parse_region_details catches.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))
paul@43	368
paul@32	369	# vim: tabstop=4 expandtab shiftwidth=4