MoinLight (annotate moinformat/parsers/common.py in d517824d2df5)

MoinLight

Annotated moinformat/parsers/common.py

54:d517824d2df5

2018-07-15

Paul Boddie

Renamed read_matching to matching_pattern, read_match to match_group, and changed match_groups to update the stream position.

paul@32	1	#!/usr/bin/env python
paul@32	2
paul@32	3	"""
paul@32	4	Moin wiki parsing functionality.
paul@32	5
paul@45	6	Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>
paul@32	7
paul@32	8	This program is free software; you can redistribute it and/or modify it under
paul@32	9	the terms of the GNU General Public License as published by the Free Software
paul@32	10	Foundation; either version 3 of the License, or (at your option) any later
paul@32	11	version.
paul@32	12
paul@32	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@32	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@32	15	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
paul@32	16	details.
paul@32	17
paul@32	18	You should have received a copy of the GNU General Public License along with
paul@32	19	this program. If not, see <http://www.gnu.org/licenses/>.
paul@32	20	"""
paul@32	21
paul@43	22	from collections import defaultdict
paul@32	23	from moinformat.tree import Block, Region, Text
paul@33	24	import re
paul@33	25
paul@33	26	# Pattern management.
paul@33	27
# Character class matching any whitespace character except the newline. It is
# substituted for the \N shorthand used in syntax definitions.

ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines.

    Return a new mapping from the names in 'syntax' to compiled regular
    expression objects. Each expression is compiled with the UNICODE and
    MULTILINE flags, so that ^ and $ match at line boundaries.
    """

    # NOTE: the docstring is a raw string because a bare \N is a malformed
    # escape sequence in a normal Python 3 string literal.

    flags = re.UNICODE | re.MULTILINE

    return dict(
        (name, re.compile(value.replace(r"\N", ws_excl_nl), flags))
        for name, value in syntax.items()
        )
paul@33	43
def get_subset(d, keys):

    """
    Build a new dictionary containing only the entries of 'd' whose keys are
    listed in 'keys'. Every key must be present in 'd', otherwise KeyError is
    raised by the lookup.
    """

    return {key: d[key] for key in keys}
paul@36	52
paul@36	53
paul@32	54
paul@32	55	# Tokenising functions.
paul@32	56
class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        """
        Initialise the stream for the string 's', starting at position 'pos'.
        """

        self.s = s
        self.pos = pos

        # Match details: the current match object, any match queued for
        # presentation by the next read, and the start of the current match.
        # The start is held in both 'start' and 'match_start' since both names
        # have been used for it.

        self.match = None
        self.queued = None
        self.match_start = None
        self.start = None

        # Pattern name details: the name of the pattern producing the match.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until presents
        it again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns'. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.

        The 'patterns' are a mapping from pattern names to compiled regular
        expressions. The name of the matching pattern is recorded in the
        'matching' attribute, or None is recorded if nothing matched. The
        position in the stream is not changed by this method.
        """

        # Present any queued match again, retaining the recorded pattern name
        # and start position.

        if self.queued:
            self.match = self.queued
            self.queued = None

        else:
            # Discard details of any previous match so that a failed search
            # cannot be mistaken for a successful one.

            self.match = None
            self.match_start = None
            self.matching = None

            # Find the first matching pattern: the one starting earliest in
            # the string. With equal starts, the pattern tested first wins.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.start:
                        self.start = self.match_start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            # For a queued match that has already been consumed, the position
            # lies beyond the start and the slice is therefore empty.

            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        Return an empty string if the match has no such group, or None if there
        is no current match.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. The 'groups' are a sequence of group numbers or
        names, and the result is a tuple of the corresponding matched text. An
        empty list is returned if there is no current match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:
                # Select each requested group. The argument of the match
                # object's groups method is a default value for unmatched
                # groups, not a selection of groups.

                return tuple(self.match.group(group) for group in groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            self.pos = self.match.end()
        else:
            self.pos = len(self.s)
paul@54	162
paul@32	163
paul@32	164
paul@32	165	# Parser abstractions.
paul@32	166
class ParserBase:

    """
    Common parsing methods.

    The methods here refer to 'patterns' and 'handlers' attributes that are
    not defined in this class: presumably subclasses provide 'patterns' as a
    mapping from pattern names to compiled expressions and 'handlers' as a
    mapping from pattern names to callables accepting the parser and a region.
    """

    # Names of the patterns used to parse the content of a region handled by
    # this parser, or None if no details are to be parsed.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser is instantiated with the same formats as this parser.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must be passed on: otherwise, callers asking
        # for None in the absence of a match receive the remaining text.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the match 'groups' of the matching pattern, or all groups if
        unspecified.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Handle potentially inline regions.

        if region.type == "inline":
            self.parse_region_inline(region)
            return

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # An empty string means that a header occurs at the current position,
        # None means no header, and any other text precedes a later header.

        if self.read_until(["header"], False) == "":
            region.type = self.match_group()

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    def parse_region_inline(self, region):

        "Parse the data to populate an inline 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature. This also moves the stream position
                # past the match.

                feature = self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers end the region by raising StopIteration (see end_region).

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop by
        raising StopIteration, which is caught in parse_region_details.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))
paul@43	417
paul@32	418	# vim: tabstop=4 expandtab shiftwidth=4