MoinLight (annotate moinformat/parsers/common.py in 69cb676460b1)

MoinLight

Annotated moinformat/parsers/common.py

165:69cb676460b1

2018-08-14

Paul Boddie

Introduced a metadata abstraction to hold details of documents and the conversion parameters, also providing the necessary objects employing such parameters. Added a root pagename option to the conversion script. Removed support for invoking serialise without a serialiser. Configured the re-serialisation of nodes in the Moin parser explicitly. Added a wiki parser class as a kind of alias for the Moin parser class. Introduced on-demand pagename lookups in various classes in order to permit the re-use of instances with different pagenames.

paul@32	1	#!/usr/bin/env python
paul@32	2
paul@32	3	"""
paul@32	4	Moin wiki parsing functionality.
paul@32	5
paul@45	6	Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>
paul@32	7
paul@32	8	This program is free software; you can redistribute it and/or modify it under
paul@32	9	the terms of the GNU General Public License as published by the Free Software
paul@32	10	Foundation; either version 3 of the License, or (at your option) any later
paul@32	11	version.
paul@32	12
paul@32	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@32	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@32	15	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
paul@32	16	details.
paul@32	17
paul@32	18	You should have received a copy of the GNU General Public License along with
paul@32	19	this program. If not, see <http://www.gnu.org/licenses/>.
paul@32	20	"""
paul@32	21
paul@43	22	from collections import defaultdict
paul@83	23	from moinformat.tree.moin import Block, Region, Text
paul@33	24	import re
paul@33	25
paul@33	26	# Pattern management.
paul@33	27
# Building blocks substituted into syntax patterns by get_patterns.

# Any whitespace character except newline (substituted for \N).
ws_excl_nl = r"[ \f\r\t\v]"

# Either a single or a double quote character (substituted for \Q).
quotes = "['" '"]' # ['"]

# Any single character including newline (substituted for \E). The alternation
# is needed because "." alone does not match a newline unless re.DOTALL is set.
dotall = r"(.|\n)"
paul@55	31
def choice(l):

    """
    Return a pattern matching a choice of patterns in 'l'.

    The patterns are combined using the regular expression alternation
    operator and wrapped in a single (capturing) group, so that the result
    matches any one of them.
    """

    # A plain "|" is required here: an escaped pipe would match a literal "|"
    # character instead of separating the alternatives.

    return "(%s)" % "|".join(l)
paul@121	37
def excl(s):

    """
    Return a negative lookahead pattern for 's': the result matches only at
    positions where 's' does not follow, without consuming any input.
    """

    negative_lookahead = "(?!%s)" % s
    return negative_lookahead
paul@55	43
def expect(s):

    """
    Return a positive lookahead pattern for 's': the result matches only at
    positions where 's' follows, without consuming any input.
    """

    lookahead = "(?=%s)" % s
    return lookahead
paul@55	49
def group(name, s):

    """
    Return the pattern string 's' wrapped in a named group called 'name'.
    """

    template = "(?P<%s>%s)"
    return template % (name, s)
paul@55	55
def optional(s):

    """
    Return a pattern making 's' optional, using a non-capturing group so that
    no additional group number is introduced.
    """

    wrapped = "(?:%s)?" % s
    return wrapped
paul@55	61
def recur(name):

    """
    Return a backreference pattern matching whatever text the group called
    'name' matched earlier in the same expression.
    """

    backreference = "(?P=%s)" % name
    return backreference
paul@55	67
def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits.

    A limit given as None is omitted from the repetition qualifier, leaving
    that bound open. A limit of zero is a genuine bound and is emitted as
    such: in particular, a 'max' of zero must not be confused with an absent
    maximum, since an empty upper bound means "unlimited".
    """

    # Test explicitly for None: zero is false but is still a valid limit.

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)
paul@36	74
def get_pattern(s):

    """
    Return a compiled regular expression for the given pattern 's'.

    The expression is compiled with the UNICODE and MULTILINE flags, the
    latter making "^" and "$" match at the start and end of each line.
    """

    # The flags are combined using bitwise OR.

    return re.compile(s, re.UNICODE | re.MULTILINE)
paul@99	80
def get_patterns(syntax):

    r"""
    Return a mapping from each name in the 'syntax' mapping to a compiled
    regular expression made from the corresponding expression string.

    Before compilation, the following placeholders are expanded in each
    expression string:

    \N  a pattern for whitespace excluding newlines (ws_excl_nl)
    \Q  a pattern for a quote character (quotes)
    \E  a pattern for any character including newline (dotall)
    """

    # Placeholders are expanded in this order for every expression.

    substitutions = (
        (r"\N", ws_excl_nl),
        (r"\Q", quotes),
        (r"\E", dotall),
        )

    compiled = {}

    for name in syntax:
        expression = syntax[name]
        for placeholder, replacement in substitutions:
            expression = expression.replace(placeholder, replacement)
        compiled[name] = get_pattern(expression)

    return compiled
paul@33	96
def get_subset(d, keys):

    """
    Return a new dictionary holding only the entries of 'd' selected by
    'keys'. Every key must be present in 'd': a missing key causes a KeyError
    to be raised by the lookup.
    """

    return {key: d[key] for key in keys}
paul@36	105
paul@36	106
paul@32	107
paul@32	108	# Tokenising functions.
paul@32	109
class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records a position in the string. Tokens are found by searching
    from that position using a collection of named patterns (read_until), and
    the position is only advanced past a token when details of the match are
    requested (match_group, match_groups) or update_pos is called.
    """

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at offset 'pos'."

        self.s = s
        self.pos = pos

        # Match details: the current match object, a match queued for
        # redelivery by the next read_until call, and the start offset of the
        # current match.

        self.match = None
        self.queued = None
        self.start = None

        # Retained for compatibility: this attribute is reset by read_until
        # but is not otherwise consulted within this class.

        self.match_start = None

        # Pattern name details: the name of the pattern producing the current
        # match, or None if the last search found nothing.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until delivers
        it again instead of performing a new search.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', these being a mapping
        from pattern names to compiled regular expressions. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The first match is the one starting earliest in the string at or after
        the current position. Where several patterns match at the same offset,
        the pattern encountered first when iterating over 'patterns' is
        chosen. The name of the chosen pattern is recorded in the 'matching'
        attribute, which is None if nothing matched.

        This method does not advance the position in the stream.
        """

        # Redeliver any queued match, keeping the recorded start offset and
        # pattern name from when it was originally found.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.match_start = None
            self.matching = None

            # NOTE(review): self.match is not cleared here, so after a search
            # finding nothing it still refers to any earlier match. Callers
            # should test 'matching' before requesting match details.

            # Find the first matching pattern.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start, end = match.span()

                    # Only a strictly earlier start displaces a candidate.

                    if self.matching is None or start < self.start:
                        self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)

            # The match object raises IndexError for unknown group numbers and
            # names alike.

            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream.

        If given, 'groups' is a sequence of group numbers or names, and a tuple
        of the text matched by each of them, in the order given, is returned.
        An empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()

            # Select each requested group individually: the argument accepted
            # by the match object's groups method is only a default value for
            # groups not participating in the match, not a selection.

            else:
                return tuple(self.match.group(group) for group in groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without any match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)
paul@54	215
paul@32	216
paul@32	217
paul@32	218	# Parser abstractions.
paul@32	219
class ParserBase:

    """
    Common parsing methods.

    This class does not itself define the 'patterns' and 'handlers' attributes
    used by its methods: they are expected to be provided elsewhere, presumably
    by subclasses. From their use here, 'patterns' maps pattern names to
    compiled regular expressions, and 'handlers' maps pattern names to
    functions called with the parser and a node as arguments.
    """

    # Names of the patterns used when parsing transparent regions. With no
    # names defined, no parsing of such region content occurs.

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.

        Any new parser shares this parser's metadata and parsers, and it is
        given this parser's root (or this parser, if it has no root) as its
        own root.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:
            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must be passed on: callers testing for a match
        # at the current position rely on None to mean that nothing matched.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number
        or name, advancing the position in the input beyond the match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern, or None if nothing matched."

        return self.items.matching

    def match_groups(self):

        """
        Return all the groups in the match, advancing the position in the input
        beyond the match.
        """

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region describing the page.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions, looking only for the region end.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline: an inline region whose
        # first node is multiline text is not regarded as inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers and directives, then parse according to region
        # type.

        self.parse_region_header(region)
        self.parse_region_directives(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type. Without one, the region
        # is made opaque and the "moin" parser is used to find the region end.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.

        A header is only recognised if it appears immediately at the current
        position in the input.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the end of the input, until a handler raises
        StopIteration (see end_region and parse_region_end) or, when 'strict'
        is set, until text or a feature without a handler is encountered.
        Finally, the region is normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature. Requesting the group also advances the
                # position beyond the match before any handler is invoked.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers raise StopIteration to end the parsing of the region.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop by
        raising StopIteration, this being caught by parse_region_details.
        """

        raise StopIteration

    def queue_match(self):

        """
        Queue the current match so that it is delivered again by the next read
        from the input.
        """

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'.

        If the matched end marker ends the current region, as determined by the
        region itself from the marker's level, StopIteration is raised to end
        the parsing of the region. Otherwise, the marker is added to 'node' as
        text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))
paul@98	496
paul@32	497	# vim: tabstop=4 expandtab shiftwidth=4