# HG changeset patch
# User Paul Boddie
# Date 1493930340 -7200
# Node ID 3993165616f88b901f0897fe41fd0d7115d1454f
# Parent  4ebe552530c805b23503b0919fb70b4acbb60e84
Moved common parsing functionality into a separate module.
Eliminated "transparent" region decisions in the Region class, deciding region
transparency in parser classes instead.

Review amendments: Parser.__init__ now merges caller-supplied formats into the
default mapping instead of discarding them, and ParserBase.parse_region_type
treats regions as opaque when no formats have been configured.

diff -r 4ebe552530c8 -r 3993165616f8 moinformat/__init__.py
--- a/moinformat/__init__.py	Thu May 04 21:41:13 2017 +0200
+++ b/moinformat/__init__.py	Thu May 04 22:39:00 2017 +0200
@@ -19,11 +19,13 @@
 this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 
+from moinformat.parsing import ParserBase, TokenStream, new_block
 from moinformat.serialisers import serialise
-from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \
+from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
                             Larger, ListItem, Monospace, Region, Rule, Smaller, \
                             Subscript, Superscript, TableAttr, TableAttrs, \
                             TableCell, TableRow, Text, Underline
+
 import re
 
 # Regular expressions.
@@ -120,141 +122,60 @@
 
 
 
-# Tokenising functions.
-
-class TokenStream:
-
-    "A stream of tokens taken from a string."
+class Parser(ParserBase):
 
-    def __init__(self, s):
-        self.s = s
-        self.pos = 0
-        self.match = None
-        self.matching = None
+    "A wiki region parser."
 
-    def rewind(self, length):
-
-        "Rewind in the string by 'length'."
-
-        self.pos -= min(length, self.pos)
-
-    def read_until(self, pattern_names, remaining=True):
+    def __init__(self, formats=None):
 
         """
-        Find the first match for the given 'pattern_names'. Return the text
-        preceding any match, the remaining text if no match was found, or None
-        if no match was found and 'remaining' is given as a false value.
+        Initialise the parser with any given 'formats' mapping from region type
+        names to parser objects, overriding the default "wiki" entry.
         """
 
-        first = None
-        self.matching = None
-
-        # Find the first matching pattern.
-
-        for pattern_name in pattern_names:
-            match = patterns[pattern_name].search(self.s, self.pos)
-            if match:
-                start, end = match.span()
-                if self.matching is None or start < first:
-                    first = start
-                    self.matching = pattern_name
-                    self.match = match
+        default_formats = {"wiki" : self}
+        if formats:
+            default_formats.update(formats)
 
-        if self.matching is None:
-            if remaining:
-                return self.s[self.pos:]
-            else:
-                return None
-        else:
-            return self.s[self.pos:first]
-
-    def read_match(self, group=1):
-
-        """
-        Return the matched text, updating the position in the stream. If 'group'
-        is specified, the indicated group in a match will be returned.
-        Typically, group 1 should contain all pertinent data, but groups defined
-        within group 1 can provide sections of the data.
-        """
+        ParserBase.__init__(self, default_formats)
 
-        if self.match:
-            _start, self.pos = self.match.span()
-            try:
-                return self.match.group(group)
-            except IndexError:
-                return ""
-        else:
-            self.pos = len(self.s)
-            return None
-
-
-
-# Utility functions.
-
-def new_block(region):
+    def get_items(self, s):
 
-    "Start a new block in 'region'."
-
-    block = Block([])
-    region.add(block)
-
-
+        "Return a sequence of token items for 's'."
 
-# Parser abstraction.
-
-class Parser:
-
-    "An extensible parser."
-
-    def __init__(self, formats=None):
-        self.formats = formats
+        return TokenStream(s, patterns)
 
     # Principal parser methods.
 
-    def parse_page(self, s):
+    def parse(self, s):
 
         """
         Parse page text 's'. Pages consist of regions delimited by markers.
         """
 
-        return self.parse_region(TokenStream(s))
-
-    def parse_region(self, items, level=0, indent=0):
+        items = self.get_items(s)
+        region = Region([])
 
-        """
-        Parse the data provided by 'items' to populate a region with the given
-        'level' at the given 'indent'.
-        """
-
-        region = Region([], level, indent)
-
-        # Parse section headers.
+        # Parse page header.
 
         self.parse_region_header(items, region)
 
-        # Parse section body.
+        # Handle pages directly with this parser.
+        # Otherwise, test the type and find an appropriate parser.
 
-        if region.is_transparent():
-            self.parse_region_wiki(items, region)
+        if not region.type:
+            self.parse_region_content(items, region)
         else:
-            self.parse_region_opaque(items, region)
+            self.parse_region_type(items, region)
 
         return region
 
-    def parse_region_header(self, items, region):
-
-        """
-        Parse the region header from the 'items', setting it for the given 'region'.
-        """
-
-        if items.read_until(["header"], False) == "": # None means no header
-            region.type = items.read_match()
-
-    def parse_region_wiki(self, items, region):
+    def parse_region_content(self, items, region):
 
         "Parse the data provided by 'items' to populate a wiki 'region'."
 
         new_block(region)
+
         self.parse_region_details(items, region, inline_pattern_names + [
             "break", "heading",
             "defterm", "defterm_empty",
@@ -265,12 +186,6 @@
             "tablerow",
             ])
 
-    def parse_region_opaque(self, items, region):
-
-        "Parse the data provided by 'items' to populate an opaque 'region'."
-
-        self.parse_region_details(items, region, ["regionend"])
-
     # Parser methods supporting different page features.
 
     def parse_attrname(self, items, attrs):
@@ -575,52 +490,9 @@
 
 
 
-    # Parsing utilities.
-
-    def parse_region_details(self, items, region, pattern_names):
-
-        "Parse 'items' within 'region' searching using 'pattern_names'."
-
-        try:
-            while True:
-
-                # Obtain text before any marker or the end of the input.
-
-                preceding = items.read_until(pattern_names)
-                if preceding:
-                    region.append_inline(Text(preceding))
-
-                # End of input.
-
-                if not items.matching:
-                    break
-
-                # Obtain any feature.
+    # Pattern handlers.
 
-                feature = items.read_match()
-                handler = self.handlers.get(items.matching)
-
-                # Handle each feature or add text to the region.
-
-                if handler:
-                    handler(self, items, region)
-                else:
-                    region.append_inline(Text(feature))
-
-        except StopIteration:
-            pass
-
-        region.normalise()
-
-    def end_region(self, items, region):
-
-        "End the parsing of 'region', breaking out of the parsing loop."
-
-        raise StopIteration
-
-
-
-    # Pattern handlers.
+    end_region = ParserBase.end_region
 
     handlers = {
         None : end_region,
@@ -672,6 +544,6 @@
 # Top-level functions.
 
 def parse(s, formats=None):
-    return Parser(formats).parse_page(s)
+    return Parser(formats).parse(s)
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 4ebe552530c8 -r 3993165616f8 moinformat/parsing.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/moinformat/parsing.py	Thu May 04 22:39:00 2017 +0200
@@ -0,0 +1,231 @@
+#!/usr/bin/env python
+
+"""
+Moin wiki parsing functionality.
+
+Copyright (C) 2017 Paul Boddie
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from moinformat.tree import Block, Region, Text
+
+# Tokenising functions.
+
+class TokenStream:
+
+    "A stream of tokens taken from a string."
+
+    def __init__(self, s, patterns):
+        self.s = s
+        self.patterns = patterns
+        self.pos = 0
+        self.match = None
+        self.matching = None
+
+    def rewind(self, length):
+
+        "Rewind in the string by 'length'."
+
+        self.pos -= min(length, self.pos)
+
+    def read_until(self, pattern_names, remaining=True):
+
+        """
+        Find the first match for the given 'pattern_names'. Return the text
+        preceding any match, the remaining text if no match was found, or None
+        if no match was found and 'remaining' is given as a false value.
+        """
+
+        first = None
+        self.matching = None
+
+        # Find the first matching pattern.
+
+        for pattern_name in pattern_names:
+            match = self.patterns[pattern_name].search(self.s, self.pos)
+            if match:
+                start, end = match.span()
+                if self.matching is None or start < first:
+                    first = start
+                    self.matching = pattern_name
+                    self.match = match
+
+        if self.matching is None:
+            if remaining:
+                return self.s[self.pos:]
+            else:
+                return None
+        else:
+            return self.s[self.pos:first]
+
+    def read_match(self, group=1):
+
+        """
+        Return the matched text, updating the position in the stream. If 'group'
+        is specified, the indicated group in a match will be returned.
+        Typically, group 1 should contain all pertinent data, but groups defined
+        within group 1 can provide sections of the data.
+        """
+
+        if self.match:
+            _start, self.pos = self.match.span()
+            try:
+                return self.match.group(group)
+            except IndexError:
+                return ""
+        else:
+            self.pos = len(self.s)
+            return None
+
+
+
+# Utility functions.
+
+def new_block(region):
+
+    "Start a new block in 'region'."
+
+    region.add(Block([]))
+
+
+
+# Parser abstractions.
+
+class ParserBase:
+
+    "Common parsing methods."
+
+    def __init__(self, formats=None):
+
+        """
+        Initialise the parser with any given 'formats' mapping from region type
+        names to parser objects.
+        """
+
+        self.formats = formats
+
+    def get_items(self, s):
+
+        "Return a sequence of token items for 's'."
+
+        raise NotImplementedError
+
+    def parse(self, s):
+
+        """
+        Parse page text 's'. Pages consist of regions delimited by markers.
+        """
+
+        return self.parse_region(self.get_items(s))
+
+    def parse_region(self, items, level=0, indent=0):
+
+        """
+        Parse the data provided by 'items' to populate a region with the given
+        'level' at the given 'indent'.
+        """
+
+        region = Region([], level, indent)
+
+        # Parse section headers, then parse according to region type.
+
+        self.parse_region_header(items, region)
+        self.parse_region_type(items, region)
+
+        return region
+
+    def parse_region_type(self, items, region):
+
+        """
+        Given data provided by 'items', use configured parsers to parse the
+        'region' based on its type.
+        """
+
+        # Find an appropriate parser given the type.
+
+        if self.formats and region.type in self.formats:
+            self.formats[region.type].parse_region_content(items, region)
+
+        # Otherwise, treat the section as opaque.
+
+        else:
+            self.parse_region_opaque(items, region)
+
+    def parse_region_header(self, items, region):
+
+        """
+        Parse the region header from the 'items', setting it for the given 'region'.
+        """
+
+        if items.read_until(["header"], False) == "": # None means no header
+            region.type = items.read_match()
+
+    def parse_region_opaque(self, items, region):
+
+        "Parse the data provided by 'items' to populate an opaque 'region'."
+
+        region.transparent = False
+        self.parse_region_details(items, region, ["regionend"])
+
+    def parse_region_content(self, items, region):
+
+        "Parse the data provided by 'items' to populate the given 'region'."
+
+        pass
+
+    # Parsing utilities.
+
+    def parse_region_details(self, items, region, pattern_names):
+
+        "Parse 'items' within 'region' searching using 'pattern_names'."
+
+        try:
+            while True:
+
+                # Obtain text before any marker or the end of the input.
+
+                preceding = items.read_until(pattern_names)
+                if preceding:
+                    region.append_inline(Text(preceding))
+
+                # End of input.
+
+                if not items.matching:
+                    break
+
+                # Obtain any feature.
+
+                feature = items.read_match()
+                handler = self.handlers.get(items.matching)
+
+                # Handle each feature or add text to the region.
+
+                if handler:
+                    handler(self, items, region)
+                else:
+                    region.append_inline(Text(feature))
+
+        except StopIteration:
+            pass
+
+        region.normalise()
+
+    def end_region(self, items, region):
+
+        "End the parsing of 'region', breaking out of the parsing loop."
+
+        raise StopIteration
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r 4ebe552530c8 -r 3993165616f8 moinformat/tree.py
--- a/moinformat/tree.py	Thu May 04 21:41:13 2017 +0200
+++ b/moinformat/tree.py	Thu May 04 22:39:00 2017 +0200
@@ -97,13 +97,12 @@
 
     "A region of the page."
 
-    transparent_region_types = ["wiki"]
-
-    def __init__(self, nodes, level=0, indent=0, type=None):
+    def __init__(self, nodes, level=0, indent=0, type=None, transparent=True):
         Container.__init__(self, nodes)
         self.level = level
         self.indent = indent
         self.type = type
+        self.transparent = transparent
 
     def add(self, node):
         last = self.node(-1)
@@ -113,7 +112,7 @@
             self.append(node)
 
     def append_inline(self, node):
-        if self.is_transparent():
+        if self.transparent:
             self.nodes[-1].append(node)
         else:
             self.append(node)
@@ -121,9 +120,6 @@
     def have_end(self, s):
         return self.level and s.startswith("}") and self.level == len(s)
 
-    def is_transparent(self):
-        return not self.level or self.type in self.transparent_region_types
-
     def __repr__(self):
         return "Region(%r, %r, %r, %r)" % (self.nodes, self.level, self.indent, self.type)
 