# HG changeset patch # User Paul Boddie # Date 1513122609 -3600 # Node ID 083967e3240691a39d5360b1edc5f041bd20960b # Parent f753f631d055e00dc047f55821612231ca9c97b1 Moved the Moin wiki parser into the parsers subpackage. Made the parsers and serialisers plus general functions available via the package root. diff -r f753f631d055 -r 083967e32406 moinformat/__init__.py --- a/moinformat/__init__.py Tue Dec 12 22:53:20 2017 +0100 +++ b/moinformat/__init__.py Wed Dec 13 00:50:09 2017 +0100 @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Moin wiki format parser. +Moin wiki format tools. Copyright (C) 2017 Paul Boddie @@ -19,543 +19,7 @@ this program. If not, see . """ -from moinformat.parsing import ParserBase, get_patterns, get_subset, new_block -from moinformat.serialisers import serialise -from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \ - Larger, ListItem, Monospace, Region, Rule, Smaller, \ - Subscript, Superscript, Table, TableAttr, \ - TableAttrs, TableCell, TableRow, Text, Underline - -class Parser(ParserBase): - - "A wiki region parser." - - def __init__(self, formats=None): - - """ - Initialise the parser with any given 'formats' mapping from region type - names to parser objects. - """ - - # Introduce this class as the default parser for the wiki format. - - default_formats = {"wiki" : Parser} - if formats: - default_formats.update(formats) - - ParserBase.__init__(self, default_formats) - - # Principal parser methods. - - def parse(self, s): - - """ - Parse page text 's'. Pages consist of regions delimited by markers. - """ - - self.items = self.get_items(s) - self.region = Region([]) - - # Parse page header. - - self.parse_region_header(self.region) - - # Handle pages directly with this parser. Pages do not need to use an - # explicit format indicator. - - if not self.region.type: - self.parse_region_content(self.items, self.region) - - # Otherwise, test the type and find an appropriate parser. - - else: - self.parse_region_type(self.region) - - return self.region - - - - # Parser methods supporting different page features. - - def parse_attrname(self, attrs): - - "Handle an attribute name within 'attrs'." - - name = self.read_match() - attr = TableAttr(name) - - preceding = self.read_until(["attrvalue"], False) - if preceding == "": - attr.quote = self.read_match(1) - attr.value = self.read_match(2) - - attrs.append(attr) - - def parse_break(self, region): - - "Handle a paragraph break within 'region'." - - region.add(Break()) - new_block(region) - - def parse_defitem(self, region, extra=""): - - "Handle a definition item within 'region'." - - pad = self.read_match(1) - item = DefItem([], pad, extra) - self.parse_region_details(item, ["listitemend"]) - region.add(item) - new_block(region) - - def parse_defterm(self, region): - - "Handle a definition term within 'region'." - - pad = self.read_match(1) - term = DefTerm([], pad) - self.parse_region_details(term, ["deftermend", "deftermsep"]) - region.add(term) - if self.read_matching() == "deftermsep": - self.parse_defitem(region) - - def parse_defterm_empty(self, region): - - "Handle an empty definition term within 'region'." - - extra = self.read_match(1) - self.parse_region_details(region, ["deftermsep"]) - self.parse_defitem(region, extra) - - def parse_fontstyle(self, region): - - "Handle emphasis and strong styles." - - n = len(self.read_match(1)) - - # Handle endings. - - if isinstance(region, FontStyle): - emphasis = n in (2, 4, 5) - strong = n in (3, 5, 6) - active = True - - if region.emphasis and emphasis: - active = region.close_emphasis() - n -= 2 - if region.strong and strong: - active = region.close_strong() - n -= 3 - - if not active: - if n: - self.items.rewind(n) - raise StopIteration - - elif not n: - return - - # Handle new styles. - - emphasis = n in (2, 4, 5) - strong = n in (3, 5, 6) - double = n in (4, 6) - - span = FontStyle([], emphasis, strong) - if not double: - self.parse_region_details(span, self.inline_pattern_names) - region.append_inline(span) - - def parse_halign(self, attrs): - - "Handle horizontal alignment within 'attrs'." - - value = self.read_match() - attr = TableAttr("halign", value == "(" and "left" or value == ")" and "right" or "center", True) - attrs.append(attr) - - def parse_heading(self, region): - - "Handle a heading." - - start_extra = self.read_match(1) - level = len(self.read_match(2)) - start_pad = self.read_match(3) - heading = Heading([], level, start_extra, start_pad) - self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names) - region.add(heading) - new_block(region) - - def parse_heading_end(self, heading): - - "Handle the end of a heading." - - level = len(self.read_match(2)) - if heading.level == level: - heading.end_pad = self.read_match(1) - heading.end_extra = self.read_match(3) - raise StopIteration - - def parse_listitem(self, region): - - "Handle a list item marker within 'region'." - - indent = len(self.read_match(1)) - marker = self.read_match(2) - space = self.read_match(3) - item = ListItem([], indent, marker, space) - self.parse_region_details(item, self.listitem_pattern_names) - region.add(item) - new_block(region) - - def parse_rule(self, region): - - "Handle a horizontal rule within 'region'." - - length = len(self.read_match(1)) - rule = Rule(length) - region.add(rule) - new_block(region) - - def parse_section(self, region): - - "Handle the start of a new section within 'region'." - - # Parse the section and start a new block after the section. - - indent = len(self.read_match(2)) - level = len(self.read_match(3)) - region.add(self.parse_region(level, indent)) - new_block(region) - - def parse_section_end(self, region): - - "Handle the end of a new section within 'region'." - - feature = self.read_match() - if region.have_end(feature): - raise StopIteration - else: - region.append_inline(Text(feature)) - - def parse_table_attrs(self, cell): - - "Handle the start of table attributes within 'cell'." - - attrs = TableAttrs([]) - self.parse_region_details(attrs, self.table_pattern_names) - - # Test the validity of the attributes. - - last = None - - for node in attrs.nodes: - - # Text separator nodes must be whitespace. - - if isinstance(node, Text): - if node.s.strip(): - break - - # Named attributes must be preceded by space if not the first. - - elif last and not node.concise and not isinstance(last, Text): - break - - last = node - - # All nodes were valid: preserve the collection. - - else: - cell.attrs = attrs - return - - # Invalid nodes were found: serialise the attributes as text. - - cell.append_inline(Text(serialise(attrs))) - - def parse_table_row(self, region): - - "Handle the start of a table row within 'region'." - - # Identify any active table. - - table = region.node(-2) - block = region.node(-1) - - if not (isinstance(table, Table) and block.empty()): - new_table = table = Table([]) - else: - new_table = None - - row = TableRow([]) - - while True: - cell = TableCell([]) - self.parse_region_details(cell, self.table_region_pattern_names) - - # Handle the end of the row. - - if self.read_matching() == "tableend": - trailing = self.read_match() - - # If the cell was started but not finished, convert the row into text. - - if not row.nodes or not cell.empty(): - for node in row.nodes: - region.append_inline(Text(serialise(node))) - region.append_inline(Text(serialise(cell))) - region.append_inline(Text(trailing)) - - new_block(region) - return - - # Append the final cell, if not empty. - - else: - row.trailing = trailing - - if not cell.empty(): - row.append(cell) - break - - # A cell separator has been found. - - row.append(cell) - - # Add the row to the table and any new table to the region. - - table.add(row) - if new_table: - region.add(new_table) - - new_block(region) - - def parse_valign(self, attrs): - - "Handle vertical alignment within 'attrs'." - - value = self.read_match() - attr = TableAttr("valign", value == "^" and "top" or "bottom", True) - attrs.append(attr) - - - - # Inline formatting handlers. - - def parse_inline(self, region, cls, pattern_name): - - "Handle an inline region." - - span = cls([]) - self.parse_region_details(span, self.inline_patterns_for(pattern_name)) - region.append_inline(span) - - def parse_larger(self, region): - self.parse_inline(region, Larger, "larger") - - def parse_monospace(self, region): - self.parse_inline(region, Monospace, "monospace") - - def parse_smaller(self, region): - self.parse_inline(region, Smaller, "smaller") - - def parse_sub(self, region): - self.parse_inline(region, Subscript, "sub") - - def parse_super(self, region): - self.parse_inline(region, Superscript, "super") - - def parse_underline(self, region): - self.parse_inline(region, Underline, "underline") - - - - # Table attribute handlers. - - def parse_table_attr(self, attrs, pattern_name): - - "Handle a table attribute." - - attrs.append(TableAttr(pattern_name, self.read_match(), True)) - - def parse_colour(self, cell): - self.parse_table_attr(cell, "colour") - - def parse_colspan(self, cell): - self.parse_table_attr(cell, "colspan") - - def parse_rowspan(self, cell): - self.parse_table_attr(cell, "rowspan") - - def parse_width(self, cell): - self.parse_table_attr(cell, "width") - - - - # Regular expressions. - - syntax = { - # Page regions: - "regionstart" : r"((^\N*)([{]{3,}))", # {{{... - "regionend" : r"^\N*([}]{3,})", # }}}... - "header" : r"#!(.*?)\n", # #! char-excl-nl - - # Region contents: - # Line-oriented patterns: - # blank line - "break" : r"^(\s*?)\n", - # ws... expecting text :: - "defterm" : r"^(\N+)(?=.+?::)", - # ws... expecting :: ws... - "defterm_empty" : r"^(\N+)(?=::\s+)", - # [ws...] =... ws... expecting headingend - "heading" : r"^(\N*)(?P=+)(\s+)(?=.*?\N+(?P=x)\N*$)", - # ws... list-item [ws...] - "listitem" : r"^(\N+)(\*)(\s*)", - # ws... number-item ws... - "listitem_num" : r"^(\N+)(\d+\.)(\s+)", - # ws... alpha-item ws... - "listitem_alpha": r"^(\N+)([aA]\.)(\s+)", - # ws... roman-item ws... - "listitem_roman": r"^(\N+)([iI]\.)(\s+)", - # ws... dot-item [ws...] - "listitem_dot" : r"^(\N+)(\.)(\s*)", - # || - "tablerow" : r"^\|\|", - - # Region contents: - # Inline patterns: - "fontstyle" : r"('{2,6})", - "larger" : r"~\+", - "monospace" : r"`", - "rule" : r"(-----*)", # ----... - "smaller" : r"~-", - "sub" : r",,", - "super" : r"\^", - "underline" : r"__", - - # Inline contents: - "largerend" : r"\+~", - "monospaceend" : r"`", - "smallerend" : r"-~", - "subend" : r",,", - "superend" : r"\^", - "underlineend" : r"__", - - # Heading contents: - "headingend" : r"(\N+)(=+)(\N*$)", # ws... =... [ws...] nl - - # List contents: - "deftermend" : r"::(\s*?\n)", - "deftermsep" : r"::(\s+)", - "listitemend" : r"^", # next line - - # Table contents: - "tableattrs" : r"<", - "tablecell" : r"\|\|", - "tableend" : r"(\s*?)^", # [ws...] next line - - # Table attributes: - "tableattrsend" : r">", - "halign" : r"([(:)])", - "valign" : r"([v^])", - "colour" : r"(\#[0-9A-F]{6})", - "colspan" : r"-(\d+)", - "rowspan" : r"\|(\d+)", - "width" : r"(\d+%)", - "attrname" : r"((?![-\d])[-\w]+)", # not-dash-or-digit dash-or-word-char... - "attrvalue" : r"""=(?P['"])(.*?)(?P=x)""", - } - - patterns = get_patterns(syntax) - - - - # Pattern details. - - table_pattern_names = [ - "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend", - "valign", "width" - ] - - inline_pattern_names = [ - "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline", - ] - - listitem_pattern_names = inline_pattern_names + ["listitemend"] - - region_pattern_names = inline_pattern_names + [ - "break", "heading", "defterm", "defterm_empty", "listitem", - "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman", - "regionstart", "regionend", "rule", "tablerow", - ] - - table_region_pattern_names = inline_pattern_names + [ - "tableattrs", "tablecell", "tableend" - ] - - def inline_patterns_for(self, name): - names = self.inline_pattern_names[:] - names[names.index(name)] = "%send" % name - return names - - - - # Pattern handlers. - - end_region = ParserBase.end_region - - handlers = { - None : end_region, - "attrname" : parse_attrname, - "break" : parse_break, - "colour" : parse_colour, - "colspan" : parse_colspan, - "defterm" : parse_defterm, - "defterm_empty" : parse_defterm_empty, - "deftermend" : end_region, - "deftermsep" : end_region, - "fontstyle" : parse_fontstyle, - "halign" : parse_halign, - "heading" : parse_heading, - "headingend" : parse_heading_end, - "larger" : parse_larger, - "largerend" : end_region, - "listitemend" : end_region, - "listitem" : parse_listitem, - "listitem_alpha" : parse_listitem, - "listitem_dot" : parse_listitem, - "listitem_num" : parse_listitem, - "listitem_roman" : parse_listitem, - "monospace" : parse_monospace, - "monospaceend" : end_region, - "regionstart" : parse_section, - "regionend" : parse_section_end, - "rowspan" : parse_rowspan, - "rule" : parse_rule, - "smaller" : parse_smaller, - "smallerend" : end_region, - "sub" : parse_sub, - "subend" : end_region, - "super" : parse_super, - "superend" : end_region, - "tableattrs" : parse_table_attrs, - "tableattrsend" : end_region, - "tablerow" : parse_table_row, - "tablecell" : end_region, - "tableend" : end_region, - "underline" : parse_underline, - "underlineend" : end_region, - "valign" : parse_valign, - "width" : parse_width, - } - - - -# Top-level functions. - -def parse(s, formats=None): - return Parser(formats).parse(s) +from moinformat.parsers import parse, parsers +from moinformat.serialisers import serialise, serialisers # vim: tabstop=4 expandtab shiftwidth=4 diff -r f753f631d055 -r 083967e32406 moinformat/parsers/__init__.py --- a/moinformat/parsers/__init__.py Tue Dec 12 22:53:20 2017 +0100 +++ b/moinformat/parsers/__init__.py Wed Dec 13 00:50:09 2017 +0100 @@ -21,4 +21,9 @@ from moinformat.parsers.manifest import parsers +# Top-level functions. + +def parse(s, formats=None): + return parsers["moin"](formats).parse(s) + # vim: tabstop=4 expandtab shiftwidth=4 diff -r f753f631d055 -r 083967e32406 moinformat/parsers/common.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/parsers/common.py Wed Dec 13 00:50:09 2017 +0100 @@ -0,0 +1,328 @@ +#!/usr/bin/env python + +""" +Moin wiki parsing functionality. + +Copyright (C) 2017 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +from moinformat.tree import Block, Region, Text +import re + +# Pattern management. + +ws_excl_nl = r"[ \f\r\t\v]" + +def get_patterns(syntax): + + """ + Define patterns for the regular expressions in the 'syntax' mapping. In each + pattern, replace \N with a pattern for matching whitespace excluding + newlines. + """ + + patterns = {} + for name, value in syntax.items(): + value = value.replace(r"\N", ws_excl_nl) + patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE) + return patterns + +def get_subset(d, keys): + + "Return a subset of 'd' having the given 'keys'." + + subset = {} + for key in keys: + subset[key] = d[key] + return subset + + + +# Tokenising functions. + +class TokenStream: + + "A stream of tokens taken from a string." + + def __init__(self, s, pos=0): + self.s = s + self.pos = pos + self.match = None + self.matching = None + + def rewind(self, length): + + "Rewind in the string by 'length'." + + self.pos -= min(length, self.pos) + + def read_until(self, patterns, remaining=True): + + """ + Find the first match for the given 'patterns'. Return the text preceding + any match, the remaining text if no match was found, or None if no match + was found and 'remaining' is given as a false value. + """ + + first = None + self.matching = None + + # Find the first matching pattern. + + for pattern_name, pattern in patterns.items(): + match = pattern.search(self.s, self.pos) + if match: + start, end = match.span() + if self.matching is None or start < first: + first = start + self.matching = pattern_name + self.match = match + + if self.matching is None: + if remaining: + return self.s[self.pos:] + else: + return None + else: + return self.s[self.pos:first] + + def read_match(self, group=1): + + """ + Return the matched text, updating the position in the stream. If 'group' + is specified, the indicated group in a match will be returned. + Typically, group 1 should contain all pertinent data, but groups defined + within group 1 can provide sections of the data. + """ + + if self.match: + _start, self.pos = self.match.span() + try: + return self.match.group(group) + except IndexError: + return "" + else: + self.pos = len(self.s) + return None + + + +# Utility functions. + +def new_block(region): + + "Start a new block in 'region'." + + region.add(Block([])) + + + +# Parser abstractions. + +class ParserBase: + + "Common parsing methods." + + region_pattern_names = None + + def __init__(self, formats=None): + + """ + Initialise the parser with any given 'formats' mapping from region type + names to parser objects. + """ + + self.formats = formats + + def get_parser(self, format_type): + + """ + Return a parser for 'format_type' or None if no suitable parser is found. + """ + + if not self.formats: + return None + + cls = self.formats.get(format_type) + if cls: + return cls(self.formats) + else: + return None + + def get_patterns(self, pattern_names): + + "Return a mapping of the given 'pattern_names' to patterns." + + return get_subset(self.patterns, pattern_names) + + def get_items(self, s, pos=0): + + "Return a sequence of token items for 's' and 'pos'." + + return TokenStream(s, pos) + + def set_region(self, items, region): + + "Set the 'items' used to populate the given 'region'." + + self.items = items + self.region = region + + def read_until(self, pattern_names, remaining=True): + + """ + Read the next portion of input, matching using 'pattern_names'. Return + the text preceding any match, the remaining text if no match was found, + or None if no match was found and 'remaining' is given as a false value. + """ + + return self.items.read_until(self.get_patterns(pattern_names)) + + def read_match(self, group=1): + + """ + Return the group of the matching pattern with the given 'group' number. + """ + + return self.items.read_match(group) + + def read_matching(self): + + "Return the name of the matching pattern." + + return self.items.matching + + # Parser methods invoked from other objects. + + def parse(self, s): + + """ + Parse page text 's'. Pages consist of regions delimited by markers. + """ + + self.items = self.get_items(s) + self.region = self.parse_region() + return self.region + + def parse_region_content(self, items, region): + + "Parse the data provided by 'items' to populate a 'region'." + + self.set_region(items, region) + + # Define a block to hold text and start parsing. + + new_block(region) + + if self.region_pattern_names: + self.parse_region_details(region, self.region_pattern_names) + + # Top-level parser handler methods. + + def parse_region(self, level=0, indent=0): + + """ + Parse the data to populate a region with the given 'level' at the given + 'indent'. + """ + + region = Region([], level, indent) + + # Parse section headers, then parse according to region type. + + self.parse_region_header(region) + self.parse_region_type(region) + + return region + + def parse_region_type(self, region): + + """ + Use configured parsers to parse 'region' based on its type. + """ + + # Find an appropriate parser given the type. + + parser = self.get_parser(region.type) + + if parser: + parser.parse_region_content(self.items, region) + + # Otherwise, treat the section as opaque. + + else: + self.parse_region_opaque(region) + + def parse_region_header(self, region): + + """ + Parse the region header, setting it on the 'region' object. + """ + + if self.read_until(["header"], False) == "": # None means no header + region.type = self.read_match() + + def parse_region_opaque(self, region): + + "Parse the data to populate an opaque 'region'." + + region.transparent = False + self.parse_region_details(region, ["regionend"]) + + # Parsing utilities. + + def parse_region_details(self, region, pattern_names): + + "Search 'region' using the 'pattern_names'." + + try: + while True: + + # Obtain text before any marker or the end of the input. + + preceding = self.read_until(pattern_names) + if preceding: + region.append_inline(Text(preceding)) + + # End of input. + + if not self.read_matching(): + break + + # Obtain any feature. + + feature = self.read_match() + handler = self.handlers.get(self.read_matching()) + + # Handle each feature or add text to the region. + + if handler: + handler(self, region) + else: + region.append_inline(Text(feature)) + + except StopIteration: + pass + + region.normalise() + + def end_region(self, region): + + "End the parsing of 'region', breaking out of the parsing loop." + + raise StopIteration + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r f753f631d055 -r 083967e32406 moinformat/parsers/moin.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/parsers/moin.py Wed Dec 13 00:50:09 2017 +0100 @@ -0,0 +1,556 @@ +#!/usr/bin/env python + +""" +Moin wiki format parser. + +Copyright (C) 2017 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +from moinformat.parsers.common import ParserBase, get_patterns, get_subset, new_block +from moinformat.serialisers import serialise +from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \ + Larger, ListItem, Monospace, Region, Rule, Smaller, \ + Subscript, Superscript, Table, TableAttr, \ + TableAttrs, TableCell, TableRow, Text, Underline + +class MoinParser(ParserBase): + + "A wiki region parser." + + def __init__(self, formats=None): + + """ + Initialise the parser with any given 'formats' mapping from region type + names to parser objects. + """ + + # Introduce this class as the default parser for the wiki format. + + default_formats = {"wiki" : MoinParser, "moin" : MoinParser} + if formats: + default_formats.update(formats) + + ParserBase.__init__(self, default_formats) + + # Principal parser methods. + + def parse(self, s): + + """ + Parse page text 's'. Pages consist of regions delimited by markers. + """ + + self.items = self.get_items(s) + self.region = Region([]) + + # Parse page header. + + self.parse_region_header(self.region) + + # Handle pages directly with this parser. Pages do not need to use an + # explicit format indicator. + + if not self.region.type: + self.parse_region_content(self.items, self.region) + + # Otherwise, test the type and find an appropriate parser. + + else: + self.parse_region_type(self.region) + + return self.region + + + + # Parser methods supporting different page features. + + def parse_attrname(self, attrs): + + "Handle an attribute name within 'attrs'." + + name = self.read_match() + attr = TableAttr(name) + + preceding = self.read_until(["attrvalue"], False) + if preceding == "": + attr.quote = self.read_match(1) + attr.value = self.read_match(2) + + attrs.append(attr) + + def parse_break(self, region): + + "Handle a paragraph break within 'region'." + + region.add(Break()) + new_block(region) + + def parse_defitem(self, region, extra=""): + + "Handle a definition item within 'region'." + + pad = self.read_match(1) + item = DefItem([], pad, extra) + self.parse_region_details(item, ["listitemend"]) + region.add(item) + new_block(region) + + def parse_defterm(self, region): + + "Handle a definition term within 'region'." + + pad = self.read_match(1) + term = DefTerm([], pad) + self.parse_region_details(term, ["deftermend", "deftermsep"]) + region.add(term) + if self.read_matching() == "deftermsep": + self.parse_defitem(region) + + def parse_defterm_empty(self, region): + + "Handle an empty definition term within 'region'." + + extra = self.read_match(1) + self.parse_region_details(region, ["deftermsep"]) + self.parse_defitem(region, extra) + + def parse_fontstyle(self, region): + + "Handle emphasis and strong styles." + + n = len(self.read_match(1)) + + # Handle endings. + + if isinstance(region, FontStyle): + emphasis = n in (2, 4, 5) + strong = n in (3, 5, 6) + active = True + + if region.emphasis and emphasis: + active = region.close_emphasis() + n -= 2 + if region.strong and strong: + active = region.close_strong() + n -= 3 + + if not active: + if n: + self.items.rewind(n) + raise StopIteration + + elif not n: + return + + # Handle new styles. + + emphasis = n in (2, 4, 5) + strong = n in (3, 5, 6) + double = n in (4, 6) + + span = FontStyle([], emphasis, strong) + if not double: + self.parse_region_details(span, self.inline_pattern_names) + region.append_inline(span) + + def parse_halign(self, attrs): + + "Handle horizontal alignment within 'attrs'." + + value = self.read_match() + attr = TableAttr("halign", value == "(" and "left" or value == ")" and "right" or "center", True) + attrs.append(attr) + + def parse_heading(self, region): + + "Handle a heading." + + start_extra = self.read_match(1) + level = len(self.read_match(2)) + start_pad = self.read_match(3) + heading = Heading([], level, start_extra, start_pad) + self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names) + region.add(heading) + new_block(region) + + def parse_heading_end(self, heading): + + "Handle the end of a heading." + + level = len(self.read_match(2)) + if heading.level == level: + heading.end_pad = self.read_match(1) + heading.end_extra = self.read_match(3) + raise StopIteration + + def parse_listitem(self, region): + + "Handle a list item marker within 'region'." + + indent = len(self.read_match(1)) + marker = self.read_match(2) + space = self.read_match(3) + item = ListItem([], indent, marker, space) + self.parse_region_details(item, self.listitem_pattern_names) + region.add(item) + new_block(region) + + def parse_rule(self, region): + + "Handle a horizontal rule within 'region'." + + length = len(self.read_match(1)) + rule = Rule(length) + region.add(rule) + new_block(region) + + def parse_section(self, region): + + "Handle the start of a new section within 'region'." + + # Parse the section and start a new block after the section. + + indent = len(self.read_match(2)) + level = len(self.read_match(3)) + region.add(self.parse_region(level, indent)) + new_block(region) + + def parse_section_end(self, region): + + "Handle the end of a new section within 'region'." + + feature = self.read_match() + if region.have_end(feature): + raise StopIteration + else: + region.append_inline(Text(feature)) + + def parse_table_attrs(self, cell): + + "Handle the start of table attributes within 'cell'." + + attrs = TableAttrs([]) + self.parse_region_details(attrs, self.table_pattern_names) + + # Test the validity of the attributes. + + last = None + + for node in attrs.nodes: + + # Text separator nodes must be whitespace. + + if isinstance(node, Text): + if node.s.strip(): + break + + # Named attributes must be preceded by space if not the first. + + elif last and not node.concise and not isinstance(last, Text): + break + + last = node + + # All nodes were valid: preserve the collection. + + else: + cell.attrs = attrs + return + + # Invalid nodes were found: serialise the attributes as text. + + cell.append_inline(Text(serialise(attrs))) + + def parse_table_row(self, region): + + "Handle the start of a table row within 'region'." + + # Identify any active table. + + table = region.node(-2) + block = region.node(-1) + + if not (isinstance(table, Table) and block.empty()): + new_table = table = Table([]) + else: + new_table = None + + row = TableRow([]) + + while True: + cell = TableCell([]) + self.parse_region_details(cell, self.table_region_pattern_names) + + # Handle the end of the row. + + if self.read_matching() == "tableend": + trailing = self.read_match() + + # If the cell was started but not finished, convert the row into text. + + if not row.nodes or not cell.empty(): + for node in row.nodes: + region.append_inline(Text(serialise(node))) + region.append_inline(Text(serialise(cell))) + region.append_inline(Text(trailing)) + + new_block(region) + return + + # Append the final cell, if not empty. + + else: + row.trailing = trailing + + if not cell.empty(): + row.append(cell) + break + + # A cell separator has been found. + + row.append(cell) + + # Add the row to the table and any new table to the region. + + table.add(row) + if new_table: + region.add(new_table) + + new_block(region) + + def parse_valign(self, attrs): + + "Handle vertical alignment within 'attrs'." + + value = self.read_match() + attr = TableAttr("valign", value == "^" and "top" or "bottom", True) + attrs.append(attr) + + + + # Inline formatting handlers. + + def parse_inline(self, region, cls, pattern_name): + + "Handle an inline region." + + span = cls([]) + self.parse_region_details(span, self.inline_patterns_for(pattern_name)) + region.append_inline(span) + + def parse_larger(self, region): + self.parse_inline(region, Larger, "larger") + + def parse_monospace(self, region): + self.parse_inline(region, Monospace, "monospace") + + def parse_smaller(self, region): + self.parse_inline(region, Smaller, "smaller") + + def parse_sub(self, region): + self.parse_inline(region, Subscript, "sub") + + def parse_super(self, region): + self.parse_inline(region, Superscript, "super") + + def parse_underline(self, region): + self.parse_inline(region, Underline, "underline") + + + + # Table attribute handlers. + + def parse_table_attr(self, attrs, pattern_name): + + "Handle a table attribute." + + attrs.append(TableAttr(pattern_name, self.read_match(), True)) + + def parse_colour(self, cell): + self.parse_table_attr(cell, "colour") + + def parse_colspan(self, cell): + self.parse_table_attr(cell, "colspan") + + def parse_rowspan(self, cell): + self.parse_table_attr(cell, "rowspan") + + def parse_width(self, cell): + self.parse_table_attr(cell, "width") + + + + # Regular expressions. + + syntax = { + # Page regions: + "regionstart" : r"((^\N*)([{]{3,}))", # {{{... + "regionend" : r"^\N*([}]{3,})", # }}}... + "header" : r"#!(.*?)\n", # #! char-excl-nl + + # Region contents: + # Line-oriented patterns: + # blank line + "break" : r"^(\s*?)\n", + # ws... expecting text :: + "defterm" : r"^(\N+)(?=.+?::)", + # ws... expecting :: ws... + "defterm_empty" : r"^(\N+)(?=::\s+)", + # [ws...] =... ws... expecting headingend + "heading" : r"^(\N*)(?P=+)(\s+)(?=.*?\N+(?P=x)\N*$)", + # ws... list-item [ws...] + "listitem" : r"^(\N+)(\*)(\s*)", + # ws... number-item ws... + "listitem_num" : r"^(\N+)(\d+\.)(\s+)", + # ws... alpha-item ws... + "listitem_alpha": r"^(\N+)([aA]\.)(\s+)", + # ws... roman-item ws... + "listitem_roman": r"^(\N+)([iI]\.)(\s+)", + # ws... dot-item [ws...] + "listitem_dot" : r"^(\N+)(\.)(\s*)", + # || + "tablerow" : r"^\|\|", + + # Region contents: + # Inline patterns: + "fontstyle" : r"('{2,6})", + "larger" : r"~\+", + "monospace" : r"`", + "rule" : r"(-----*)", # ----... + "smaller" : r"~-", + "sub" : r",,", + "super" : r"\^", + "underline" : r"__", + + # Inline contents: + "largerend" : r"\+~", + "monospaceend" : r"`", + "smallerend" : r"-~", + "subend" : r",,", + "superend" : r"\^", + "underlineend" : r"__", + + # Heading contents: + "headingend" : r"(\N+)(=+)(\N*$)", # ws... =... [ws...] nl + + # List contents: + "deftermend" : r"::(\s*?\n)", + "deftermsep" : r"::(\s+)", + "listitemend" : r"^", # next line + + # Table contents: + "tableattrs" : r"<", + "tablecell" : r"\|\|", + "tableend" : r"(\s*?)^", # [ws...] next line + + # Table attributes: + "tableattrsend" : r">", + "halign" : r"([(:)])", + "valign" : r"([v^])", + "colour" : r"(\#[0-9A-F]{6})", + "colspan" : r"-(\d+)", + "rowspan" : r"\|(\d+)", + "width" : r"(\d+%)", + "attrname" : r"((?![-\d])[-\w]+)", # not-dash-or-digit dash-or-word-char... + "attrvalue" : r"""=(?P['"])(.*?)(?P=x)""", + } + + patterns = get_patterns(syntax) + + + + # Pattern details. + + table_pattern_names = [ + "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend", + "valign", "width" + ] + + inline_pattern_names = [ + "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline", + ] + + listitem_pattern_names = inline_pattern_names + ["listitemend"] + + region_pattern_names = inline_pattern_names + [ + "break", "heading", "defterm", "defterm_empty", "listitem", + "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman", + "regionstart", "regionend", "rule", "tablerow", + ] + + table_region_pattern_names = inline_pattern_names + [ + "tableattrs", "tablecell", "tableend" + ] + + def inline_patterns_for(self, name): + names = self.inline_pattern_names[:] + names[names.index(name)] = "%send" % name + return names + + + + # Pattern handlers. + + end_region = ParserBase.end_region + + handlers = { + None : end_region, + "attrname" : parse_attrname, + "break" : parse_break, + "colour" : parse_colour, + "colspan" : parse_colspan, + "defterm" : parse_defterm, + "defterm_empty" : parse_defterm_empty, + "deftermend" : end_region, + "deftermsep" : end_region, + "fontstyle" : parse_fontstyle, + "halign" : parse_halign, + "heading" : parse_heading, + "headingend" : parse_heading_end, + "larger" : parse_larger, + "largerend" : end_region, + "listitemend" : end_region, + "listitem" : parse_listitem, + "listitem_alpha" : parse_listitem, + "listitem_dot" : parse_listitem, + "listitem_num" : parse_listitem, + "listitem_roman" : parse_listitem, + "monospace" : parse_monospace, + "monospaceend" : end_region, + "regionstart" : parse_section, + "regionend" : parse_section_end, + "rowspan" : parse_rowspan, + "rule" : parse_rule, + "smaller" : parse_smaller, + "smallerend" : end_region, + "sub" : parse_sub, + "subend" : end_region, + "super" : parse_super, + "superend" : end_region, + "tableattrs" : parse_table_attrs, + "tableattrsend" : end_region, + "tablerow" : parse_table_row, + "tablecell" : end_region, + "tableend" : end_region, + "underline" : parse_underline, + "underlineend" : end_region, + "valign" : parse_valign, + "width" : parse_width, + } + +parser = MoinParser + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r f753f631d055 -r 083967e32406 moinformat/parsers/table.py --- a/moinformat/parsers/table.py Tue Dec 12 22:53:20 2017 +0100 +++ b/moinformat/parsers/table.py Wed Dec 13 00:50:09 2017 +0100 @@ -19,15 +19,15 @@ this program. If not, see . """ -from moinformat.parsing import get_patterns +from moinformat.parsers.common import get_patterns +from moinformat.parsers.moin import MoinParser from moinformat.tree import Table, TableAttrs, TableCell, TableRow, Text -from moinformat import Parser # Parser functionality. -class TableParser(Parser): +class TableParser(MoinParser): "A parser for improved table syntax." @@ -85,7 +85,7 @@ # Regular expressions. syntax = {} - syntax.update(Parser.syntax) + syntax.update(MoinParser.syntax) syntax.update({ # At start of line: "rowsep" : r"^==(?!.*==\s*?$)(?=\N*?)", # == not-heading ws-excl-nl @@ -101,7 +101,7 @@ # Pattern details. - table_region_pattern_names = Parser.region_pattern_names + [ + table_region_pattern_names = MoinParser.region_pattern_names + [ "columnsep", "continuation", "regionend", "rowsep", ] @@ -110,11 +110,11 @@ # Pattern handlers. handlers = {} - handlers.update(Parser.handlers) + handlers.update(MoinParser.handlers) handlers.update({ - "columnsep" : Parser.end_region, + "columnsep" : MoinParser.end_region, "continuation" : parse_continuation, - "rowsep" : Parser.end_region, + "rowsep" : MoinParser.end_region, "regionend" : parse_table_end, }) diff -r f753f631d055 -r 083967e32406 moinformat/parsing.py --- a/moinformat/parsing.py Tue Dec 12 22:53:20 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,328 +0,0 @@ -#!/usr/bin/env python - -""" -Moin wiki parsing functionality. - -Copyright (C) 2017 Paul Boddie - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; either version 3 of the License, or (at your option) any later -version. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -details. - -You should have received a copy of the GNU General Public License along with -this program. If not, see . -""" - -from moinformat.tree import Block, Region, Text -import re - -# Pattern management. - -ws_excl_nl = r"[ \f\r\t\v]" - -def get_patterns(syntax): - - """ - Define patterns for the regular expressions in the 'syntax' mapping. In each - pattern, replace \N with a pattern for matching whitespace excluding - newlines. - """ - - patterns = {} - for name, value in syntax.items(): - value = value.replace(r"\N", ws_excl_nl) - patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE) - return patterns - -def get_subset(d, keys): - - "Return a subset of 'd' having the given 'keys'." - - subset = {} - for key in keys: - subset[key] = d[key] - return subset - - - -# Tokenising functions. - -class TokenStream: - - "A stream of tokens taken from a string." - - def __init__(self, s, pos=0): - self.s = s - self.pos = pos - self.match = None - self.matching = None - - def rewind(self, length): - - "Rewind in the string by 'length'." - - self.pos -= min(length, self.pos) - - def read_until(self, patterns, remaining=True): - - """ - Find the first match for the given 'patterns'. Return the text preceding - any match, the remaining text if no match was found, or None if no match - was found and 'remaining' is given as a false value. - """ - - first = None - self.matching = None - - # Find the first matching pattern. - - for pattern_name, pattern in patterns.items(): - match = pattern.search(self.s, self.pos) - if match: - start, end = match.span() - if self.matching is None or start < first: - first = start - self.matching = pattern_name - self.match = match - - if self.matching is None: - if remaining: - return self.s[self.pos:] - else: - return None - else: - return self.s[self.pos:first] - - def read_match(self, group=1): - - """ - Return the matched text, updating the position in the stream. If 'group' - is specified, the indicated group in a match will be returned. - Typically, group 1 should contain all pertinent data, but groups defined - within group 1 can provide sections of the data. - """ - - if self.match: - _start, self.pos = self.match.span() - try: - return self.match.group(group) - except IndexError: - return "" - else: - self.pos = len(self.s) - return None - - - -# Utility functions. - -def new_block(region): - - "Start a new block in 'region'." - - region.add(Block([])) - - - -# Parser abstractions. - -class ParserBase: - - "Common parsing methods." - - region_pattern_names = None - - def __init__(self, formats=None): - - """ - Initialise the parser with any given 'formats' mapping from region type - names to parser objects. - """ - - self.formats = formats - - def get_parser(self, format_type): - - """ - Return a parser for 'format_type' or None if no suitable parser is found. - """ - - if not self.formats: - return None - - cls = self.formats.get(format_type) - if cls: - return cls(self.formats) - else: - return None - - def get_patterns(self, pattern_names): - - "Return a mapping of the given 'pattern_names' to patterns." - - return get_subset(self.patterns, pattern_names) - - def get_items(self, s, pos=0): - - "Return a sequence of token items for 's' and 'pos'." - - return TokenStream(s, pos) - - def set_region(self, items, region): - - "Set the 'items' used to populate the given 'region'." - - self.items = items - self.region = region - - def read_until(self, pattern_names, remaining=True): - - """ - Read the next portion of input, matching using 'pattern_names'. Return - the text preceding any match, the remaining text if no match was found, - or None if no match was found and 'remaining' is given as a false value. - """ - - return self.items.read_until(self.get_patterns(pattern_names)) - - def read_match(self, group=1): - - """ - Return the group of the matching pattern with the given 'group' number. - """ - - return self.items.read_match(group) - - def read_matching(self): - - "Return the name of the matching pattern." - - return self.items.matching - - # Parser methods invoked from other objects. - - def parse(self, s): - - """ - Parse page text 's'. Pages consist of regions delimited by markers. - """ - - self.items = self.get_items(s) - self.region = self.parse_region() - return self.region - - def parse_region_content(self, items, region): - - "Parse the data provided by 'items' to populate a 'region'." - - self.set_region(items, region) - - # Define a block to hold text and start parsing. - - new_block(region) - - if self.region_pattern_names: - self.parse_region_details(region, self.region_pattern_names) - - # Top-level parser handler methods. - - def parse_region(self, level=0, indent=0): - - """ - Parse the data to populate a region with the given 'level' at the given - 'indent'. - """ - - region = Region([], level, indent) - - # Parse section headers, then parse according to region type. - - self.parse_region_header(region) - self.parse_region_type(region) - - return region - - def parse_region_type(self, region): - - """ - Use configured parsers to parse 'region' based on its type. - """ - - # Find an appropriate parser given the type. - - parser = self.get_parser(region.type) - - if parser: - parser.parse_region_content(self.items, region) - - # Otherwise, treat the section as opaque. - - else: - self.parse_region_opaque(region) - - def parse_region_header(self, region): - - """ - Parse the region header, setting it on the 'region' object. - """ - - if self.read_until(["header"], False) == "": # None means no header - region.type = self.read_match() - - def parse_region_opaque(self, region): - - "Parse the data to populate an opaque 'region'." - - region.transparent = False - self.parse_region_details(region, ["regionend"]) - - # Parsing utilities. - - def parse_region_details(self, region, pattern_names): - - "Search 'region' using the 'pattern_names'." - - try: - while True: - - # Obtain text before any marker or the end of the input. - - preceding = self.read_until(pattern_names) - if preceding: - region.append_inline(Text(preceding)) - - # End of input. - - if not self.read_matching(): - break - - # Obtain any feature. - - feature = self.read_match() - handler = self.handlers.get(self.read_matching()) - - # Handle each feature or add text to the region. - - if handler: - handler(self, region) - else: - region.append_inline(Text(feature)) - - except StopIteration: - pass - - region.normalise() - - def end_region(self, region): - - "End the parsing of 'region', breaking out of the parsing loop." - - raise StopIteration - -# vim: tabstop=4 expandtab shiftwidth=4 diff -r f753f631d055 -r 083967e32406 tests/test_parser.py --- a/tests/test_parser.py Tue Dec 12 22:53:20 2017 +0100 +++ b/tests/test_parser.py Wed Dec 13 00:50:09 2017 +0100 @@ -1,21 +1,14 @@ #!/usr/bin/env python -from moinformat import parse -from moinformat.parsers import table -from moinformat.serialisers import serialise -from moinformat.serialisers.html import HTMLSerialiser +from moinformat import parse, parsers, serialise, serialisers from glob import glob from os.path import join, split import sys dirname = split(sys.argv[0])[0] -formats = { - "table" : table.TableParser, - } - def test_input(s): - d = parse(s, formats) + d = parse(s, parsers) o = serialise(d) print o == s @@ -29,7 +22,7 @@ print "-" * 60 print s print "-" * 60 - print serialise(d, HTMLSerialiser) + print serialise(d, serialisers["html"]) print "-" * 60 print d.prettyprint() print