# HG changeset patch # User Paul Boddie # Date 1493480823 -7200 # Node ID e1ed135d019fa20910d6000d3f986ac53b151419 # Parent 3c29a8a6263548bbc6c107a77e0267e0803084ec Reorganised the code into a package. diff -r 3c29a8a62635 -r e1ed135d019f moinformat.py --- a/moinformat.py Fri Apr 28 18:56:50 2017 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,530 +0,0 @@ -#!/usr/bin/env python - -""" -Moin wiki format parser. - -Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; either version 3 of the License, or (at your option) any later -version. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -details. - -You should have received a copy of the GNU General Public License along with -this program. If not, see . -""" - -from cgi import escape -import re - -# Regular expressions. - -syntax = { - # Page regions: - "regionstart" : (r"((^\s*)([{]{3,}))", re.MULTILINE | re.DOTALL), # {{{... - "regionend" : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL), # }}}... - "header" : (r"#!(.*?)\n", 0), # #! char-excl-nl - - # Region contents: - "break" : (r"^(\s*?)\n", re.MULTILINE), # blank line - "listitem" : (r"^((\s+)([*]|\d+[.]))", re.MULTILINE), # indent (list-item or number-item) - - # List contents: - "listitemend" : (r"^", re.MULTILINE), # next line - } - -# Define patterns for the regular expressions. - -patterns = {} -for name, (value, flags) in syntax.items(): - patterns[name] = re.compile(value, re.UNICODE | flags) - - - -# Document nodes. - -class Container: - - "A container of document nodes." - - def __init__(self, nodes): - self.nodes = nodes - - def append(self, node): - self.nodes.append(node) - - append_text = append - - def empty(self): - return not self.nodes - - def normalise(self): - - "Combine adjacent text nodes." - - nodes = self.nodes - self.nodes = [] - text = None - - for node in nodes: - - # Open a text node or merge text into an open node. - - if isinstance(node, Text): - if not text: - text = node - else: - text.merge(node) - - # Close any open text node and append the current node. - - else: - if text: - self.append(text) - text = None - self.append(node) - - # Add any open text node. - - if text: - self.append(text) - - def __str__(self): - return self.prettyprint() - - def prettyprint(self, indent=""): - pass - -class Region(Container): - - "A region of the page." - - transparent_region_types = ["wiki"] - - def __init__(self, nodes, level=0, indent=0, type=None): - Container.__init__(self, nodes) - self.level = level - self.indent = indent - self.type = type - - def append(self, node): - last = self.nodes and self.nodes[-1] - if last and last.empty(): - self.nodes[-1] = node - else: - self.nodes.append(node) - - def append_text(self, s): - if self.is_transparent(): - self.nodes[-1].append(s) - else: - self.append(s) - - def have_end(self, s): - return self.level and s.startswith("}") and self.level == len(s) - - def is_transparent(self): - return not self.level or self.type in self.transparent_region_types - - def __repr__(self): - return "Region(%r, %r, %r, %r)" % (self.nodes, self.level, self.indent, self.type) - - def prettyprint(self, indent=""): - l = ["%sRegion: level=%d indent=%d type=%s" % (indent, self.level, self.indent, self.type)] - for node in self.nodes: - l.append(node.prettyprint(indent + " ")) - return "\n".join(l) - - def to_string(self, out): - out.start_region(self.level, self.indent, self.type) - for node in self.nodes: - node.to_string(out) - out.end_region(self.level, self.indent, self.type) - -class Block(Container): - - "A block in the page." - - def __init__(self, nodes, final=True): - Container.__init__(self, nodes) - self.final = final - - def __repr__(self): - return "Block(%r)" % self.nodes - - def prettyprint(self, indent=""): - l = ["%sBlock: final=%s" % (indent, self.final)] - for node in self.nodes: - l.append(node.prettyprint(indent + " ")) - return "\n".join(l) - - def to_string(self, out): - out.start_block(self.final) - for node in self.nodes: - node.to_string(out) - out.end_block(self.final) - -class ListItem(Container): - - "A list item." - - def __repr__(self): - return "ListItem(%r)" % self.nodes - - def prettyprint(self, indent=""): - l = ["%sListItem:" % indent] - for node in self.nodes: - l.append(node.prettyprint(indent + " ")) - return "\n".join(l) - - def to_string(self, out): - out.start_listitem() - for node in self.nodes: - node.to_string(out) - out.end_listitem() - - -class Text: - - "A text node." - - def __init__(self, s): - self.s = s - - def empty(self): - return not self.s - - def merge(self, text): - self.s += text.s - - def __repr__(self): - return "Text(%r)" % self.s - - def prettyprint(self, indent=""): - return "%sText: %r" % (indent, self.s) - - def to_string(self, out): - out.text(self.s) - - - -# Serialisation. - -class Serialiser: - - "General serialisation support." - - def __init__(self, out): - self.out = out - -class MoinSerialiser(Serialiser): - - "Serialisation of the page." - - def start_region(self, level, indent, type): - out = self.out - if level: - out(" " * indent + "{" * level) - if type and level: - out("#!%s\n" % type) - - def end_region(self, level, indent, type): - out = self.out - if level: - out("}" * level) - - def start_block(self, final): - pass - - def end_block(self, final): - if not final: - self.out("\n") - - def start_listitem(self): - self.out(" *") - - def end_listitem(self): - pass - - def text(self, s): - self.out(s) - -class HTMLSerialiser(Serialiser): - - "Serialisation of the page." - - def start_region(self, level, indent, type): - l = [] - out = l.append - if level: - out("level-%d" % level) - - if indent: - out("indent-%d" % indent) - - # NOTE: Encode type details for CSS. - - if type: - out("type-%s" % escape(type, True)) - - self.out("" % " ".join(l)) - - def end_region(self, level, indent, type): - self.out("") - - def start_block(self, final): - self.out("

") - - def end_block(self, final): - self.out("

") - - def start_listitem(self): - self.out("
  • ") - - def end_listitem(self): - self.out("
  • ") - - def text(self, s): - self.out(escape(s)) - - - -# Tokenising functions. - -class TokenStream: - - "A stream of tokens taken from a string." - - def __init__(self, s): - self.s = s - self.pos = 0 - self.match = None - self.matching = None - - def read_until(self, pattern_names, remaining=True): - - """ - Find the first match for the given 'pattern_names'. Return the text - preceding any match, the remaining text if no match was found, or None - if no match was found and 'remaining' is given as a false value. - """ - - first = None - self.matching = None - - # Find the first matching pattern. - - for pattern_name in pattern_names: - match = patterns[pattern_name].search(self.s, self.pos) - if match: - start, end = match.span() - if self.matching is None or start < first: - first = start - self.matching = pattern_name - self.match = match - - if self.matching is None: - if remaining: - return self.s[self.pos:] - else: - return None - else: - return self.s[self.pos:first] - - def read_match(self, group=1): - - """ - Return the matched text, updating the position in the stream. If 'group' - is specified, the indicated group in a match will be returned. - Typically, group 1 should contain all pertinent data, but groups defined - within group 1 can provide sections of the data. - """ - - if self.match: - _start, self.pos = self.match.span() - try: - return self.match.group(group) - except IndexError: - return "" - else: - self.pos = len(self.s) - return None - - - -# Parser functions. - -def parse_page(s): - - """ - Parse page text 's'. Pages consist of regions delimited by markers. - """ - - return parse_region(TokenStream(s)) - -def parse_region(items, level=0, indent=0): - - """ - Parse the data provided by 'items' to populate a region with the given - 'level' at the given 'indent'. - """ - - region = Region([], level, indent) - - # Parse section headers. - - parse_region_header(items, region) - - # Parse section body. - - if region.is_transparent(): - parse_region_wiki(items, region) - else: - parse_region_opaque(items, region) - - return region - -def parse_region_header(items, region): - - """ - Parse the region header from the 'items', setting it for the given 'region'. - """ - - if items.read_until(["header"], False) == "": # None means no header - region.type = items.read_match() - -def parse_region_wiki(items, region): - - "Parse the data provided by 'items' to populate a wiki 'region'." - - new_block(region) - parse_region_details(items, region, ["break", "listitem", "regionstart", "regionend"]) - -def parse_region_opaque(items, region): - - "Parse the data provided by 'items' to populate an opaque 'region'." - - parse_region_details(items, region, ["regionend"]) - -def parse_region_details(items, region, pattern_names): - - "Parse 'items' within 'region' searching using 'pattern_names'." - - try: - while True: - - # Obtain text before any marker or the end of the input. - - preceding = items.read_until(pattern_names) - if preceding: - region.append_text(Text(preceding)) - - # End of input. - - if not items.matching: - break - - # Obtain any feature. - - feature = items.read_match() - handler = handlers.get(items.matching) - - # Handle each feature or add text to the region. - - if handler: - handler(items, region) - else: - region.append_text(Text(feature)) - - except StopIteration: - pass - - region.normalise() - -def end_region(items, region): - - "End the parsing of 'region'." - - raise StopIteration - -def parse_break(items, region): - - "Handle a paragraph break within 'region'." - - # Mark any previous block as not being the final one in a sequence. - - block = region.nodes[-1] - block.final = False - new_block(region) - -def parse_listitem_end(items, region): - - "Handle the end of a list." - - raise StopIteration - -def parse_listitem(items, region): - - "Handle a list item marker within 'region'." - - item = ListItem([]) - parse_region_details(items, item, ["listitemend"]) - region.append(item) - new_block(region) - -def parse_section(items, region): - - "Handle the start of a new section within 'region'." - - # Parse the section and start a new block after the section. - - indent = len(items.read_match(2)) - level = len(items.read_match(3)) - region.append(parse_region(items, level, indent)) - new_block(region) - -def parse_section_end(items, region): - - "Handle the end of a new section within 'region'." - - feature = items.read_match() - if region.have_end(feature): - raise StopIteration - else: - region.append_text(Text(feature)) - -# Pattern handlers. - -handlers = { - None : end_region, - "break" : parse_break, - "listitemend" : parse_listitem_end, - "listitem" : parse_listitem, - "regionstart" : parse_section, - "regionend" : parse_section_end, - } - -def new_block(region): - - "Start a new block in 'region'." - - block = Block([]) - region.append(block) - - - -# Top-level functions. - -parse = parse_page - -def serialise(doc, serialiser=MoinSerialiser): - l = [] - doc.to_string(serialiser(l.append)) - return "".join(l) - -# vim: tabstop=4 expandtab shiftwidth=4 diff -r 3c29a8a62635 -r e1ed135d019f moinformat/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/__init__.py Sat Apr 29 17:47:03 2017 +0200 @@ -0,0 +1,277 @@ +#!/usr/bin/env python + +""" +Moin wiki format parser. + +Copyright (C) 2017 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +from moinformat.tree import Region, Block, ListItem, Text +import re + +# Regular expressions. + +syntax = { + # Page regions: + "regionstart" : (r"((^\s*)([{]{3,}))", re.MULTILINE | re.DOTALL), # {{{... + "regionend" : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL), # }}}... + "header" : (r"#!(.*?)\n", 0), # #! char-excl-nl + + # Region contents: + "break" : (r"^(\s*?)\n", re.MULTILINE), # blank line + "listitem" : (r"^((\s+)([*]|\d+[.]))", re.MULTILINE), # indent (list-item or number-item) + + # List contents: + "listitemend" : (r"^", re.MULTILINE), # next line + } + +# Define patterns for the regular expressions. + +patterns = {} +for name, (value, flags) in syntax.items(): + patterns[name] = re.compile(value, re.UNICODE | flags) + + + +# Tokenising functions. + +class TokenStream: + + "A stream of tokens taken from a string." + + def __init__(self, s): + self.s = s + self.pos = 0 + self.match = None + self.matching = None + + def read_until(self, pattern_names, remaining=True): + + """ + Find the first match for the given 'pattern_names'. Return the text + preceding any match, the remaining text if no match was found, or None + if no match was found and 'remaining' is given as a false value. + """ + + first = None + self.matching = None + + # Find the first matching pattern. + + for pattern_name in pattern_names: + match = patterns[pattern_name].search(self.s, self.pos) + if match: + start, end = match.span() + if self.matching is None or start < first: + first = start + self.matching = pattern_name + self.match = match + + if self.matching is None: + if remaining: + return self.s[self.pos:] + else: + return None + else: + return self.s[self.pos:first] + + def read_match(self, group=1): + + """ + Return the matched text, updating the position in the stream. If 'group' + is specified, the indicated group in a match will be returned. + Typically, group 1 should contain all pertinent data, but groups defined + within group 1 can provide sections of the data. + """ + + if self.match: + _start, self.pos = self.match.span() + try: + return self.match.group(group) + except IndexError: + return "" + else: + self.pos = len(self.s) + return None + + + +# Parser functions. + +def parse_page(s): + + """ + Parse page text 's'. Pages consist of regions delimited by markers. + """ + + return parse_region(TokenStream(s)) + +def parse_region(items, level=0, indent=0): + + """ + Parse the data provided by 'items' to populate a region with the given + 'level' at the given 'indent'. + """ + + region = Region([], level, indent) + + # Parse section headers. + + parse_region_header(items, region) + + # Parse section body. + + if region.is_transparent(): + parse_region_wiki(items, region) + else: + parse_region_opaque(items, region) + + return region + +def parse_region_header(items, region): + + """ + Parse the region header from the 'items', setting it for the given 'region'. + """ + + if items.read_until(["header"], False) == "": # None means no header + region.type = items.read_match() + +def parse_region_wiki(items, region): + + "Parse the data provided by 'items' to populate a wiki 'region'." + + new_block(region) + parse_region_details(items, region, ["break", "listitem", "regionstart", "regionend"]) + +def parse_region_opaque(items, region): + + "Parse the data provided by 'items' to populate an opaque 'region'." + + parse_region_details(items, region, ["regionend"]) + +def parse_region_details(items, region, pattern_names): + + "Parse 'items' within 'region' searching using 'pattern_names'." + + try: + while True: + + # Obtain text before any marker or the end of the input. + + preceding = items.read_until(pattern_names) + if preceding: + region.append_text(Text(preceding)) + + # End of input. + + if not items.matching: + break + + # Obtain any feature. + + feature = items.read_match() + handler = handlers.get(items.matching) + + # Handle each feature or add text to the region. + + if handler: + handler(items, region) + else: + region.append_text(Text(feature)) + + except StopIteration: + pass + + region.normalise() + +def end_region(items, region): + + "End the parsing of 'region'." + + raise StopIteration + +def parse_break(items, region): + + "Handle a paragraph break within 'region'." + + # Mark any previous block as not being the final one in a sequence. + + block = region.nodes[-1] + block.final = False + new_block(region) + +def parse_listitem_end(items, region): + + "Handle the end of a list." + + raise StopIteration + +def parse_listitem(items, region): + + "Handle a list item marker within 'region'." + + item = ListItem([]) + parse_region_details(items, item, ["listitemend"]) + region.append(item) + new_block(region) + +def parse_section(items, region): + + "Handle the start of a new section within 'region'." + + # Parse the section and start a new block after the section. + + indent = len(items.read_match(2)) + level = len(items.read_match(3)) + region.append(parse_region(items, level, indent)) + new_block(region) + +def parse_section_end(items, region): + + "Handle the end of a new section within 'region'." + + feature = items.read_match() + if region.have_end(feature): + raise StopIteration + else: + region.append_text(Text(feature)) + +# Pattern handlers. + +handlers = { + None : end_region, + "break" : parse_break, + "listitemend" : parse_listitem_end, + "listitem" : parse_listitem, + "regionstart" : parse_section, + "regionend" : parse_section_end, + } + +def new_block(region): + + "Start a new block in 'region'." + + block = Block([]) + region.append(block) + + + +# Top-level functions. + +parse = parse_page + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r 3c29a8a62635 -r e1ed135d019f moinformat/serialisers.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/serialisers.py Sat Apr 29 17:47:03 2017 +0200 @@ -0,0 +1,108 @@ +#!/usr/bin/env python + +""" +Moin wiki serialisers. + +Copyright (C) 2017 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +from cgi import escape + +class Serialiser: + + "General serialisation support." + + def __init__(self, out): + self.out = out + +class MoinSerialiser(Serialiser): + + "Serialisation of the page." + + def start_region(self, level, indent, type): + out = self.out + if level: + out(" " * indent + "{" * level) + if type and level: + out("#!%s\n" % type) + + def end_region(self, level, indent, type): + out = self.out + if level: + out("}" * level) + + def start_block(self, final): + pass + + def end_block(self, final): + if not final: + self.out("\n") + + def start_listitem(self): + self.out(" *") + + def end_listitem(self): + pass + + def text(self, s): + self.out(s) + +class HTMLSerialiser(Serialiser): + + "Serialisation of the page." + + def start_region(self, level, indent, type): + l = [] + out = l.append + if level: + out("level-%d" % level) + + if indent: + out("indent-%d" % indent) + + # NOTE: Encode type details for CSS. + + if type: + out("type-%s" % escape(type, True)) + + self.out("" % " ".join(l)) + + def end_region(self, level, indent, type): + self.out("") + + def start_block(self, final): + self.out("

    ") + + def end_block(self, final): + self.out("

    ") + + def start_listitem(self): + self.out("
  • ") + + def end_listitem(self): + self.out("
  • ") + + def text(self, s): + self.out(escape(s)) + +# Top-level functions. + +def serialise(doc, serialiser=MoinSerialiser): + l = [] + doc.to_string(serialiser(l.append)) + return "".join(l) + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r 3c29a8a62635 -r e1ed135d019f moinformat/tree.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/tree.py Sat Apr 29 17:47:03 2017 +0200 @@ -0,0 +1,184 @@ +#!/usr/bin/env python + +""" +Moin wiki format document tree nodes. + +Copyright (C) 2017 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +class Container: + + "A container of document nodes." + + def __init__(self, nodes): + self.nodes = nodes + + def append(self, node): + self.nodes.append(node) + + append_text = append + + def empty(self): + return not self.nodes + + def normalise(self): + + "Combine adjacent text nodes." + + nodes = self.nodes + self.nodes = [] + text = None + + for node in nodes: + + # Open a text node or merge text into an open node. + + if isinstance(node, Text): + if not text: + text = node + else: + text.merge(node) + + # Close any open text node and append the current node. + + else: + if text: + self.append(text) + text = None + self.append(node) + + # Add any open text node. + + if text: + self.append(text) + + def __str__(self): + return self.prettyprint() + + def prettyprint(self, indent=""): + pass + +class Region(Container): + + "A region of the page." + + transparent_region_types = ["wiki"] + + def __init__(self, nodes, level=0, indent=0, type=None): + Container.__init__(self, nodes) + self.level = level + self.indent = indent + self.type = type + + def append(self, node): + last = self.nodes and self.nodes[-1] + if last and last.empty(): + self.nodes[-1] = node + else: + self.nodes.append(node) + + def append_text(self, s): + if self.is_transparent(): + self.nodes[-1].append(s) + else: + self.append(s) + + def have_end(self, s): + return self.level and s.startswith("}") and self.level == len(s) + + def is_transparent(self): + return not self.level or self.type in self.transparent_region_types + + def __repr__(self): + return "Region(%r, %r, %r, %r)" % (self.nodes, self.level, self.indent, self.type) + + def prettyprint(self, indent=""): + l = ["%sRegion: level=%d indent=%d type=%s" % (indent, self.level, self.indent, self.type)] + for node in self.nodes: + l.append(node.prettyprint(indent + " ")) + return "\n".join(l) + + def to_string(self, out): + out.start_region(self.level, self.indent, self.type) + for node in self.nodes: + node.to_string(out) + out.end_region(self.level, self.indent, self.type) + +class Block(Container): + + "A block in the page." + + def __init__(self, nodes, final=True): + Container.__init__(self, nodes) + self.final = final + + def __repr__(self): + return "Block(%r)" % self.nodes + + def prettyprint(self, indent=""): + l = ["%sBlock: final=%s" % (indent, self.final)] + for node in self.nodes: + l.append(node.prettyprint(indent + " ")) + return "\n".join(l) + + def to_string(self, out): + out.start_block(self.final) + for node in self.nodes: + node.to_string(out) + out.end_block(self.final) + +class ListItem(Container): + + "A list item." + + def __repr__(self): + return "ListItem(%r)" % self.nodes + + def prettyprint(self, indent=""): + l = ["%sListItem:" % indent] + for node in self.nodes: + l.append(node.prettyprint(indent + " ")) + return "\n".join(l) + + def to_string(self, out): + out.start_listitem() + for node in self.nodes: + node.to_string(out) + out.end_listitem() + +class Text: + + "A text node." + + def __init__(self, s): + self.s = s + + def empty(self): + return not self.s + + def merge(self, text): + self.s += text.s + + def __repr__(self): + return "Text(%r)" % self.s + + def prettyprint(self, indent=""): + return "%sText: %r" % (indent, self.s) + + def to_string(self, out): + out.text(self.s) + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r 3c29a8a62635 -r e1ed135d019f tests/test_parser.py --- a/tests/test_parser.py Fri Apr 28 18:56:50 2017 +0200 +++ b/tests/test_parser.py Sat Apr 29 17:47:03 2017 +0200 @@ -1,6 +1,7 @@ #!/usr/bin/env python -from moinformat import parse, serialise, HTMLSerialiser +from moinformat import parse +from moinformat.serialisers import serialise, HTMLSerialiser s0 = """\ Hello