# HG changeset patch # User Paul Boddie # Date 1493926873 -7200 # Node ID 4ebe552530c805b23503b0919fb70b4acbb60e84 # Parent 90273d805859c419336727337cb6718006df7f24 Introduced a parser class to allow parameterisation of the parsing activity. diff -r 90273d805859 -r 4ebe552530c8 moinformat/__init__.py --- a/moinformat/__init__.py Thu May 04 19:13:31 2017 +0200 +++ b/moinformat/__init__.py Thu May 04 21:41:13 2017 +0200 @@ -189,48 +189,7 @@ -# Parsing utilities. - -def parse_region_details(items, region, pattern_names): - - "Parse 'items' within 'region' searching using 'pattern_names'." - - try: - while True: - - # Obtain text before any marker or the end of the input. - - preceding = items.read_until(pattern_names) - if preceding: - region.append_inline(Text(preceding)) - - # End of input. - - if not items.matching: - break - - # Obtain any feature. - - feature = items.read_match() - handler = handlers.get(items.matching) - - # Handle each feature or add text to the region. - - if handler: - handler(items, region) - else: - region.append_inline(Text(feature)) - - except StopIteration: - pass - - region.normalise() - -def end_region(items, region): - - "End the parsing of 'region', breaking out of the parsing loop." - - raise StopIteration +# Utility functions. def new_block(region): @@ -241,401 +200,478 @@ -# Parser functions for different page features. +# Parser abstraction. -def parse_page(s): +class Parser: + + "An extensible parser." - """ - Parse page text 's'. Pages consist of regions delimited by markers. - """ + def __init__(self, formats=None): + self.formats = formats + + # Principal parser methods. - return parse_region(TokenStream(s)) + def parse_page(self, s): -def parse_region(items, level=0, indent=0): + """ + Parse page text 's'. Pages consist of regions delimited by markers. + """ - """ - Parse the data provided by 'items' to populate a region with the given - 'level' at the given 'indent'. - """ + return self.parse_region(TokenStream(s)) - region = Region([], level, indent) - - # Parse section headers. + def parse_region(self, items, level=0, indent=0): - parse_region_header(items, region) - - # Parse section body. + """ + Parse the data provided by 'items' to populate a region with the given + 'level' at the given 'indent'. + """ - if region.is_transparent(): - parse_region_wiki(items, region) - else: - parse_region_opaque(items, region) + region = Region([], level, indent) + + # Parse section headers. + + self.parse_region_header(items, region) + + # Parse section body. - return region + if region.is_transparent(): + self.parse_region_wiki(items, region) + else: + self.parse_region_opaque(items, region) -def parse_region_header(items, region): + return region + + def parse_region_header(self, items, region): - """ - Parse the region header from the 'items', setting it for the given 'region'. - """ + """ + Parse the region header from the 'items', setting it for the given 'region'. + """ - if items.read_until(["header"], False) == "": # None means no header - region.type = items.read_match() + if items.read_until(["header"], False) == "": # None means no header + region.type = items.read_match() -def parse_region_wiki(items, region): + def parse_region_wiki(self, items, region): - "Parse the data provided by 'items' to populate a wiki 'region'." + "Parse the data provided by 'items' to populate a wiki 'region'." - new_block(region) - parse_region_details(items, region, inline_pattern_names + [ - "break", "heading", - "defterm", "defterm_empty", - "listitem", "listitem_alpha", "listitem_dot", "listitem_num", - "listitem_roman", - "regionstart", "regionend", - "rule", - "tablerow", - ]) + new_block(region) + self.parse_region_details(items, region, inline_pattern_names + [ + "break", "heading", + "defterm", "defterm_empty", + "listitem", "listitem_alpha", "listitem_dot", "listitem_num", + "listitem_roman", + "regionstart", "regionend", + "rule", + "tablerow", + ]) -def parse_region_opaque(items, region): + def parse_region_opaque(self, items, region): + + "Parse the data provided by 'items' to populate an opaque 'region'." + + self.parse_region_details(items, region, ["regionend"]) - "Parse the data provided by 'items' to populate an opaque 'region'." + # Parser methods supporting different page features. + + def parse_attrname(self, items, attrs): + + "Handle an attribute name within 'attrs'." - parse_region_details(items, region, ["regionend"]) + name = items.read_match() + attr = TableAttr(name) -def parse_attrname(items, attrs): + preceding = items.read_until(["attrvalue"], False) + if preceding == "": + attr.quote = items.read_match(1) + attr.value = items.read_match(2) - "Handle an attribute name within 'attrs'." + attrs.append(attr) + + def parse_break(self, items, region): - name = items.read_match() - attr = TableAttr(name) + "Handle a paragraph break within 'region'." + + region.add(Break()) + new_block(region) + + def parse_defitem(self, items, region, extra=""): - preceding = items.read_until(["attrvalue"], False) - if preceding == "": - attr.quote = items.read_match(1) - attr.value = items.read_match(2) + "Handle a definition item within 'region'." - attrs.append(attr) - -def parse_break(items, region): + pad = items.read_match(1) + item = DefItem([], pad, extra) + self.parse_region_details(items, item, ["listitemend"]) + region.add(item) + new_block(region) - "Handle a paragraph break within 'region'." + def parse_defterm(self, items, region): + + "Handle a definition term within 'region'." - region.add(Break()) - new_block(region) - -def parse_defitem(items, region, extra=""): - - "Handle a definition item within 'region'." + pad = items.read_match(1) + term = DefTerm([], pad) + self.parse_region_details(items, term, ["deftermend", "deftermsep"]) + region.add(term) + if items.matching == "deftermsep": + self.parse_defitem(items, region) - pad = items.read_match(1) - item = DefItem([], pad, extra) - parse_region_details(items, item, ["listitemend"]) - region.add(item) - new_block(region) + def parse_defterm_empty(self, items, region): + + "Handle an empty definition term within 'region'." -def parse_defterm(items, region): + extra = items.read_match(1) + self.parse_region_details(items, region, ["deftermsep"]) + self.parse_defitem(items, region, extra) - "Handle a definition term within 'region'." + def parse_fontstyle(self, items, region): + + "Handle emphasis and strong styles." - pad = items.read_match(1) - term = DefTerm([], pad) - parse_region_details(items, term, ["deftermend", "deftermsep"]) - region.add(term) - if items.matching == "deftermsep": - parse_defitem(items, region) + n = len(items.read_match(1)) + + # Handle endings. -def parse_defterm_empty(items, region): - - "Handle an empty definition term within 'region'." - - extra = items.read_match(1) - parse_region_details(items, region, ["deftermsep"]) - parse_defitem(items, region, extra) + if isinstance(region, FontStyle): + emphasis = n in (2, 4, 5) + strong = n in (3, 5, 6) + active = True -def parse_fontstyle(items, region): - - "Handle emphasis and strong styles." + if region.emphasis and emphasis: + active = region.close_emphasis() + n -= 2 + if region.strong and strong: + active = region.close_strong() + n -= 3 - n = len(items.read_match(1)) + if not active: + if n: + items.rewind(n) + raise StopIteration - # Handle endings. + elif not n: + return - if isinstance(region, FontStyle): + # Handle new styles. + emphasis = n in (2, 4, 5) strong = n in (3, 5, 6) - active = True + double = n in (4, 6) + + span = FontStyle([], emphasis, strong) + if not double: + self.parse_region_details(items, span, inline_pattern_names) + region.append_inline(span) + + def parse_halign(self, items, attrs): + + "Handle horizontal alignment within 'attrs'." + + value = items.read_match() + attr = TableAttr("halign", value == "(" and "left" or value == ")" and "right" or "center", True) + attrs.append(attr) + + def parse_heading(self, items, region): - if region.emphasis and emphasis: - active = region.close_emphasis() - n -= 2 - if region.strong and strong: - active = region.close_strong() - n -= 3 + "Handle a heading." - if not active: - if n: - items.rewind(n) + start_extra = items.read_match(1) + level = len(items.read_match(2)) + start_pad = items.read_match(3) + heading = Heading([], level, start_extra, start_pad) + self.parse_region_details(items, heading, ["headingend"] + inline_pattern_names) + region.add(heading) + new_block(region) + + def parse_heading_end(self, items, heading): + + "Handle the end of a heading." + + level = len(items.read_match(2)) + if heading.level == level: + heading.end_pad = items.read_match(1) + heading.end_extra = items.read_match(3) raise StopIteration - elif not n: + def parse_listitem(self, items, region): + + "Handle a list item marker within 'region'." + + indent = len(items.read_match(1)) + marker = items.read_match(2) + space = items.read_match(3) + item = ListItem([], indent, marker, space) + self.parse_region_details(items, item, ["listitemend"]) + region.add(item) + new_block(region) + + def parse_rule(self, items, region): + + "Handle a horizontal rule within 'region'." + + length = len(items.read_match(1)) + rule = Rule(length) + region.add(rule) + new_block(region) + + def parse_section(self, items, region): + + "Handle the start of a new section within 'region'." + + # Parse the section and start a new block after the section. + + indent = len(items.read_match(2)) + level = len(items.read_match(3)) + region.add(self.parse_region(items, level, indent)) + new_block(region) + + def parse_section_end(self, items, region): + + "Handle the end of a new section within 'region'." + + feature = items.read_match() + if region.have_end(feature): + raise StopIteration + else: + region.append_inline(Text(feature)) + + def parse_table_attrs(self, items, cell): + + "Handle the start of table attributes within 'cell'." + + attrs = TableAttrs([]) + self.parse_region_details(items, attrs, table_pattern_names) + + # Test the validity of the attributes. + + last = None + + for node in attrs.nodes: + + # Text separator nodes must be whitespace. + + if isinstance(node, Text): + if node.s.strip(): + break + + # Named attributes must be preceded by space if not the first. + + elif last and not node.concise and not isinstance(last, Text): + break + + last = node + + # All nodes were valid: preserve the collection. + + else: + cell.attrs = attrs return - # Handle new styles. - - emphasis = n in (2, 4, 5) - strong = n in (3, 5, 6) - double = n in (4, 6) + # Invalid nodes were found: serialise the attributes as text. - span = FontStyle([], emphasis, strong) - if not double: - parse_region_details(items, span, inline_pattern_names) - region.append_inline(span) - -def parse_halign(items, attrs): - - "Handle horizontal alignment within 'attrs'." - - value = items.read_match() - attr = TableAttr("halign", value == "(" and "left" or value == ")" and "right" or "center", True) - attrs.append(attr) + cell.append_inline(Text(serialise(attrs))) -def parse_heading(items, region): - - "Handle a heading." + def parse_table_row(self, items, region): - start_extra = items.read_match(1) - level = len(items.read_match(2)) - start_pad = items.read_match(3) - heading = Heading([], level, start_extra, start_pad) - parse_region_details(items, heading, ["headingend"] + inline_pattern_names) - region.add(heading) - new_block(region) + "Handle the start of a table row within 'region'." -def parse_heading_end(items, heading): - - "Handle the end of a heading." - - level = len(items.read_match(2)) - if heading.level == level: - heading.end_pad = items.read_match(1) - heading.end_extra = items.read_match(3) - raise StopIteration + row = TableRow([]) -def parse_listitem(items, region): - - "Handle a list item marker within 'region'." + while True: + cell = TableCell([]) + self.parse_region_details(items, cell, ["tableattrs", "tablecell", "tableend"]) - indent = len(items.read_match(1)) - marker = items.read_match(2) - space = items.read_match(3) - item = ListItem([], indent, marker, space) - parse_region_details(items, item, ["listitemend"]) - region.add(item) - new_block(region) - -def parse_rule(items, region): - - "Handle a horizontal rule within 'region'." - - length = len(items.read_match(1)) - rule = Rule(length) - region.add(rule) - new_block(region) + # Handle the end of the row. -def parse_section(items, region): + if items.matching == "tableend": + trailing = items.read_match() - "Handle the start of a new section within 'region'." - - # Parse the section and start a new block after the section. + # If the cell was started but not finished, convert the row into text. - indent = len(items.read_match(2)) - level = len(items.read_match(3)) - region.add(parse_region(items, level, indent)) - new_block(region) - -def parse_section_end(items, region): - - "Handle the end of a new section within 'region'." - - feature = items.read_match() - if region.have_end(feature): - raise StopIteration - else: - region.append_inline(Text(feature)) + if not row.nodes or not cell.empty(): + for node in row.nodes: + region.append_inline(Text(serialise(node))) + region.append_inline(Text(serialise(cell))) + region.append_inline(Text(trailing)) -def parse_table_attrs(items, cell): - - "Handle the start of table attributes within 'cell'." - - attrs = TableAttrs([]) - parse_region_details(items, attrs, table_pattern_names) - - # Test the validity of the attributes. + new_block(region) + return - last = None - - for node in attrs.nodes: - - # Text separator nodes must be whitespace. - - if isinstance(node, Text): - if node.s.strip(): - break - - # Named attributes must be preceded by space if not the first. + # Append the final cell, if not empty. - elif last and not node.concise and not isinstance(last, Text): - break - - last = node - - # All nodes were valid: preserve the collection. + else: + row.trailing = trailing - else: - cell.attrs = attrs - return - - # Invalid nodes were found: serialise the attributes as text. - - cell.append_inline(Text(serialise(attrs))) - -def parse_table_row(items, region): - - "Handle the start of a table row within 'region'." - - row = TableRow([]) + if not cell.empty(): + row.append(cell) + break - while True: - cell = TableCell([]) - parse_region_details(items, cell, ["tableattrs", "tablecell", "tableend"]) - - # Handle the end of the row. - - if items.matching == "tableend": - trailing = items.read_match() - - # If the cell was started but not finished, convert the row into text. + # A cell separator has been found. - if not row.nodes or not cell.empty(): - for node in row.nodes: - region.append_inline(Text(serialise(node))) - region.append_inline(Text(serialise(cell))) - region.append_inline(Text(trailing)) + row.append(cell) - new_block(region) - return - - # Append the final cell, if not empty. + region.add(row) + new_block(region) - else: - row.trailing = trailing + def parse_valign(self, items, attrs): - if not cell.empty(): - row.append(cell) - break - - # A cell separator has been found. - - row.append(cell) + "Handle vertical alignment within 'attrs'." - region.add(row) - new_block(region) - -def parse_valign(items, attrs): - - "Handle vertical alignment within 'attrs'." - - value = items.read_match() - attr = TableAttr("valign", value == "^" and "top" or "bottom", True) - attrs.append(attr) + value = items.read_match() + attr = TableAttr("valign", value == "^" and "top" or "bottom", True) + attrs.append(attr) -# Inline formatting handlers. + # Inline formatting handlers. + + def parse_inline(self, items, region, cls, pattern_name): -def parse_inline(items, region, cls, pattern_name): + "Handle an inline region." + + span = cls([]) + self.parse_region_details(items, span, inline_patterns_for(pattern_name)) + region.append_inline(span) - "Handle an inline region." + def parse_larger(self, items, region): + self.parse_inline(items, region, Larger, "larger") + + def parse_monospace(self, items, region): + self.parse_inline(items, region, Monospace, "monospace") - span = cls([]) - parse_region_details(items, span, inline_patterns_for(pattern_name)) - region.append_inline(span) + def parse_smaller(self, items, region): + self.parse_inline(items, region, Smaller, "smaller") + + def parse_sub(self, items, region): + self.parse_inline(items, region, Subscript, "sub") + + def parse_super(self, items, region): + self.parse_inline(items, region, Superscript, "super") -parse_larger = lambda items, region: parse_inline(items, region, Larger, "larger") -parse_monospace = lambda items, region: parse_inline(items, region, Monospace, "monospace") -parse_smaller = lambda items, region: parse_inline(items, region, Smaller, "smaller") -parse_sub = lambda items, region: parse_inline(items, region, Subscript, "sub") -parse_super = lambda items, region: parse_inline(items, region, Superscript, "super") -parse_underline = lambda items, region: parse_inline(items, region, Underline, "underline") + def parse_underline(self, items, region): + self.parse_inline(items, region, Underline, "underline") + + -# Table attribute handlers. + # Table attribute handlers. + + def parse_table_attr(self, items, attrs, pattern_name): + + "Handle a table attribute." -def parse_table_attr(items, attrs, pattern_name): + value = items.read_match() + attrs.append(TableAttr(pattern_name, value, True)) - "Handle a table attribute." + def parse_colour(self, items, cell): + self.parse_table_attr(items, cell, "colour") - value = items.read_match() - attrs.append(TableAttr(pattern_name, value, True)) + def parse_colspan(self, items, cell): + self.parse_table_attr(items, cell, "colspan") -parse_colour = lambda items, cell: parse_table_attr(items, cell, "colour") -parse_colspan = lambda items, cell: parse_table_attr(items, cell, "colspan") -parse_rowspan = lambda items, cell: parse_table_attr(items, cell, "rowspan") -parse_width = lambda items, cell: parse_table_attr(items, cell, "width") + def parse_rowspan(self, items, cell): + self.parse_table_attr(items, cell, "rowspan") + + def parse_width(self, items, cell): + self.parse_table_attr(items, cell, "width") -# Pattern handlers. + # Parsing utilities. + + def parse_region_details(self, items, region, pattern_names): + + "Parse 'items' within 'region' searching using 'pattern_names'." + + try: + while True: + + # Obtain text before any marker or the end of the input. + + preceding = items.read_until(pattern_names) + if preceding: + region.append_inline(Text(preceding)) + + # End of input. + + if not items.matching: + break + + # Obtain any feature. + + feature = items.read_match() + handler = self.handlers.get(items.matching) + + # Handle each feature or add text to the region. + + if handler: + handler(self, items, region) + else: + region.append_inline(Text(feature)) + + except StopIteration: + pass + + region.normalise() + + def end_region(self, items, region): + + "End the parsing of 'region', breaking out of the parsing loop." + + raise StopIteration + + -handlers = { - None : end_region, - "attrname" : parse_attrname, - "break" : parse_break, - "colour" : parse_colour, - "colspan" : parse_colspan, - "defterm" : parse_defterm, - "defterm_empty" : parse_defterm_empty, - "deftermend" : end_region, - "deftermsep" : end_region, - "fontstyle" : parse_fontstyle, - "halign" : parse_halign, - "heading" : parse_heading, - "headingend" : parse_heading_end, - "larger" : parse_larger, - "largerend" : end_region, - "listitemend" : end_region, - "listitem" : parse_listitem, - "listitem_alpha" : parse_listitem, - "listitem_dot" : parse_listitem, - "listitem_num" : parse_listitem, - "listitem_roman" : parse_listitem, - "monospace" : parse_monospace, - "monospaceend" : end_region, - "regionstart" : parse_section, - "regionend" : parse_section_end, - "rowspan" : parse_rowspan, - "rule" : parse_rule, - "smaller" : parse_smaller, - "smallerend" : end_region, - "sub" : parse_sub, - "subend" : end_region, - "super" : parse_super, - "superend" : end_region, - "tableattrs" : parse_table_attrs, - "tableattrsend" : end_region, - "tablerow" : parse_table_row, - "tablecell" : end_region, - "tableend" : end_region, - "underline" : parse_underline, - "underlineend" : end_region, - "valign" : parse_valign, - "width" : parse_width, - } + # Pattern handlers. + + handlers = { + None : end_region, + "attrname" : parse_attrname, + "break" : parse_break, + "colour" : parse_colour, + "colspan" : parse_colspan, + "defterm" : parse_defterm, + "defterm_empty" : parse_defterm_empty, + "deftermend" : end_region, + "deftermsep" : end_region, + "fontstyle" : parse_fontstyle, + "halign" : parse_halign, + "heading" : parse_heading, + "headingend" : parse_heading_end, + "larger" : parse_larger, + "largerend" : end_region, + "listitemend" : end_region, + "listitem" : parse_listitem, + "listitem_alpha" : parse_listitem, + "listitem_dot" : parse_listitem, + "listitem_num" : parse_listitem, + "listitem_roman" : parse_listitem, + "monospace" : parse_monospace, + "monospaceend" : end_region, + "regionstart" : parse_section, + "regionend" : parse_section_end, + "rowspan" : parse_rowspan, + "rule" : parse_rule, + "smaller" : parse_smaller, + "smallerend" : end_region, + "sub" : parse_sub, + "subend" : end_region, + "super" : parse_super, + "superend" : end_region, + "tableattrs" : parse_table_attrs, + "tableattrsend" : end_region, + "tablerow" : parse_table_row, + "tablecell" : end_region, + "tableend" : end_region, + "underline" : parse_underline, + "underlineend" : end_region, + "valign" : parse_valign, + "width" : parse_width, + } # Top-level functions. -parse = parse_page +def parse(s, formats=None): + return Parser(formats).parse_page(s) # vim: tabstop=4 expandtab shiftwidth=4