# HG changeset patch # User Paul Boddie # Date 1688165028 -7200 # Node ID f19281465a63a620f6068f8d7f3b6bb05346b3a5 # Parent cd2cbbe6e41dad41ecb184449228bc43b4b7318f Added initial support for parsing and handling HTML. diff -r cd2cbbe6e41d -r f19281465a63 moinformat/parsers/common.py --- a/moinformat/parsers/common.py Sat Jul 01 00:37:44 2023 +0200 +++ b/moinformat/parsers/common.py Sat Jul 01 00:43:48 2023 +0200 @@ -3,7 +3,7 @@ """ Moin wiki parsing functionality. -Copyright (C) 2017, 2018, 2019, 2021 Paul Boddie +Copyright (C) 2017, 2018, 2019, 2021, 2023 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -279,6 +279,9 @@ self.parsers = parsers self.root = root + def update_metadata(self, metadata): + pass + def get_parser(self, format_type): """ diff -r cd2cbbe6e41d -r f19281465a63 moinformat/parsers/html.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/parsers/html.py Sat Jul 01 00:43:48 2023 +0200 @@ -0,0 +1,82 @@ +#!/usr/bin/env python + +""" +HTML document fragment parser. + +Copyright (C) 2023 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +from moinformat.parsers.common import ParserBase +from moinformat.tree.html import Element, Fragment +from moinformat.utils.htmlparse import Parser + +class HTMLParser(ParserBase): + + "A prettyprinted document tree parser." + + formats = ["html"] + + def __init__(self, metadata): + self.metadata = metadata + + def parse(self, s): + + "Parse the tree structure representation in 's'." + + doc = Parser(s).parse() + + # If a theme is going to be used with the document, find the body node + # and return its children in a fragment. + + if self.metadata.get("theme_name"): + body = self._find_body(doc) + + if body: + return Fragment(body.nodes) + else: + return None + + # Otherwise, return the top-level node. + + else: + return doc + + def _find_body(self, node): + + """ + Find the body element from 'node', returning the element if found or + None otherwise. + """ + + # Search all nodes with children. + + if isinstance(node, Fragment): + + # Return the node if it is a body element. + + if isinstance(node, Element) and node.name == "body": + return node + + for n in node.nodes: + body = self._find_body(n) + if body: + return body + + return None + +parser = HTMLParser + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r cd2cbbe6e41d -r f19281465a63 moinformat/serialisers/html/html.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/serialisers/html/html.py Sat Jul 01 00:43:48 2023 +0200 @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +""" +HTML serialiser. + +Copyright (C) 2023 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +from moinformat.serialisers.common import Serialiser + + + +# The serialiser class. + +class HTMLSerialiser(Serialiser): + + "Serialisation of HTML fragments." + + input_formats = ["html"] + formats = ["html"] + + def attribute(self, attribute): + self.out(attribute.name) + if attribute.value is not None: + self.out("=") + self.visit(attribute.value) + + def attribute_value(self, attribute_value): + self.out("%s%s%s" % (attribute_value.quote, attribute_value.value, attribute_value.quote)) + + def element(self, element): + self.out("<%s" % element.name) + for attribute in element.attributes: + self.out(" ") + self.visit(attribute) + self.out(">") + self.container(element) + self.out("" % element.name) + + def comment(self, comment): + self.out("<%s>" % comment.value) + + def directive(self, directive): + self.out("<%s>" % directive.value) + + def inclusion(self, inclusion): + self.out("<%s>" % inclusion.value) + + def node(self, node): + self.out(node.value) + + text = node + + def fragment(self, fragment): + self.container(fragment) + +serialiser = HTMLSerialiser + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r cd2cbbe6e41d -r f19281465a63 moinformat/serialisers/pretty/html.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/serialisers/pretty/html.py Sat Jul 01 00:43:48 2023 +0200 @@ -0,0 +1,63 @@ +#!/usr/bin/env python + +""" +Prettyprinted HTML document node prettyprinter. + +Copyright (C) 2023 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +from moinformat.serialisers.pretty.common import Serialiser + +class HTMLSerialiser(Serialiser): + + "Serialisation of prettyprinted document nodes for inspection." + + input_formats = ["html"] + formats = ["pretty"] + + def attribute(self, attribute): + self.out("%sAttribute: %s" % (self.output.indent, attribute.name)) + if attribute.value is not None: + self.out("=") + self.visit(attribute.value) + self.out("\n") + + def attribute_value(self, attribute_value): + self.out("%s%s%s" % (attribute_value.quote, attribute_value.value, attribute_value.quote)) + + def element(self, element): + self.out("%sElement: name=%r\n" % (self.output.indent, element.name)) + self.output.indent += " " + for attribute in element.attributes: + self.visit(attribute) + self.output.indent = self.output.indent[:-2] + self.container(element) + + def node(self, node): + self.out("%s%s: %r\n" % (self.output.indent, node.__class__.__name__, node.value)) + + comment = node + directive = node + inclusion = node + text = node + + def fragment(self, fragment): + self.out("%s%s\n" % (self.output.indent, fragment.__class__.__name__)) + self.container(fragment) + +serialiser = HTMLSerialiser + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r cd2cbbe6e41d -r f19281465a63 moinformat/tree/html.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/tree/html.py Sat Jul 01 00:43:48 2023 +0200 @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +""" +HTML document nodes. + +Copyright (C) 2023 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +from moinformat.utils.htmlparse.tree import Attribute, AttributeValue, \ + Comment, Directive, Element, \ + Fragment, Node, Inclusion, Text + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r cd2cbbe6e41d -r f19281465a63 moinformat/utils/htmlparse/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/utils/htmlparse/__init__.py Sat Jul 01 00:43:48 2023 +0200 @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +""" +HTML parsing modules. + +Copyright (C) 2023 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +from moinformat.utils.htmlparse.parse import Parser + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r cd2cbbe6e41d -r f19281465a63 moinformat/utils/htmlparse/lex.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/utils/htmlparse/lex.py Sat Jul 01 00:43:48 2023 +0200 @@ -0,0 +1,234 @@ +#!/usr/bin/env python + +""" +Lexical partitioning of HTML document content. + +Copyright (C) 2023 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +# Lexical analysis state transition handler functions. + +def tag_or_similar(text, pos): + + # Consult the text positions following the position indicated. + + if text[pos:pos+2] == "", "", end_of_standalone_tag), + (">", "", end_of_tag), + ] + + def visit(self, visitor): + return visitor.in_tag(self) + +class IN_COMMENT(Span): + transitions = [("-->", "--", end_of_tag)] + + def visit(self, visitor): + return visitor.in_comment(self) + +class IN_DIRECTIVE(Span): + transitions = [(">", "", end_of_tag)] + + def visit(self, visitor): + return visitor.in_directive(self) + +class IN_INCLUSION(Span): + transitions = [("]]>", "]]", end_of_tag)] + + def visit(self, visitor): + return visitor.in_inclusion(self) + +class AFTER_ATTRIBUTE_VALUE(Span): + transitions = [ + ("=", "", at_attribute_value), + ("/>", "", end_of_standalone_tag), + (">", "", end_of_tag), + ] + + def empty(self): + return not self.text.strip() + + def visit(self, visitor): + return visitor.after_attribute_value(self) + +class AT_ATTRIBUTE_VALUE(Span): + transitions = [ + ("=", "", at_attribute_value), + ('"', "", in_dq_attribute_value), + ("'", "", in_sq_attribute_value), + ("/>", "", end_of_standalone_tag), + (">", "", end_of_tag), + ] + + def empty(self): + return not self.text.strip() + + def visit(self, visitor): + return visitor.at_attribute_value(self) + +class IN_DQ_ATTRIBUTE_VALUE(Span): + transitions = [('"', "", after_attribute_value)] + + def visit(self, visitor): + return visitor.in_dq_attribute_value(self) + +class IN_SQ_ATTRIBUTE_VALUE(Span): + transitions = [("'", "", after_attribute_value)] + + def visit(self, visitor): + return visitor.in_sq_attribute_value(self) + + + +# Utility functions. + +def find_one(text, pos, choices): + + """ + Find in 'text' from 'pos' the earliest occurring instance of one of the + given 'choices', these being a list of (token string, extra string, state) + tuples. + + The token string is a token marking the start of the next span, the extra + string is the portion of the token to be added to the end of the current + span upon matching, and the state applies to the next span. + + The associated state, the position of the occurrence, and the position of + the text following the occurrence are returned as a tuple. + """ + + next_state = None + first_pos = None + first_extra = None + next_pos = None + + for token, extra, state in choices: + if token is None: + return state, pos, extra, pos + + found_pos = text.find(token, pos) + + if found_pos != -1 and (next_state is None or found_pos < first_pos): + next_state = state + first_pos = found_pos + first_extra = extra + next_pos = found_pos + len(token) + + return next_state, first_pos, first_extra, next_pos + + + +# Lexical partitioning. + +class Lexer: + def __init__(self, text): + self.text = text + self.state = BETWEEN_TAGS + self.pos = 0 + + def _end_of_input(self): + start = self.pos + self.pos = None + return self._span(self.text[start:]) + + def _span(self, text): + return self.state(text) + + def __iter__(self): + return self + + def next(self): + if self.pos is None: + raise StopIteration + + # Obtain details of a state transition: a handler function to determine + # the next state, and the start and end positions of the token causing + # the transition. + + handler, pos, extra, next_pos = find_one(self.text, self.pos, self.state.transitions) + + if handler is None: + return self._end_of_input() + + # Obtain the lexical span and update the state and position. + + span = self._span(self.text[self.pos:pos] + extra) + + self.state = handler(self.text, pos) + self.pos = next_pos + + return span + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r cd2cbbe6e41d -r f19281465a63 moinformat/utils/htmlparse/parse.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/utils/htmlparse/parse.py Sat Jul 01 00:43:48 2023 +0200 @@ -0,0 +1,110 @@ +#!/usr/bin/env python + +""" +An absurdly minimal HTML parser. + +Copyright (C) 2023 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +from moinformat.utils.htmlparse.token import Tokeniser +from moinformat.utils.htmlparse.tree import Attribute, AttributeValue, \ + Comment, Directive, Element, \ + Fragment, Inclusion, Node, Text + + + +# Token processing employing the tokens from tokenisation. + +class Visitor: + def __init__(self): + self.node = Fragment() + self.stack = [self.node] + + def append(self, node): + self.node.nodes.append(node) + + def push(self, node): + self.stack.append(node) + self.append(node) + self.node = node + + def pop(self): + self.stack.pop() + self.node = self.stack[-1] + + def visit(self, token): + token.visit(self) + + # Specific handler methods. + + def attribute(self, token): + if isinstance(self.node, Element): + self.node.attributes.append(Attribute(token.value)) + else: + raise ValueError, token + + def attribute_value(self, token): + if isinstance(self.node, Element): + self.node.attributes[-1].value = AttributeValue(token.value, token.quote) + else: + raise ValueError, token + + def comment(self, token): + self.append(Comment(token.value)) + + def directive(self, token): + self.append(Directive(token.value)) + + def inclusion(self, token): + self.append(Inclusion(token.value)) + + def tag(self, token): + if not token.is_end(): + self.push(Element(token.tag_name())) + elif self.node.name == token.tag_name(): + self.pop() + else: + raise ValueError, token + + def tag_close(self, token): + self.pop() + + def text(self, token): + self.append(Text(token.value)) + + + +# Parsing and document construction. + +class Parser: + def __init__(self, text): + self.tokeniser = Tokeniser(text) + self.visitor = Visitor() + + def __iter__(self): + return self + + def next(self): + token = self.tokeniser.next() + self.visitor.visit(token) + + def parse(self): + for _none in self: + pass + + return self.visitor.node + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r cd2cbbe6e41d -r f19281465a63 moinformat/utils/htmlparse/token.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/utils/htmlparse/token.py Sat Jul 01 00:43:48 2023 +0200 @@ -0,0 +1,160 @@ +#!/usr/bin/env python + +""" +An absurdly minimal HTML tokeniser. + +Copyright (C) 2023 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +from collections import deque +from moinformat.utils.htmlparse.lex import Lexer + + + +# Document token classes. + +class Token: + def __init__(self, value): + self.value = value + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self.value) + +class Attribute(Token): + def visit(self, visitor): + return visitor.attribute(self) + +class AttributeValue(Token): + def __init__(self, value, quote): + self.value = value + self.quote = quote + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, self.value, self.quote) + + def visit(self, visitor): + return visitor.attribute_value(self) + +class Comment(Token): + def visit(self, visitor): + return visitor.comment(self) + +class Directive(Token): + def visit(self, visitor): + return visitor.directive(self) + +class Inclusion(Token): + def visit(self, visitor): + return visitor.inclusion(self) + +class Tag(Token): + def visit(self, visitor): + return visitor.tag(self) + + def is_end(self): + return self.value.startswith("/") + + def tag_name(self): + return self.is_end() and self.value[1:] or self.value + +class TagClose: + def visit(self, visitor): + return visitor.tag_close(self) + + def __repr__(self): + return "%s()" % self.__class__.__name__ + +class Text(Token): + def visit(self, visitor): + return visitor.text(self) + + + +# Tidying visitor employing the spans from lexical partitioning. + +class Visitor: + def __init__(self): + self.queued = deque() + + def visit(self, span): + return span.visit(self) + + # Specific handler methods. + + def between_tags(self, span): + return Text(span.text) + + def in_comment(self, span): + return Comment(span.text) + + def in_directive(self, span): + return Directive(span.text) + + def in_inclusion(self, span): + return Inclusion(span.text) + + def _queue_attributes(self, tokens): + for token in tokens: + self.queued.append(Attribute(token)) + + def in_tag(self, span): + tokens = span.text.split() + self._queue_attributes(tokens[1:]) + return Tag(tokens[0]) + + def at_end_of_tag(self, span): + return TagClose() + + def after_attribute_value(self, span): + tokens = span.text.split() + self._queue_attributes(tokens) + return self.queued.popleft() + + def at_attribute_value(self, span): + tokens = span.text.split() + self._queue_attributes(tokens[1:]) + return AttributeValue(tokens[0], "") + + def in_dq_attribute_value(self, span): + return AttributeValue(span.text, '"') + + def in_sq_attribute_value(self, span): + return AttributeValue(span.text, "'") + + + +# Tokenising. + +class Tokeniser: + def __init__(self, text): + self.lexer = Lexer(text) + self.visitor = Visitor() + + def __iter__(self): + return self + + def next(self): + if self.visitor.queued: + return self.visitor.queued.popleft() + + while 1: + span = self.lexer.next() + if not span.empty(): + break + + return self.visitor.visit(span) + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r cd2cbbe6e41d -r f19281465a63 moinformat/utils/htmlparse/tree.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/moinformat/utils/htmlparse/tree.py Sat Jul 01 00:43:48 2023 +0200 @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +""" +HTML document nodes. + +Copyright (C) 2023 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see . +""" + +# Element attributes. + +class Attribute: + def __init__(self, name, value=None): + self.name = name + self.value = value + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, self.name, self.value) + + def visit(self, visitor): + visitor.attribute(self) + + + +# Nodes containing other nodes. + +class Fragment: + def __init__(self, nodes=None): + self.nodes = nodes or [] + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self.nodes) + + def visit(self, visitor): + visitor.fragment(self) + +class Element(Fragment): + def __init__(self, name, attributes=None, nodes=None): + Fragment.__init__(self, nodes) + self.name = name + self.attributes = attributes or [] + + def __repr__(self): + return "%s(%r, %r, %r)" % (self.__class__.__name__, self.name, self.attributes, self.nodes) + + def visit(self, visitor): + visitor.element(self) + + + +# Nodes having values. + +class Node: + def __init__(self, value): + self.value = value + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self.value) + + def visit(self, visitor): + visitor.node(self) + +class AttributeValue(Node): + def __init__(self, value, quote): + Node.__init__(self, value) + self.quote = quote + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, self.value, self.quote) + + def visit(self, visitor): + visitor.attribute_value(self) + +class Comment(Node): + def visit(self, visitor): + visitor.comment(self) + +class Directive(Node): + def visit(self, visitor): + visitor.directive(self) + +class Inclusion(Node): + def visit(self, visitor): + visitor.inclusion(self) + +class Text(Node): + def visit(self, visitor): + visitor.text(self) + +# vim: tabstop=4 expandtab shiftwidth=4