# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1688165028 -7200
# Node ID f19281465a63a620f6068f8d7f3b6bb05346b3a5
# Parent  cd2cbbe6e41dad41ecb184449228bc43b4b7318f
Added initial support for parsing and handling HTML.

diff -r cd2cbbe6e41d -r f19281465a63 moinformat/parsers/common.py
--- a/moinformat/parsers/common.py	Sat Jul 01 00:37:44 2023 +0200
+++ b/moinformat/parsers/common.py	Sat Jul 01 00:43:48 2023 +0200
@@ -3,7 +3,7 @@
 """
 Moin wiki parsing functionality.
 
-Copyright (C) 2017, 2018, 2019, 2021 Paul Boddie <paul@boddie.org.uk>
+Copyright (C) 2017, 2018, 2019, 2021, 2023 Paul Boddie <paul@boddie.org.uk>
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -279,6 +279,9 @@
         self.parsers = parsers
         self.root = root
 
+    def update_metadata(self, metadata):
+        pass  # 'metadata' is accepted but not used here
+
     def get_parser(self, format_type):
 
         """
diff -r cd2cbbe6e41d -r f19281465a63 moinformat/parsers/html.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/moinformat/parsers/html.py	Sat Jul 01 00:43:48 2023 +0200
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+"""
+HTML document fragment parser.
+
+Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from moinformat.parsers.common import ParserBase
+from moinformat.tree.html import Element, Fragment
+from moinformat.utils.htmlparse import Parser
+
+class HTMLParser(ParserBase):
+
+    "An HTML document parser producing a tree of document nodes."
+
+    formats = ["html"]
+
+    def __init__(self, metadata):
+        self.metadata = metadata  # NOTE(review): ParserBase.__init__ is not called here - confirm intended
+
+    def parse(self, s):
+
+        "Parse the HTML text 's', returning a document tree, a fragment or None."
+
+        doc = Parser(s).parse()
+
+        # If a theme is going to be used with the document, find the body node
+        # and return its children in a fragment.
+
+        if self.metadata.get("theme_name"):
+            body = self._find_body(doc)
+
+            if body:
+                return Fragment(body.nodes)
+            else:
+                return None
+
+        # Otherwise, return the top-level node.
+
+        else:
+            return doc
+
+    def _find_body(self, node):
+
+        """
+        Find the body element from 'node', returning the element if found or
+        None otherwise.
+        """
+
+        # Search depth-first, descending only into nodes able to hold children.
+
+        if isinstance(node, Fragment):
+
+            # Return the node if it is a body element.
+
+            if isinstance(node, Element) and node.name == "body":
+                return node
+
+            for n in node.nodes:
+                body = self._find_body(n)
+                if body:
+                    return body
+
+        return None
+
+parser = HTMLParser
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r cd2cbbe6e41d -r f19281465a63 moinformat/serialisers/html/html.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/moinformat/serialisers/html/html.py	Sat Jul 01 00:43:48 2023 +0200
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+"""
+HTML serialiser.
+
+Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from moinformat.serialisers.common import Serialiser
+
+
+
+# The serialiser class.
+
+class HTMLSerialiser(Serialiser):
+
+    "Serialisation of HTML document nodes as HTML text."
+
+    input_formats = ["html"]
+    formats = ["html"]
+
+    def attribute(self, attribute):
+        self.out(attribute.name)
+        if attribute.value is not None:  # an attribute without a value is written as a bare name
+            self.out("=")
+            self.visit(attribute.value)
+
+    def attribute_value(self, attribute_value):  # the value within its recorded quote characters
+        self.out("%s%s%s" % (attribute_value.quote, attribute_value.value, attribute_value.quote))
+
+    def element(self, element):
+        self.out("<%s" % element.name)
+        for attribute in element.attributes:
+            self.out(" ")
+            self.visit(attribute)
+        self.out(">")
+        self.container(element)  # NOTE(review): presumably serialises the child nodes - confirm in the base class
+        self.out("</%s>" % element.name)  # an end tag is always written
+
+    def comment(self, comment):  # only "<" and ">" are added around the stored value
+        self.out("<%s>" % comment.value)
+
+    def directive(self, directive):
+        self.out("<%s>" % directive.value)
+
+    def inclusion(self, inclusion):
+        self.out("<%s>" % inclusion.value)
+
+    def node(self, node):  # generic nodes are written as their value alone
+        self.out(node.value)
+
+    text = node
+
+    def fragment(self, fragment):
+        self.container(fragment)
+
+serialiser = HTMLSerialiser
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r cd2cbbe6e41d -r f19281465a63 moinformat/serialisers/pretty/html.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/moinformat/serialisers/pretty/html.py	Sat Jul 01 00:43:48 2023 +0200
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+
+"""
+Prettyprinter for HTML document nodes.
+
+Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from moinformat.serialisers.pretty.common import Serialiser
+
+class HTMLSerialiser(Serialiser):
+
+    "Serialisation of HTML document nodes as an indented outline for inspection."
+
+    input_formats = ["html"]
+    formats = ["pretty"]
+
+    def attribute(self, attribute):  # writes "Attribute: name" or "Attribute: name=value"
+        self.out("%sAttribute: %s" % (self.output.indent, attribute.name))
+        if attribute.value is not None:
+            self.out("=")
+            self.visit(attribute.value)  # only a value that is present can be visited
+        self.out("\n")
+
+    def attribute_value(self, attribute_value):  # the value within its recorded quote characters
+        self.out("%s%s%s" % (attribute_value.quote, attribute_value.value, attribute_value.quote))
+
+    def element(self, element):
+        self.out("%sElement: name=%r\n" % (self.output.indent, element.name))
+        self.output.indent += "  "  # attributes are shown one level below their element
+        for attribute in element.attributes:
+            self.visit(attribute)
+        self.output.indent = self.output.indent[:-2]  # remove the two spaces added above
+        self.container(element)  # NOTE(review): presumably serialises the child nodes - confirm in the base class
+
+    def node(self, node):  # generic form: the class name followed by the repr of the value
+        self.out("%s%s: %r\n" % (self.output.indent, node.__class__.__name__, node.value))
+
+    comment = node
+    directive = node
+    inclusion = node
+    text = node
+
+    def fragment(self, fragment):
+        self.out("%s%s\n" % (self.output.indent, fragment.__class__.__name__))
+        self.container(fragment)
+
+serialiser = HTMLSerialiser
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r cd2cbbe6e41d -r f19281465a63 moinformat/tree/html.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/moinformat/tree/html.py	Sat Jul 01 00:43:48 2023 +0200
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+
+"""
+HTML document nodes.
+
+Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from moinformat.utils.htmlparse.tree import Attribute, AttributeValue, \
+                                            Comment, Directive, Element, \
+                                            Fragment, Node, Inclusion, Text
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r cd2cbbe6e41d -r f19281465a63 moinformat/utils/htmlparse/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/moinformat/utils/htmlparse/__init__.py	Sat Jul 01 00:43:48 2023 +0200
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+"""
+HTML parsing modules.
+
+Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from moinformat.utils.htmlparse.parse import Parser
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r cd2cbbe6e41d -r f19281465a63 moinformat/utils/htmlparse/lex.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/moinformat/utils/htmlparse/lex.py	Sat Jul 01 00:43:48 2023 +0200
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+
+"""
+Lexical partitioning of HTML document content.
+
+Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+# Lexical analysis state transition handler functions.
+
+def tag_or_similar(text, pos):
+
+    "Return the state for the markup whose '<' is found at 'pos' in 'text'."
+
+    if text[pos:pos+2] == "<!":
+        if text[pos+2:pos+3] == "[":        # "<![" starts an inclusion
+            return IN_INCLUSION
+        elif text[pos+2:pos+4] == "--":     # "<!--" starts a comment
+            return IN_COMMENT
+        else:
+            return IN_DIRECTIVE
+    else:
+        return IN_TAG
+
+def at_attribute_value(text, pos):  # used after "="; 'text' and 'pos' are not consulted
+    return AT_ATTRIBUTE_VALUE
+
+def in_dq_attribute_value(text, pos):  # used after an opening double quote
+    return IN_DQ_ATTRIBUTE_VALUE
+
+def in_sq_attribute_value(text, pos):  # used after an opening single quote
+    return IN_SQ_ATTRIBUTE_VALUE
+
+def after_attribute_value(text, pos):  # used after a closing quote
+    return AFTER_ATTRIBUTE_VALUE
+
+def end_of_standalone_tag(text, pos):  # used after "/>"
+    return AT_END_OF_TAG
+
+def end_of_tag(text, pos):  # used after ">", "-->", "]]>" and by AT_END_OF_TAG
+    return BETWEEN_TAGS
+
+
+
+# Lexical analysis states/spans.
+
+class Span:  # text lexed in one state; subclasses provide 'transitions' and 'visit'
+    def __init__(self, text):
+        self.text = text
+
+    def empty(self):  # spans reporting themselves empty are skipped by the tokeniser
+        return not self.text
+
+    def __repr__(self):
+        return "%s(%r)" % (self.__class__.__name__, self.text)
+
+class AT_END_OF_TAG(Span):
+    transitions = [(None, "", end_of_tag)]  # a None token matches at once, consuming no text
+
+    def empty(self):
+        return False  # never skipped, although the span holds no text
+
+    def visit(self, visitor):
+        return visitor.at_end_of_tag(self)
+
+class BETWEEN_TAGS(Span):
+    transitions = [("<", "", tag_or_similar)]  # the span runs up to the next "<"
+
+    def visit(self, visitor):
+        return visitor.between_tags(self)
+
+class IN_TAG(Span):  # tag text following "<", up to the earliest of the tokens below
+    transitions = [
+        ("=", "", at_attribute_value),
+        ("/>", "", end_of_standalone_tag),
+        (">", "", end_of_tag),
+        ]
+
+    def visit(self, visitor):
+        return visitor.in_tag(self)
+
+class IN_COMMENT(Span):
+    transitions = [("-->", "--", end_of_tag)]  # "--" is kept in the span; ">" is dropped
+
+    def visit(self, visitor):
+        return visitor.in_comment(self)
+
+class IN_DIRECTIVE(Span):
+    transitions = [(">", "", end_of_tag)]  # the span ends at the first ">"
+
+    def visit(self, visitor):
+        return visitor.in_directive(self)
+
+class IN_INCLUSION(Span):
+    transitions = [("]]>", "]]", end_of_tag)]  # "]]" is kept in the span; ">" is dropped
+
+    def visit(self, visitor):
+        return visitor.in_inclusion(self)
+
+class AFTER_ATTRIBUTE_VALUE(Span):  # tag text following a closing quote
+    transitions = [
+        ("=", "", at_attribute_value),
+        ("/>", "", end_of_standalone_tag),
+        (">", "", end_of_tag),
+        ]
+
+    def empty(self):
+        return not self.text.strip()  # whitespace alone counts as empty
+
+    def visit(self, visitor):
+        return visitor.after_attribute_value(self)
+
+class AT_ATTRIBUTE_VALUE(Span):  # tag text following "=", up to the earliest of the tokens below
+    transitions = [
+        ("=", "", at_attribute_value),
+        ('"', "", in_dq_attribute_value),
+        ("'", "", in_sq_attribute_value),
+        ("/>", "", end_of_standalone_tag),
+        (">", "", end_of_tag),
+        ]
+
+    def empty(self):
+        return not self.text.strip()  # whitespace alone counts as empty
+
+    def visit(self, visitor):
+        return visitor.at_attribute_value(self)
+
+class IN_DQ_ATTRIBUTE_VALUE(Span):
+    transitions = [('"', "", after_attribute_value)]  # only the closing double quote ends the value
+
+    def visit(self, visitor):
+        return visitor.in_dq_attribute_value(self)
+
+class IN_SQ_ATTRIBUTE_VALUE(Span):
+    transitions = [("'", "", after_attribute_value)]  # only the closing single quote ends the value
+
+    def visit(self, visitor):
+        return visitor.in_sq_attribute_value(self)
+
+
+
+# Utility functions.
+
+def find_one(text, pos, choices):
+
+    """
+    Search 'text' from 'pos' for whichever token in 'choices' occurs earliest,
+    each of the choices being a (token string, extra string, state) tuple
+    and the choices being tested in the order given.
+
+    A token string marks where the next span starts, the extra string is the
+    part of the token to append to the current span upon matching, and the
+    state governs the next span. A token of None matches at 'pos' itself.
+
+    Return a tuple of the form (state, position of the token, extra string,
+    position of the text following the token), with all of its members being
+    None if no token is found.
+    """
+
+    best = (None, None, None, None)
+
+    for token, extra, state in choices:
+        if token is None:
+            return state, pos, extra, pos
+
+        index = text.find(token, pos)
+        if index == -1:
+            continue
+
+        # Only a strictly earlier match displaces one already found, so that
+        # the first of several choices matching at a position is retained.
+
+        if best[0] is None or index < best[1]:
+            best = (state, index, extra, index + len(token))
+
+    return best
+
+
+
+# Lexical partitioning.
+
+class Lexer:  # an iterator partitioning 'text' into spans, one per call to next
+    def __init__(self, text):
+        self.text = text
+        self.state = BETWEEN_TAGS
+        self.pos = 0
+
+    def _end_of_input(self):
+        start = self.pos
+        self.pos = None  # makes the following call to next raise StopIteration
+        return self._span(self.text[start:])
+
+    def _span(self, text):
+        return self.state(text)  # the state is a span class, instantiated with the text
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        if self.pos is None:
+            raise StopIteration
+
+        # Obtain details of a state transition: a handler function to determine
+        # the next state, the start position of the token causing the transition,
+        # any text to add to the span, and the position following the token.
+
+        handler, pos, extra, next_pos = find_one(self.text, self.pos, self.state.transitions)
+
+        if handler is None:
+            return self._end_of_input()
+
+        # Obtain the lexical span and update the state and position.
+
+        span = self._span(self.text[self.pos:pos] + extra)
+
+        self.state = handler(self.text, pos)
+        self.pos = next_pos
+
+        return span
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r cd2cbbe6e41d -r f19281465a63 moinformat/utils/htmlparse/parse.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/moinformat/utils/htmlparse/parse.py	Sat Jul 01 00:43:48 2023 +0200
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+
+"""
+An absurdly minimal HTML parser.
+
+Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from moinformat.utils.htmlparse.token import Tokeniser
+from moinformat.utils.htmlparse.tree import Attribute, AttributeValue, \
+                                            Comment, Directive, Element, \
+                                            Fragment, Inclusion, Node, Text
+
+
+
+# Token processing employing the tokens from tokenisation.
+
+class Visitor:  # builds the document tree from tokens, tracking the open nodes
+    def __init__(self):
+        self.node = Fragment()  # the node currently receiving children
+        self.stack = [self.node]  # the open nodes, with the root fragment first
+
+    def append(self, node):
+        self.node.nodes.append(node)
+
+    def push(self, node):  # add 'node' to the current node and make it current
+        self.stack.append(node)
+        self.append(node)
+        self.node = node
+
+    def pop(self):  # make the node below the current one current again
+        self.stack.pop()
+        self.node = self.stack[-1]
+
+    def visit(self, token):
+        token.visit(self)
+
+    # Specific handler methods: misplaced attributes and end tags raise ValueError.
+
+    def attribute(self, token):
+        if isinstance(self.node, Element):
+            self.node.attributes.append(Attribute(token.value))
+        else:
+            raise ValueError(token)
+
+    def attribute_value(self, token):  # the value belongs to the most recent attribute
+        if isinstance(self.node, Element) and self.node.attributes:
+            self.node.attributes[-1].value = AttributeValue(token.value, token.quote)
+        else:
+            raise ValueError(token)
+
+    def comment(self, token):
+        self.append(Comment(token.value))
+
+    def directive(self, token):
+        self.append(Directive(token.value))
+
+    def inclusion(self, token):
+        self.append(Inclusion(token.value))
+
+    def tag(self, token):
+        if not token.is_end():
+            self.push(Element(token.tag_name()))
+        elif getattr(self.node, "name", None) == token.tag_name():  # the root fragment has no name
+            self.pop()
+        else:
+            raise ValueError(token)
+
+    def tag_close(self, token):  # "/>" closes the current node
+        self.pop()
+
+    def text(self, token):
+        self.append(Text(token.value))
+
+
+
+# Parsing and document construction.
+
+class Parser:  # feeds tokens from 'text' to a tree-building visitor; parse returns the tree
+    def __init__(self, text):
+        self.tokeniser = Tokeniser(text)
+        self.visitor = Visitor()
+
+    def __iter__(self):
+        return self
+
+    def next(self):  # handle one token, returning nothing; StopIteration ends the input
+        token = self.tokeniser.next()
+        self.visitor.visit(token)
+
+    def parse(self):
+        for _none in self:
+            pass
+
+        return self.visitor.stack[0]  # the root fragment, even if elements remain open
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r cd2cbbe6e41d -r f19281465a63 moinformat/utils/htmlparse/token.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/moinformat/utils/htmlparse/token.py	Sat Jul 01 00:43:48 2023 +0200
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+
+"""
+An absurdly minimal HTML tokeniser.
+
+Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from collections import deque
+from moinformat.utils.htmlparse.lex import Lexer
+
+
+
+# Document token classes.
+
+class Token:  # base class for tokens holding a single value
+    def __init__(self, value):
+        self.value = value
+
+    def __repr__(self):
+        return "%s(%r)" % (self.__class__.__name__, self.value)
+
+class Attribute(Token):  # the value is an attribute name
+    def visit(self, visitor):
+        return visitor.attribute(self)
+
+class AttributeValue(Token):  # 'quote' is the quote character used, or "" if unquoted
+    def __init__(self, value, quote):
+        self.value = value
+        self.quote = quote
+
+    def __repr__(self):
+        return "%s(%r, %r)" % (self.__class__.__name__, self.value, self.quote)
+
+    def visit(self, visitor):
+        return visitor.attribute_value(self)
+
+class Comment(Token):  # dispatches to the visitor's comment method
+    def visit(self, visitor):
+        return visitor.comment(self)
+
+class Directive(Token):  # dispatches to the visitor's directive method
+    def visit(self, visitor):
+        return visitor.directive(self)
+
+class Inclusion(Token):  # dispatches to the visitor's inclusion method
+    def visit(self, visitor):
+        return visitor.inclusion(self)
+
+class Tag(Token):  # a start tag, or an end tag whose value begins with "/"
+    def visit(self, visitor):
+        return visitor.tag(self)
+
+    def is_end(self):
+        return self.value.startswith("/")
+
+    def tag_name(self):  # the value without any leading "/", even where this leaves ""
+        return self.value[1:] if self.is_end() else self.value
+
+class TagClose:  # not a Token subclass: it holds no value
+    def visit(self, visitor):
+        return visitor.tag_close(self)
+
+    def __repr__(self):
+        return "%s()" % self.__class__.__name__
+
+class Text(Token):  # dispatches to the visitor's text method
+    def visit(self, visitor):
+        return visitor.text(self)
+
+
+
+# Tidying visitor employing the spans from lexical partitioning.
+
+class Visitor:  # converts each lexical span into a token, queueing any extra tokens
+    def __init__(self):
+        self.queued = deque()  # tokens to be delivered after the one returned
+
+    def visit(self, span):
+        return span.visit(self)
+
+    # Specific handler methods, each returning the token for the given span.
+
+    def between_tags(self, span):
+        return Text(span.text)
+
+    def in_comment(self, span):
+        return Comment(span.text)
+
+    def in_directive(self, span):
+        return Directive(span.text)
+
+    def in_inclusion(self, span):
+        return Inclusion(span.text)
+
+    def _queue_attributes(self, tokens):  # each word given is queued as an attribute name
+        for token in tokens:
+            self.queued.append(Attribute(token))
+
+    def in_tag(self, span):
+        tokens = span.text.split()  # NOTE(review): whitespace-only text gives no words, so tokens[0] fails
+        self._queue_attributes(tokens[1:])
+        return Tag(tokens[0])  # the first word is the tag; the rest are attribute names
+
+    def at_end_of_tag(self, span):
+        return TagClose()
+
+    def after_attribute_value(self, span):
+        tokens = span.text.split()
+        self._queue_attributes(tokens)
+        return self.queued.popleft()  # the earliest queued attribute is returned directly
+
+    def at_attribute_value(self, span):
+        tokens = span.text.split()
+        self._queue_attributes(tokens[1:])
+        return AttributeValue(tokens[0], "")  # an unquoted value; later words are attribute names
+
+    def in_dq_attribute_value(self, span):
+        return AttributeValue(span.text, '"')
+
+    def in_sq_attribute_value(self, span):
+        return AttributeValue(span.text, "'")
+
+
+
+# Tokenising.
+
+class Tokeniser:  # an iterator yielding tokens for 'text', one per call to next
+    def __init__(self, text):
+        self.lexer = Lexer(text)
+        self.visitor = Visitor()
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        if self.visitor.queued:  # tokens queued by an earlier span are delivered first
+            return self.visitor.queued.popleft()
+
+        while 1:
+            span = self.lexer.next()  # StopIteration from the lexer ends the tokens
+            if not span.empty():  # spans reporting themselves empty yield no token
+                break
+
+        return self.visitor.visit(span)
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r cd2cbbe6e41d -r f19281465a63 moinformat/utils/htmlparse/tree.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/moinformat/utils/htmlparse/tree.py	Sat Jul 01 00:43:48 2023 +0200
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+
+"""
+HTML document nodes.
+
+Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+# Element attributes.
+
+class Attribute:  # an element attribute; 'value' is None unless one is supplied
+    def __init__(self, name, value=None):
+        self.name = name
+        self.value = value
+
+    def __repr__(self):
+        return "%s(%r, %r)" % (self.__class__.__name__, self.name, self.value)
+
+    def visit(self, visitor):
+        visitor.attribute(self)
+
+
+
+# Nodes containing other nodes.
+
+class Fragment:  # a container of child nodes held in the 'nodes' list
+    def __init__(self, nodes=None):
+        self.nodes = nodes or []  # NOTE(review): an empty list passed in is replaced, not shared
+
+    def __repr__(self):
+        return "%s(%r)" % (self.__class__.__name__, self.nodes)
+
+    def visit(self, visitor):
+        visitor.fragment(self)
+
+class Element(Fragment):  # a named container that also has a list of attributes
+    def __init__(self, name, attributes=None, nodes=None):
+        Fragment.__init__(self, nodes)
+        self.name = name
+        self.attributes = attributes or []  # a new list is made if none, or an empty one, is given
+
+    def __repr__(self):
+        return "%s(%r, %r, %r)" % (self.__class__.__name__, self.name, self.attributes, self.nodes)
+
+    def visit(self, visitor):
+        visitor.element(self)
+
+
+
+# Nodes having values.
+
+class Node:  # base class for nodes holding a single value and no children
+    def __init__(self, value):
+        self.value = value
+
+    def __repr__(self):
+        return "%s(%r)" % (self.__class__.__name__, self.value)
+
+    def visit(self, visitor):
+        visitor.node(self)
+
+class AttributeValue(Node):  # a value together with the 'quote' string recorded for it
+    def __init__(self, value, quote):
+        Node.__init__(self, value)
+        self.quote = quote
+
+    def __repr__(self):
+        return "%s(%r, %r)" % (self.__class__.__name__, self.value, self.quote)
+
+    def visit(self, visitor):
+        visitor.attribute_value(self)
+
+class Comment(Node):  # dispatches to the visitor's comment method
+    def visit(self, visitor):
+        visitor.comment(self)
+
+class Directive(Node):  # dispatches to the visitor's directive method
+    def visit(self, visitor):
+        visitor.directive(self)
+
+class Inclusion(Node):  # dispatches to the visitor's inclusion method
+    def visit(self, visitor):
+        visitor.inclusion(self)
+
+class Text(Node):  # dispatches to the visitor's text method
+    def visit(self, visitor):
+        visitor.text(self)
+
+# vim: tabstop=4 expandtab shiftwidth=4