1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/moinformat/utils/htmlparse/parse.py Sat Jul 01 00:43:48 2023 +0200
1.3 @@ -0,0 +1,110 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +An absurdly minimal HTML parser.
1.8 +
1.9 +Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT
1.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
1.19 +details.
1.20 +
1.21 +You should have received a copy of the GNU General Public License along with
1.22 +this program. If not, see <http://www.gnu.org/licenses/>.
1.23 +"""
1.24 +
1.25 +from moinformat.utils.htmlparse.token import Tokeniser
1.26 +from moinformat.utils.htmlparse.tree import Attribute, AttributeValue, \
1.27 + Comment, Directive, Element, \
1.28 + Fragment, Inclusion, Node, Text
1.29 +
1.30 +
1.31 +
1.32 +# Token processing employing the tokens from tokenisation.
1.33 +
class Visitor:

    """
    Build a document tree from the tokens produced by tokenisation.

    Each token calls back into one of the handler methods below through its
    own visit method. The node currently receiving content is kept in
    self.node, and self.stack holds the open nodes from the root fragment
    down to that current node.
    """

    def __init__(self):
        """Start with an empty root fragment as the only open node."""
        self.node = Fragment()
        self.stack = [self.node]

    def append(self, node):
        """Add 'node' to the children of the current node."""
        self.node.nodes.append(node)

    def push(self, node):
        """Add 'node' to the current node and make it the new current node."""
        self.stack.append(node)
        self.append(node)
        self.node = node

    def pop(self):
        """
        Leave the current node, making its parent the current node again.

        Raises ValueError if only the root fragment is open.
        """

        # The root must stay on the stack: removing it would leave the
        # visitor with no current node and fail on the lookup below.
        if len(self.stack) < 2:
            raise ValueError("no open element to leave")

        self.stack.pop()
        self.node = self.stack[-1]

    def visit(self, token):
        """Let 'token' dispatch to the handler method appropriate to it."""
        token.visit(self)

    # Specific handler methods.

    def attribute(self, token):
        """
        Add an attribute named by 'token' to the current element.

        Raises ValueError with the token if the current node is not an
        element.
        """

        if not isinstance(self.node, Element):
            raise ValueError(token)

        self.node.attributes.append(Attribute(token.value))

    def attribute_value(self, token):
        """
        Set the value of the most recently added attribute from 'token'.

        Raises ValueError with the token if the current node is not an
        element or if the element has no attribute to receive the value.
        """

        if not isinstance(self.node, Element) or not self.node.attributes:
            raise ValueError(token)

        self.node.attributes[-1].value = AttributeValue(token.value, token.quote)

    def comment(self, token):
        """Add a comment node for 'token' to the current node."""
        self.append(Comment(token.value))

    def directive(self, token):
        """Add a directive node for 'token' to the current node."""
        self.append(Directive(token.value))

    def inclusion(self, token):
        """Add an inclusion node for 'token' to the current node."""
        self.append(Inclusion(token.value))

    def tag(self, token):
        """
        Open a new element for a start tag or leave the current element for
        a matching end tag.

        Raises ValueError with the token if an end tag does not match the
        currently open element, including when no element is open at all.
        """

        if not token.is_end():
            self.push(Element(token.tag_name()))

        # Only elements are ever pushed, so an element as the current node
        # guarantees that there is a parent to return to.
        elif isinstance(self.node, Element) and \
            self.node.name == token.tag_name():
            self.pop()
        else:
            raise ValueError(token)

    def tag_close(self, token):
        """
        Leave the current node in response to a tag close token.

        NOTE(review): presumably this token marks a self-closing tag; confirm
        against the tokeniser.

        Raises ValueError with the token if only the root fragment is open.
        """

        if len(self.stack) < 2:
            raise ValueError(token)

        self.pop()

    def text(self, token):
        """Add a text node for 'token' to the current node."""
        self.append(Text(token.value))
1.90 +
1.91 +
1.92 +
1.93 +# Parsing and document construction.
1.94 +
class Parser:

    """
    Feed the tokens obtained from 'text' to a visitor, one token per
    iteration step, building a document tree in the process.
    """

    def __init__(self, text):
        """Set up a tokeniser over 'text' and a fresh visitor."""
        self.tokeniser = Tokeniser(text)
        self.visitor = Visitor()

    def __iter__(self):
        """Return this parser, which acts as its own iterator."""
        return self

    def next(self):
        """
        Obtain the next token from the tokeniser and give it to the visitor.

        Nothing is returned: iteration exists only to drive the visitor. Any
        exception raised by the tokeniser propagates to the caller, and
        iteration over the parser ends when that exception is StopIteration.
        """

        token = self.tokeniser.next()
        self.visitor.visit(token)

    # Python 3 looks for __next__ when iterating, so provide it alongside the
    # Python 2 name to let parse work under either version.
    __next__ = next

    def parse(self):
        """
        Process all remaining tokens and return the visitor's current node.

        NOTE(review): the returned node is the visitor's current node, which
        is the root fragment only if every opened element was closed.
        """

        for _none in self:
            pass

        return self.visitor.node
1.112 +
1.113 +# vim: tabstop=4 expandtab shiftwidth=4