1.1 --- a/moinformat/parsers/common.py Sat Jul 01 00:37:44 2023 +0200 1.2 +++ b/moinformat/parsers/common.py Sat Jul 01 00:43:48 2023 +0200 1.3 @@ -3,7 +3,7 @@ 1.4 """ 1.5 Moin wiki parsing functionality. 1.6 1.7 -Copyright (C) 2017, 2018, 2019, 2021 Paul Boddie <paul@boddie.org.uk> 1.8 +Copyright (C) 2017, 2018, 2019, 2021, 2023 Paul Boddie <paul@boddie.org.uk> 1.9 1.10 This program is free software; you can redistribute it and/or modify it under 1.11 the terms of the GNU General Public License as published by the Free Software 1.12 @@ -279,6 +279,9 @@ 1.13 self.parsers = parsers 1.14 self.root = root 1.15 1.16 + def update_metadata(self, metadata): 1.17 + pass 1.18 + 1.19 def get_parser(self, format_type): 1.20 1.21 """
#!/usr/bin/env python
# File: moinformat/parsers/html.py

"""
HTML document fragment parser.

Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsers.common import ParserBase
from moinformat.tree.html import Element, Fragment
from moinformat.utils.htmlparse import Parser

class HTMLParser(ParserBase):

    "A parser producing a document tree from HTML text."

    formats = ["html"]

    def __init__(self, metadata):

        "Initialise the parser with the given 'metadata' mapping."

        # NOTE(review): ParserBase.__init__ is not invoked here, so any
        # attributes it would set up are absent on instances of this class.
        # Confirm that this is intended.

        self.metadata = metadata

    def parse(self, s):

        """
        Parse the HTML text in 's', returning the resulting document tree.

        If the metadata provides a "theme_name", only the children of the
        document's body element are returned, wrapped in a new fragment, or
        None is returned if no body element can be found. Otherwise, the
        top-level node produced by the HTML parser is returned.
        """

        doc = Parser(s).parse()

        # Without a theme, the complete document is wanted.

        if not self.metadata.get("theme_name"):
            return doc

        # With a theme, only the body content is wanted, since the theme
        # is expected to supply the surrounding document.

        body = self._find_body(doc)

        if body is None:
            return None

        return Fragment(body.nodes)

    def _find_body(self, node):

        """
        Find the body element from 'node', returning the element if found or
        None otherwise. The search is depth-first and yields the first
        element named "body".
        """

        # Only nodes with children (fragments and elements) are searched.

        if not isinstance(node, Fragment):
            return None

        # Return the node if it is a body element.

        if isinstance(node, Element) and node.name == "body":
            return node

        for n in node.nodes:
            body = self._find_body(n)
            if body is not None:
                return body

        return None

parser = HTMLParser

# vim: tabstop=4 expandtab shiftwidth=4
#!/usr/bin/env python
# File: moinformat/serialisers/html/html.py

"""
HTML serialiser.

Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.serialisers.common import Serialiser



# The serialiser class.

class HTMLSerialiser(Serialiser):

    """
    Serialisation of HTML document nodes as HTML markup text.

    The 'out', 'visit' and 'container' operations used below are not defined
    in this class and are expected to be provided by the Serialiser base
    class.
    """

    input_formats = ["html"]
    formats = ["html"]

    def _bracketed(self, node):

        "Emit the value of 'node' enclosed in angle brackets."

        self.out("<%s>" % node.value)

    # Attribute handlers.

    def attribute(self, attribute):

        "Emit the name of 'attribute' plus '=' and its value, if it has one."

        self.out(attribute.name)

        value = attribute.value
        if value is None:
            return

        self.out("=")
        self.visit(value)

    def attribute_value(self, attribute_value):

        "Emit 'attribute_value' enclosed by its own quote characters."

        quote = attribute_value.quote
        self.out("%s%s%s" % (quote, attribute_value.value, quote))

    # Container handlers.

    def element(self, element):

        """
        Emit a start tag for 'element' including its attributes, then the
        element's content, then an end tag.
        """

        name = element.name

        self.out("<%s" % name)
        for attr in element.attributes:
            self.out(" ")
            self.visit(attr)
        self.out(">")

        self.container(element)

        self.out("</%s>" % name)

    def fragment(self, fragment):

        "Emit only the content of 'fragment'."

        self.container(fragment)

    # Handlers for nodes whose values are emitted inside angle brackets.

    def comment(self, comment):

        "Emit the value of 'comment' within angle brackets."

        self._bracketed(comment)

    def directive(self, directive):

        "Emit the value of 'directive' within angle brackets."

        self._bracketed(directive)

    def inclusion(self, inclusion):

        "Emit the value of 'inclusion' within angle brackets."

        self._bracketed(inclusion)

    # Handlers for nodes whose values are emitted verbatim.

    def node(self, node):

        "Emit the value of 'node' without any decoration."

        self.out(node.value)

    text = node

serialiser = HTMLSerialiser

# vim: tabstop=4 expandtab shiftwidth=4
#!/usr/bin/env python
# File: moinformat/serialisers/pretty/html.py

"""
Prettyprinted HTML document node prettyprinter.

Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.serialisers.pretty.common import Serialiser

class HTMLSerialiser(Serialiser):

    """
    Serialisation of HTML document nodes in a prettyprinted form for
    inspection.

    The 'out', 'visit' and 'container' operations and the 'output.indent'
    string used below are not defined in this class and are expected to be
    provided by the Serialiser base class.
    """

    input_formats = ["html"]
    formats = ["pretty"]

    def attribute(self, attribute):

        """
        Emit a line describing 'attribute' at the current indentation,
        including '=' and the attribute value if one is present.
        """

        self.out("%sAttribute: %s" % (self.output.indent, attribute.name))
        if attribute.value is not None:
            self.out("=")
            self.visit(attribute.value)
        self.out("\n")

    def attribute_value(self, attribute_value):

        "Emit 'attribute_value' enclosed by its own quote characters."

        self.out("%s%s%s" % (attribute_value.quote, attribute_value.value, attribute_value.quote))

    def element(self, element):

        """
        Emit a line describing 'element', then its attributes at a deeper
        indentation level, then the element's content.
        """

        self.out("%sElement: name=%r\n" % (self.output.indent, element.name))

        # Indent the attributes by two characters, this being the width the
        # original code removed again afterwards. The previous indentation is
        # saved and restored so that the adjustment is always undone exactly,
        # whatever the width of the added indentation.

        indent = self.output.indent
        self.output.indent = indent + "  "

        for attribute in element.attributes:
            self.visit(attribute)

        self.output.indent = indent
        self.container(element)

    def node(self, node):

        "Emit a line giving the class name and value of 'node'."

        self.out("%s%s: %r\n" % (self.output.indent, node.__class__.__name__, node.value))

    # All value-bearing nodes are presented in the same way.

    comment = node
    directive = node
    inclusion = node
    text = node

    def fragment(self, fragment):

        "Emit a line giving the class name of 'fragment', then its content."

        self.out("%s%s\n" % (self.output.indent, fragment.__class__.__name__))
        self.container(fragment)

serialiser = HTMLSerialiser

# vim: tabstop=4 expandtab shiftwidth=4
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/moinformat/tree/html.py Sat Jul 01 00:43:48 2023 +0200 5.3 @@ -0,0 +1,26 @@ 5.4 +#!/usr/bin/env python 5.5 + 5.6 +""" 5.7 +HTML document nodes. 5.8 + 5.9 +Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk> 5.10 + 5.11 +This program is free software; you can redistribute it and/or modify it under 5.12 +the terms of the GNU General Public License as published by the Free Software 5.13 +Foundation; either version 3 of the License, or (at your option) any later 5.14 +version. 5.15 + 5.16 +This program is distributed in the hope that it will be useful, but WITHOUT 5.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 5.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 5.19 +details. 5.20 + 5.21 +You should have received a copy of the GNU General Public License along with 5.22 +this program. If not, see <http://www.gnu.org/licenses/>. 5.23 +""" 5.24 + 5.25 +from moinformat.utils.htmlparse.tree import Attribute, AttributeValue, \ 5.26 + Comment, Directive, Element, \ 5.27 + Fragment, Node, Inclusion, Text 5.28 + 5.29 +# vim: tabstop=4 expandtab shiftwidth=4
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/moinformat/utils/htmlparse/__init__.py Sat Jul 01 00:43:48 2023 +0200 6.3 @@ -0,0 +1,24 @@ 6.4 +#!/usr/bin/env python 6.5 + 6.6 +""" 6.7 +HTML parsing modules. 6.8 + 6.9 +Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk> 6.10 + 6.11 +This program is free software; you can redistribute it and/or modify it under 6.12 +the terms of the GNU General Public License as published by the Free Software 6.13 +Foundation; either version 3 of the License, or (at your option) any later 6.14 +version. 6.15 + 6.16 +This program is distributed in the hope that it will be useful, but WITHOUT 6.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 6.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 6.19 +details. 6.20 + 6.21 +You should have received a copy of the GNU General Public License along with 6.22 +this program. If not, see <http://www.gnu.org/licenses/>. 6.23 +""" 6.24 + 6.25 +from moinformat.utils.htmlparse.parse import Parser 6.26 + 6.27 +# vim: tabstop=4 expandtab shiftwidth=4
#!/usr/bin/env python
# File: moinformat/utils/htmlparse/lex.py

"""
Lexical partitioning of HTML document content.

Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# Lexical analysis state transition handler functions.
#
# Each handler is called with the complete text and the position of the token
# causing a transition, returning the state (span class) applying to the text
# following that token.

def tag_or_similar(text, pos):

    """
    Return the state entered at the "<" found at 'pos' in 'text': an
    inclusion for "<![", a comment for "<!--", a directive for any other
    "<!" construct, and a tag otherwise.
    """

    # Consult the text positions following the position indicated.

    if text[pos:pos+2] == "<!":

        # The "<" occupies 'pos' and the "!" occupies 'pos' + 1, so the
        # characters distinguishing the construct start at 'pos' + 2.

        if text[pos+2:pos+3] == "[":
            return IN_INCLUSION
        elif text[pos+2:pos+4] == "--":
            return IN_COMMENT
        else:
            return IN_DIRECTIVE
    else:
        return IN_TAG

def at_attribute_value(text, pos):

    "Return the state following an '=' token."

    return AT_ATTRIBUTE_VALUE

def in_dq_attribute_value(text, pos):

    "Return the state following an opening double quote."

    return IN_DQ_ATTRIBUTE_VALUE

def in_sq_attribute_value(text, pos):

    "Return the state following an opening single quote."

    return IN_SQ_ATTRIBUTE_VALUE

def after_attribute_value(text, pos):

    "Return the state following a closing quote."

    return AFTER_ATTRIBUTE_VALUE

def end_of_standalone_tag(text, pos):

    "Return the state following a '/>' token."

    return AT_END_OF_TAG

def end_of_tag(text, pos):

    "Return the state following the end of a tag or similar construct."

    return BETWEEN_TAGS



# Lexical analysis states/spans.
#
# Each state class doubles as the type of the spans produced in that state.
# The 'transitions' of a state are (token string, extra string, handler)
# tuples as described for the find_one function.

class Span:

    "A region of text obtained in a particular lexical state."

    def __init__(self, text):
        self.text = text

    def empty(self):

        "Return whether this span has no text and can be ignored."

        return not self.text

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.text)

class AT_END_OF_TAG(Span):

    "The position immediately after a standalone tag's '/>' token."

    # A token of None causes an immediate transition consuming no text.

    transitions = [(None, "", end_of_tag)]

    def empty(self):

        # Although never having any text, this span must still be reported
        # because it signals the closing of the tag.

        return False

    def visit(self, visitor):
        return visitor.at_end_of_tag(self)

class BETWEEN_TAGS(Span):

    "Text found outside tags and similar constructs."

    transitions = [("<", "", tag_or_similar)]

    def visit(self, visitor):
        return visitor.between_tags(self)

class IN_TAG(Span):

    "Text inside a tag up to any attribute value or the end of the tag."

    transitions = [
        ("=", "", at_attribute_value),
        ("/>", "", end_of_standalone_tag),
        (">", "", end_of_tag),
        ]

    def visit(self, visitor):
        return visitor.in_tag(self)

class IN_COMMENT(Span):

    "Text inside a comment, retaining the trailing '--'."

    transitions = [("-->", "--", end_of_tag)]

    def visit(self, visitor):
        return visitor.in_comment(self)

class IN_DIRECTIVE(Span):

    "Text inside a '<!' construct that is neither comment nor inclusion."

    transitions = [(">", "", end_of_tag)]

    def visit(self, visitor):
        return visitor.in_directive(self)

class IN_INCLUSION(Span):

    "Text inside a '<![' construct, retaining the trailing ']]'."

    transitions = [("]]>", "]]", end_of_tag)]

    def visit(self, visitor):
        return visitor.in_inclusion(self)

class AFTER_ATTRIBUTE_VALUE(Span):

    "Text inside a tag following a quoted attribute value."

    transitions = [
        ("=", "", at_attribute_value),
        ("/>", "", end_of_standalone_tag),
        (">", "", end_of_tag),
        ]

    def empty(self):

        "Return whether this span has nothing other than whitespace."

        return not self.text.strip()

    def visit(self, visitor):
        return visitor.after_attribute_value(self)

class AT_ATTRIBUTE_VALUE(Span):

    "Text inside a tag following '=', possibly an unquoted attribute value."

    transitions = [
        ("=", "", at_attribute_value),
        ('"', "", in_dq_attribute_value),
        ("'", "", in_sq_attribute_value),
        ("/>", "", end_of_standalone_tag),
        (">", "", end_of_tag),
        ]

    def empty(self):

        "Return whether this span has nothing other than whitespace."

        return not self.text.strip()

    def visit(self, visitor):
        return visitor.at_attribute_value(self)

class IN_DQ_ATTRIBUTE_VALUE(Span):

    "Text inside a double-quoted attribute value."

    transitions = [('"', "", after_attribute_value)]

    def visit(self, visitor):
        return visitor.in_dq_attribute_value(self)

class IN_SQ_ATTRIBUTE_VALUE(Span):

    "Text inside a single-quoted attribute value."

    transitions = [("'", "", after_attribute_value)]

    def visit(self, visitor):
        return visitor.in_sq_attribute_value(self)



# Utility functions.

def find_one(text, pos, choices):

    """
    Find in 'text' from 'pos' the earliest occurring instance of one of the
    given 'choices', these being a list of (token string, extra string,
    handler) tuples.

    The token string is a token marking the start of the next span, the extra
    string is the portion of the token to be added to the end of the current
    span upon matching, and the handler is a function determining the state
    that applies to the next span.

    A token string of None matches immediately at 'pos' without consuming
    any text.

    The associated handler, the position of the occurrence, the extra string,
    and the position of the text following the occurrence are returned as a
    tuple. If no choice matches, all four values are None.
    """

    next_state = None
    first_pos = None
    first_extra = None
    next_pos = None

    for token, extra, state in choices:
        if token is None:
            return state, pos, extra, pos

        found_pos = text.find(token, pos)

        # Retain only the earliest match, with earlier choices winning ties.

        if found_pos != -1 and (next_state is None or found_pos < first_pos):
            next_state = state
            first_pos = found_pos
            first_extra = extra
            next_pos = found_pos + len(token)

    return next_state, first_pos, first_extra, next_pos



# Lexical partitioning.

class Lexer:

    """
    An iterator over the spans of the given text. Empty spans are also
    produced, leaving it to the consumer to test each span using its 'empty'
    method.
    """

    def __init__(self, text):
        self.text = text
        self.state = BETWEEN_TAGS

        # The position of the next span, or None when the input is exhausted.

        self.pos = 0

    def _end_of_input(self):

        "Return the final span, holding all remaining text."

        start = self.pos
        self.pos = None
        return self._span(self.text[start:])

    def _span(self, text):

        "Return a span of the current state's type for 'text'."

        return self.state(text)

    def __iter__(self):
        return self

    def next(self):

        "Return the next span, raising StopIteration after the final span."

        if self.pos is None:
            raise StopIteration

        # Obtain details of a state transition: a handler function to determine
        # the next state, and the start and end positions of the token causing
        # the transition.

        handler, pos, extra, next_pos = find_one(self.text, self.pos, self.state.transitions)

        if handler is None:
            return self._end_of_input()

        # Obtain the lexical span and update the state and position.

        span = self._span(self.text[self.pos:pos] + extra)

        self.state = handler(self.text, pos)
        self.pos = next_pos

        return span

    # Support the iterator protocol in Python 3 as well as Python 2.

    __next__ = next

# vim: tabstop=4 expandtab shiftwidth=4
#!/usr/bin/env python
# File: moinformat/utils/htmlparse/parse.py

"""
An absurdly minimal HTML parser.

Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.utils.htmlparse.token import Tokeniser
from moinformat.utils.htmlparse.tree import Attribute, AttributeValue, \
                                            Comment, Directive, Element, \
                                            Fragment, Inclusion, Node, Text



# Token processing employing the tokens from tokenisation.

class Visitor:

    """
    A token visitor building a document tree. The tree is rooted at a
    fragment, with 'node' always referring to the container currently
    receiving content and 'stack' holding that container's ancestors followed
    by the container itself.
    """

    def __init__(self):
        self.node = Fragment()
        self.stack = [self.node]

    def append(self, node):

        "Add 'node' to the content of the current container."

        self.node.nodes.append(node)

    def push(self, node):

        "Add 'node' to the current container and make it the new container."

        self.stack.append(node)
        self.append(node)
        self.node = node

    def pop(self):

        """
        Leave the current container, returning to its parent. A ValueError is
        raised if the current container is the root fragment, since it has no
        parent to return to.
        """

        if len(self.stack) < 2:
            raise ValueError("no open element to close")

        self.stack.pop()
        self.node = self.stack[-1]

    def visit(self, token):

        "Dispatch 'token' to the appropriate handler method."

        token.visit(self)

    # Specific handler methods.
    #
    # Tokens that do not make sense in the current context cause a ValueError
    # to be raised with the token as argument.

    def attribute(self, token):

        "Add an attribute named by 'token' to the current element."

        if isinstance(self.node, Element):
            self.node.attributes.append(Attribute(token.value))
        else:
            raise ValueError(token)

    def attribute_value(self, token):

        "Assign the value in 'token' to the current element's last attribute."

        # A value needs a preceding attribute to be assigned to.

        if isinstance(self.node, Element) and self.node.attributes:
            self.node.attributes[-1].value = AttributeValue(token.value, token.quote)
        else:
            raise ValueError(token)

    def comment(self, token):

        "Add a comment to the current container."

        self.append(Comment(token.value))

    def directive(self, token):

        "Add a directive to the current container."

        self.append(Directive(token.value))

    def inclusion(self, token):

        "Add an inclusion to the current container."

        self.append(Inclusion(token.value))

    def tag(self, token):

        """
        Open a new element for a start tag 'token', or close the current
        element for an end tag 'token' having the same name.
        """

        if not token.is_end():
            self.push(Element(token.tag_name()))

        # The root fragment has no name and can never match an end tag.

        elif getattr(self.node, "name", None) == token.tag_name():
            self.pop()
        else:
            raise ValueError(token)

    def tag_close(self, token):

        "Close the current element upon the end of a standalone tag."

        self.pop()

    def text(self, token):

        "Add text to the current container."

        self.append(Text(token.value))



# Parsing and document construction.

class Parser:

    "A parser feeding the tokens obtained from the given text to a visitor."

    def __init__(self, text):
        self.tokeniser = Tokeniser(text)
        self.visitor = Visitor()

    def __iter__(self):
        return self

    def next(self):

        """
        Process the next token, returning None. StopIteration propagates from
        the tokeniser when no more tokens are available.
        """

        token = self.tokeniser.next()
        self.visitor.visit(token)

    # Support the iterator protocol in Python 3 as well as Python 2.

    __next__ = next

    def parse(self):

        """
        Process all tokens, returning the visitor's current node.

        NOTE(review): with well-formed input this is the root fragment, but if
        elements remain open at the end of the input, the innermost open
        element is returned instead. Confirm whether this is intended.
        """

        for _none in self:
            pass

        return self.visitor.node

# vim: tabstop=4 expandtab shiftwidth=4
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 9.2 +++ b/moinformat/utils/htmlparse/token.py Sat Jul 01 00:43:48 2023 +0200 9.3 @@ -0,0 +1,160 @@ 9.4 +#!/usr/bin/env python 9.5 + 9.6 +""" 9.7 +An absurdly minimal HTML tokeniser. 9.8 + 9.9 +Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk> 9.10 + 9.11 +This program is free software; you can redistribute it and/or modify it under 9.12 +the terms of the GNU General Public License as published by the Free Software 9.13 +Foundation; either version 3 of the License, or (at your option) any later 9.14 +version. 9.15 + 9.16 +This program is distributed in the hope that it will be useful, but WITHOUT 9.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 9.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 9.19 +details. 9.20 + 9.21 +You should have received a copy of the GNU General Public License along with 9.22 +this program. If not, see <http://www.gnu.org/licenses/>. 9.23 +""" 9.24 + 9.25 +from collections import deque 9.26 +from moinformat.utils.htmlparse.lex import Lexer 9.27 + 9.28 + 9.29 + 9.30 +# Document token classes. 
class Token:

    "A document token having a textual 'value'."

    def __init__(self, value):
        self.value = value

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.value)

class Attribute(Token):

    "An attribute name."

    def visit(self, visitor):
        return visitor.attribute(self)

class AttributeValue(Token):

    "An attribute value plus the 'quote' string that enclosed it, if any."

    def __init__(self, value, quote):
        self.value = value
        self.quote = quote

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self.value, self.quote)

    def visit(self, visitor):
        return visitor.attribute_value(self)

class Comment(Token):

    "The content of a comment."

    def visit(self, visitor):
        return visitor.comment(self)

class Directive(Token):

    "The content of a directive."

    def visit(self, visitor):
        return visitor.directive(self)

class Inclusion(Token):

    "The content of an inclusion."

    def visit(self, visitor):
        return visitor.inclusion(self)

class Tag(Token):

    "A start or end tag, with an end tag having a value starting with '/'."

    def visit(self, visitor):
        return visitor.tag(self)

    def is_end(self):

        "Return whether this is an end tag."

        return self.value.startswith("/")

    def tag_name(self):

        "Return the tag name without any leading '/'."

        # A conditional expression is used instead of the and/or idiom which
        # would wrongly yield "/" for an end tag having an empty name.

        return self.value[1:] if self.is_end() else self.value

class TagClose:

    "The end of a standalone tag. This token has no value."

    def visit(self, visitor):
        return visitor.tag_close(self)

    def __repr__(self):
        return "%s()" % self.__class__.__name__

class Text(Token):

    "Text appearing between tags."

    def visit(self, visitor):
        return visitor.text(self)



# Tidying visitor employing the spans from lexical partitioning.

class Visitor:

    """
    A span visitor returning one token for each visited span. Any additional
    tokens found in a span are added to the 'queued' collection for the
    tokeniser to deliver before it visits any further spans.
    """

    def __init__(self):
        self.queued = deque()

    def visit(self, span):

        "Dispatch 'span' to the appropriate handler method."

        return span.visit(self)

    # Specific handler methods.

    def between_tags(self, span):
        return Text(span.text)

    def in_comment(self, span):
        return Comment(span.text)

    def in_directive(self, span):
        return Directive(span.text)

    def in_inclusion(self, span):
        return Inclusion(span.text)

    def _queue_attributes(self, tokens):

        "Queue an attribute token for each of the strings in 'tokens'."

        for token in tokens:
            self.queued.append(Attribute(token))

    def in_tag(self, span):

        """
        Return a tag token for the first word in 'span', queuing attributes
        for any following words. A ValueError is raised if the span contains
        only whitespace and therefore provides no tag name.
        """

        tokens = span.text.split()

        if not tokens:
            raise ValueError("tag without a name: %r" % span.text)

        self._queue_attributes(tokens[1:])
        return Tag(tokens[0])

    def at_end_of_tag(self, span):
        return TagClose()

    def after_attribute_value(self, span):

        "Return an attribute for the first word in 'span', queuing the rest."

        # The queue is used even for the first word, with the tokeniser only
        # passing spans that contain more than whitespace to this method.

        tokens = span.text.split()
        self._queue_attributes(tokens)
        return self.queued.popleft()

    def at_attribute_value(self, span):

        """
        Return an unquoted attribute value for the first word in 'span',
        queuing attributes for any following words.
        """

        tokens = span.text.split()
        self._queue_attributes(tokens[1:])
        return AttributeValue(tokens[0], "")

    def in_dq_attribute_value(self, span):
        return AttributeValue(span.text, '"')

    def in_sq_attribute_value(self, span):
        return AttributeValue(span.text, "'")



# Tokenising.

class Tokeniser:

    "An iterator over the tokens found in the given text."

    def __init__(self, text):
        self.lexer = Lexer(text)
        self.visitor = Visitor()

    def __iter__(self):
        return self

    def next(self):

        """
        Return the next token. StopIteration propagates from the lexer when
        no more spans are available.
        """

        # Deliver any tokens left over from the previous span first.

        if self.visitor.queued:
            return self.visitor.queued.popleft()

        # Skip spans that indicate that they are empty.

        while 1:
            span = self.lexer.next()
            if not span.empty():
                break

        return self.visitor.visit(span)

    # Support the iterator protocol in Python 3 as well as Python 2.

    __next__ = next

# vim: tabstop=4 expandtab shiftwidth=4
#!/usr/bin/env python
# File: moinformat/utils/htmlparse/tree.py

"""
HTML document nodes.

Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# Every node provides a 'visit' method that calls the visitor method
# corresponding to the node's type and returns that method's result.

# Element attributes.

class Attribute:

    "An element attribute having a 'name' and a 'value' that may be None."

    def __init__(self, name, value=None):
        self.name = name
        self.value = value

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self.name, self.value)

    def visit(self, visitor):
        return visitor.attribute(self)



# Nodes containing other nodes.

class Fragment:

    "A container of other nodes, these being held in the 'nodes' list."

    def __init__(self, nodes=None):

        # Any false value, not only None, is replaced by a new list.

        self.nodes = nodes or []

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.nodes)

    def visit(self, visitor):
        return visitor.fragment(self)

class Element(Fragment):

    "A container also having a 'name' and a list of 'attributes'."

    def __init__(self, name, attributes=None, nodes=None):
        Fragment.__init__(self, nodes)
        self.name = name
        self.attributes = attributes or []

    def __repr__(self):
        return "%s(%r, %r, %r)" % (self.__class__.__name__, self.name, self.attributes, self.nodes)

    def visit(self, visitor):
        return visitor.element(self)



# Nodes having values.

class Node:

    "A node having a 'value' and no other content."

    def __init__(self, value):
        self.value = value

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.value)

    def visit(self, visitor):
        return visitor.node(self)

class AttributeValue(Node):

    "An attribute value plus the 'quote' string that enclosed it."

    def __init__(self, value, quote):
        Node.__init__(self, value)
        self.quote = quote

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self.value, self.quote)

    def visit(self, visitor):
        return visitor.attribute_value(self)

class Comment(Node):

    "A comment node."

    def visit(self, visitor):
        return visitor.comment(self)

class Directive(Node):

    "A directive node."

    def visit(self, visitor):
        return visitor.directive(self)

class Inclusion(Node):

    "An inclusion node."

    def visit(self, visitor):
        return visitor.inclusion(self)

class Text(Node):

    "A text node."

    def visit(self, visitor):
        return visitor.text(self)

# vim: tabstop=4 expandtab shiftwidth=4