paul@341 | 1 | #!/usr/bin/env python |
paul@341 | 2 | |
paul@341 | 3 | """ |
paul@341 | 4 | An absurdly minimal HTML parser. |
paul@341 | 5 | |
paul@341 | 6 | Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk> |
paul@341 | 7 | |
paul@341 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@341 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@341 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@341 | 11 | version. |
paul@341 | 12 | |
paul@341 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT |
paul@341 | 14 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
paul@341 | 15 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
paul@341 | 16 | details. |
paul@341 | 17 | |
paul@341 | 18 | You should have received a copy of the GNU General Public License along with |
paul@341 | 19 | this program. If not, see <http://www.gnu.org/licenses/>. |
paul@341 | 20 | """ |
paul@341 | 21 | |
paul@341 | 22 | from moinformat.utils.htmlparse.token import Tokeniser |
paul@341 | 23 | from moinformat.utils.htmlparse.tree import Attribute, AttributeValue, \ |
paul@341 | 24 | Comment, Directive, Element, \ |
paul@341 | 25 | Fragment, Inclusion, Node, Text |
paul@341 | 26 | |
paul@341 | 27 | |
paul@341 | 28 | |
paul@341 | 29 | # Token processing employing the tokens from tokenisation. |
paul@341 | 30 | |
class Visitor:

    """
    Build a node tree from tokens, tracking the currently open node.

    Each token dispatches to one of the handler methods below through its own
    'visit' method. New nodes are added as children of the current node, which
    starts out as a Fragment acting as the root of the tree.
    """

    def __init__(self):
        # The root fragment is both the current node and the stack bottom.
        self.node = Fragment()
        self.stack = [self.node]

    def append(self, node):

        "Add 'node' to the children of the current node."

        self.node.nodes.append(node)

    def push(self, node):

        "Add 'node' to the current node and make it the new current node."

        self.stack.append(node)

        # Attach to the parent before switching the current node.

        self.append(node)
        self.node = node

    def pop(self):

        """
        Leave the current node, making its parent the current node again.

        Raise ValueError if only the root remains, leaving the stack intact
        instead of emptying it and failing on the subsequent lookup.
        """

        if len(self.stack) < 2:
            raise ValueError("no open node to close")

        self.stack.pop()
        self.node = self.stack[-1]

    def visit(self, token):

        "Let 'token' call back the handler method appropriate to its kind."

        token.visit(self)

    # Specific handler methods.

    def attribute(self, token):

        """
        Add an attribute named by 'token' to the current element.

        Raise ValueError with the token if the current node is not an Element.
        """

        if isinstance(self.node, Element):
            self.node.attributes.append(Attribute(token.value))
        else:
            raise ValueError(token)

    def attribute_value(self, token):

        """
        Set the value of the most recently added attribute from 'token'.

        Raise ValueError with the token if the current node is not an Element
        or if the element has no attribute to receive the value.
        """

        if isinstance(self.node, Element) and self.node.attributes:
            self.node.attributes[-1].value = AttributeValue(token.value, token.quote)
        else:
            raise ValueError(token)

    def comment(self, token):

        "Add a comment node for 'token' to the current node."

        self.append(Comment(token.value))

    def directive(self, token):

        "Add a directive node for 'token' to the current node."

        self.append(Directive(token.value))

    def inclusion(self, token):

        "Add an inclusion node for 'token' to the current node."

        self.append(Inclusion(token.value))

    def tag(self, token):

        """
        Open a new element for a start tag or close the current one for an
        end tag.

        Raise ValueError with the token if an end tag does not name the
        current node.
        """

        if not token.is_end():
            self.push(Element(token.tag_name()))
        elif self.node.name == token.tag_name():
            self.pop()
        else:
            raise ValueError(token)

    def tag_close(self, token):

        """
        Close the current node without checking any name.

        NOTE(review): presumably issued for self-closing tags; confirm against
        the tokeniser.
        """

        self.pop()

    def text(self, token):

        "Add a text node for 'token' to the current node."

        self.append(Text(token.value))
paul@341 | 87 | |
paul@341 | 88 | |
paul@341 | 89 | |
paul@341 | 90 | # Parsing and document construction. |
paul@341 | 91 | |
class Parser:

    """
    Parse text by feeding the tokens from a Tokeniser to a Visitor.

    The parser is its own iterator: each step consumes one token and yields
    None, the useful result being the tree built up by the visitor.
    """

    def __init__(self, text):
        self.tokeniser = Tokeniser(text)
        self.visitor = Visitor()

    def __iter__(self):
        return self

    def next(self):

        """
        Obtain the next token and have the visitor process it.

        Nothing is returned. Any exception raised by the tokeniser's 'next'
        method propagates; iteration presumably ends with StopIteration from
        there (confirm against the tokeniser).
        """

        token = self.tokeniser.next()
        self.visitor.visit(token)

    # Python 3 looks for __next__ instead of next when iterating.

    __next__ = next

    def parse(self):

        """
        Process all tokens and return the root node of the resulting tree.

        The bottom of the visitor's stack is returned, not its current node,
        so that input leaving elements open still yields the whole tree.
        """

        for _none in self:
            pass

        return self.visitor.stack[0]
paul@341 | 109 | |
paul@341 | 110 | # vim: tabstop=4 expandtab shiftwidth=4 |