paul@341 | 1 | #!/usr/bin/env python |
paul@341 | 2 | |
paul@341 | 3 | """ |
paul@341 | 4 | An absurdly minimal HTML tokeniser. |
paul@341 | 5 | |
paul@341 | 6 | Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk> |
paul@341 | 7 | |
paul@341 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@341 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@341 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@341 | 11 | version. |
paul@341 | 12 | |
paul@341 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT |
paul@341 | 14 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
paul@341 | 15 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
paul@341 | 16 | details. |
paul@341 | 17 | |
paul@341 | 18 | You should have received a copy of the GNU General Public License along with |
paul@341 | 19 | this program. If not, see <http://www.gnu.org/licenses/>. |
paul@341 | 20 | """ |
paul@341 | 21 | |
paul@341 | 22 | from collections import deque |
paul@341 | 23 | from moinformat.utils.htmlparse.lex import Lexer |
paul@341 | 24 | |
paul@341 | 25 | |
paul@341 | 26 | |
paul@341 | 27 | # Document token classes. |
paul@341 | 28 | |
class Token:
    """
    Base class for document tokens, each carrying a single value.
    """

    def __init__(self, value):
        """Initialise the token with the given 'value'."""
        self.value = value

    def __repr__(self):
        """Return a representation of the form ClassName(value)."""
        # The class is obtained via __class__ so that subclasses report
        # their own names.
        name = self.__class__.__name__
        return "%s(%r)" % (name, self.value)
paul@341 | 35 | |
class Attribute(Token):
    """
    A token representing an attribute found within a tag.
    """

    def visit(self, visitor):
        """
        Dispatch this token to the 'attribute' handler of 'visitor',
        returning the handler's result.
        """
        handler = visitor.attribute
        return handler(self)
paul@341 | 39 | |
class AttributeValue(Token):
    """
    A token holding an attribute value together with the quote character
    that delimited it (an empty string where the value was unquoted).
    """

    def __init__(self, value, quote):
        """Initialise the token with the given 'value' and 'quote'."""
        self.quote = quote
        self.value = value

    def __repr__(self):
        """Return a representation of the form ClassName(value, quote)."""
        name = self.__class__.__name__
        return "%s(%r, %r)" % (name, self.value, self.quote)

    def visit(self, visitor):
        """
        Dispatch this token to the 'attribute_value' handler of 'visitor',
        returning the handler's result.
        """
        handler = visitor.attribute_value
        return handler(self)
paul@341 | 50 | |
class Comment(Token):
    """
    A token holding the text of a comment.
    """

    def visit(self, visitor):
        """
        Dispatch this token to the 'comment' handler of 'visitor',
        returning the handler's result.
        """
        handler = visitor.comment
        return handler(self)
paul@341 | 54 | |
class Directive(Token):
    """
    A token holding the text of a directive.
    """

    def visit(self, visitor):
        """
        Dispatch this token to the 'directive' handler of 'visitor',
        returning the handler's result.
        """
        handler = visitor.directive
        return handler(self)
paul@341 | 58 | |
class Inclusion(Token):
    """
    A token holding the text of an inclusion.
    """

    def visit(self, visitor):
        """
        Dispatch this token to the 'inclusion' handler of 'visitor',
        returning the handler's result.
        """
        handler = visitor.inclusion
        return handler(self)
paul@341 | 62 | |
class Tag(Token):
    """
    A token representing the start of a tag, its value being the tag name
    prefixed with "/" in the case of an end tag.
    """

    def visit(self, visitor):
        """
        Dispatch this token to the 'tag' handler of 'visitor', returning the
        handler's result.
        """
        return visitor.tag(self)

    def is_end(self):
        """Return whether the value starts with "/", indicating an end tag."""
        return self.value.startswith("/")

    def tag_name(self):
        """
        Return the tag name without any leading "/" end tag marker.

        For a value consisting only of "/", the empty string is returned.
        """
        # An explicit conditional is used instead of "cond and a or b", which
        # would fall through to the unstripped value when the stripped name
        # is empty.
        if self.is_end():
            return self.value[1:]
        return self.value
paul@341 | 72 | |
class TagClose:
    """
    A valueless token marking the end of a tag.
    """

    def __repr__(self):
        """Return a representation of the form ClassName()."""
        name = self.__class__.__name__
        return "%s()" % name

    def visit(self, visitor):
        """
        Dispatch this token to the 'tag_close' handler of 'visitor',
        returning the handler's result.
        """
        handler = visitor.tag_close
        return handler(self)
paul@341 | 79 | |
class Text(Token):
    """
    A token holding text.
    """

    def visit(self, visitor):
        """
        Dispatch this token to the 'text' handler of 'visitor', returning
        the handler's result.
        """
        handler = visitor.text
        return handler(self)
paul@341 | 83 | |
paul@341 | 84 | |
paul@341 | 85 | |
paul@341 | 86 | # Tidying visitor employing the spans from lexical partitioning. |
paul@341 | 87 | |
class Visitor:
    """
    A visitor converting spans into document tokens.

    Each handler accepts a span providing a 'text' attribute and returns a
    single token. Where a span yields several tokens, the additional ones
    are held in the 'queued' collection for later retrieval.

    NOTE(review): the handler names presumably correspond to the states
    reported by the lexer's spans - confirm against the lexer module.
    """

    def __init__(self):
        """Initialise the visitor with an empty queue of pending tokens."""
        self.queued = deque()

    def visit(self, span):
        """Ask 'span' to dispatch to this visitor, returning the result."""
        return span.visit(self)

    # Specific handler methods.

    def between_tags(self, span):
        """Return a text token for the text of 'span'."""
        return Text(span.text)

    def in_comment(self, span):
        """Return a comment token for the text of 'span'."""
        return Comment(span.text)

    def in_directive(self, span):
        """Return a directive token for the text of 'span'."""
        return Directive(span.text)

    def in_inclusion(self, span):
        """Return an inclusion token for the text of 'span'."""
        return Inclusion(span.text)

    def _queue_attributes(self, tokens):
        """Queue an attribute token for each of the given 'tokens'."""
        self.queued.extend(Attribute(word) for word in tokens)

    def in_tag(self, span):
        """
        Split the text of 'span' on whitespace, returning a tag token for the
        first word and queuing the remaining words as attributes.
        """
        words = span.text.split()
        remaining = words[1:]
        self._queue_attributes(remaining)
        return Tag(words[0])

    def at_end_of_tag(self, span):
        """Return a tag close token, disregarding the content of 'span'."""
        return TagClose()

    def after_attribute_value(self, span):
        """
        Split the text of 'span' on whitespace, queue every word as an
        attribute, and return the token at the front of the queue.
        """
        self._queue_attributes(span.text.split())
        return self.queued.popleft()

    def at_attribute_value(self, span):
        """
        Split the text of 'span' on whitespace, returning an unquoted
        attribute value token for the first word and queuing the remaining
        words as attributes.
        """
        words = span.text.split()
        remaining = words[1:]
        self._queue_attributes(remaining)
        return AttributeValue(words[0], "")

    def in_dq_attribute_value(self, span):
        """Return a double-quoted attribute value for the text of 'span'."""
        return AttributeValue(span.text, '"')

    def in_sq_attribute_value(self, span):
        """Return a single-quoted attribute value for the text of 'span'."""
        return AttributeValue(span.text, "'")
paul@341 | 136 | |
paul@341 | 137 | |
paul@341 | 138 | |
paul@341 | 139 | # Tokenising. |
paul@341 | 140 | |
class Tokeniser:
    """
    An iterator over the document tokens found in a text.

    Spans are obtained from a lexer and converted to tokens by a visitor,
    with any tokens queued by the visitor being delivered before further
    spans are requested.
    """

    def __init__(self, text):
        """Initialise the tokeniser for the given 'text'."""
        self.lexer = Lexer(text)
        self.visitor = Visitor()

    def __iter__(self):
        """Return this object as its own iterator."""
        return self

    def next(self):
        """
        Return the next token.

        Queued tokens are returned first. Otherwise, spans are read from the
        lexer until a non-empty one is found, and the token produced for it
        by the visitor is returned.

        Any exception raised by the lexer propagates to the caller.
        NOTE(review): presumably the lexer raises StopIteration at the end of
        the text, terminating iteration - confirm against the lexer module.
        """
        pending = self.visitor.queued
        if pending:
            return pending.popleft()

        while True:
            span = self.lexer.next()
            if not span.empty():
                return self.visitor.visit(span)

    # Support the Python 3 iterator protocol alongside the Python 2 method.

    __next__ = next
paul@341 | 159 | |
paul@341 | 160 | # vim: tabstop=4 expandtab shiftwidth=4 |