1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/moinformat/utils/htmlparse/token.py Sat Jul 01 00:43:48 2023 +0200
1.3 @@ -0,0 +1,160 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +An absurdly minimal HTML tokeniser.
1.8 +
1.9 +Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT
1.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
1.19 +details.
1.20 +
1.21 +You should have received a copy of the GNU General Public License along with
1.22 +this program. If not, see <http://www.gnu.org/licenses/>.
1.23 +"""
1.24 +
1.25 +from collections import deque
1.26 +from moinformat.utils.htmlparse.lex import Lexer
1.27 +
1.28 +
1.29 +
1.30 +# Document token classes.
1.31 +
class Token:

    "The common base of document tokens: a wrapper around a single value."

    def __init__(self, value):
        "Initialise the token with the given 'value'."
        self.value = value

    def __repr__(self):
        "Return the concrete class name followed by the repr of the value."
        # self.__class__ is consulted so that subclasses report their own name.
        return "{0}({1!r})".format(self.__class__.__name__, self.value)
1.38 +
class Attribute(Token):

    "A token wrapping one whitespace-delimited word found inside a tag."

    def visit(self, visitor):
        "Hand this token to the 'attribute' method of 'visitor'."
        handler = visitor.attribute
        return handler(self)
1.42 +
class AttributeValue(Token):

    "A token holding an attribute value together with its quoting character."

    def __init__(self, value, quote):

        """
        Initialise the token with the given 'value' and the 'quote' string
        that delimited it (empty where no quoting was used).
        """

        Token.__init__(self, value)
        self.quote = quote

    def __repr__(self):
        "Return the class name followed by the reprs of the value and quote."
        details = (self.__class__.__name__, self.value, self.quote)
        return "%s(%r, %r)" % details

    def visit(self, visitor):
        "Hand this token to the 'attribute_value' method of 'visitor'."
        handler = visitor.attribute_value
        return handler(self)
1.53 +
class Comment(Token):

    "A token wrapping the text of a comment span."

    def visit(self, visitor):
        "Hand this token to the 'comment' method of 'visitor'."
        handler = visitor.comment
        return handler(self)
1.57 +
class Directive(Token):

    "A token wrapping the text of a directive span."

    def visit(self, visitor):
        "Hand this token to the 'directive' method of 'visitor'."
        handler = visitor.directive
        return handler(self)
1.61 +
class Inclusion(Token):

    "A token wrapping the text of an inclusion span."

    def visit(self, visitor):
        "Hand this token to the 'inclusion' method of 'visitor'."
        handler = visitor.inclusion
        return handler(self)
1.65 +
class Tag(Token):

    """
    A token wrapping the first word of a tag: the tag name, preceded by "/"
    where the tag is an end tag.
    """

    def visit(self, visitor):
        "Hand this token to the 'tag' method of 'visitor'."
        return visitor.tag(self)

    def is_end(self):
        "Return whether the value starts with '/', indicating an end tag."
        return self.value.startswith("/")

    def tag_name(self):

        """
        Return the tag name without any leading "/" end tag marker.

        For a value consisting only of "/", the name is the empty string.
        """

        # A conditional expression is used instead of "x and y or z" because
        # the latter yields z whenever y is false, which would return the
        # bare "/" as the name of an end tag having no name.

        return self.value[1:] if self.is_end() else self.value
1.75 +
class TagClose:

    "A valueless token produced at the end of a tag."

    def visit(self, visitor):
        "Hand this token to the 'tag_close' method of 'visitor'."
        handler = visitor.tag_close
        return handler(self)

    def __repr__(self):
        "Return the class name followed by an empty argument list."
        return self.__class__.__name__ + "()"
1.82 +
class Text(Token):

    "A token wrapping the text of a span found between tags."

    def visit(self, visitor):
        "Hand this token to the 'text' method of 'visitor'."
        handler = visitor.text
        return handler(self)
1.86 +
1.87 +
1.88 +
1.89 +# Tidying visitor employing the spans from lexical partitioning.
1.90 +
class Visitor:

    """
    A converter of lexical spans into document tokens.

    Each handler receives a span, reads its 'text' attribute and returns one
    token. Where a span yields several tokens, the extra ones are appended to
    the 'queued' collection for later retrieval by the consumer.
    """

    def __init__(self):
        "Initialise the visitor with an empty queue of pending tokens."
        self.queued = deque()

    def visit(self, span):
        "Let 'span' select the appropriate handler, returning its result."
        return span.visit(self)

    # Handlers wrapping the entire span text in a single token.

    def between_tags(self, span):
        "Return a text token for 'span'."
        return Text(span.text)

    def in_comment(self, span):
        "Return a comment token for 'span'."
        return Comment(span.text)

    def in_directive(self, span):
        "Return a directive token for 'span'."
        return Directive(span.text)

    def in_inclusion(self, span):
        "Return an inclusion token for 'span'."
        return Inclusion(span.text)

    # Handlers splitting the span text into whitespace-delimited words.

    def _queue_attributes(self, tokens):
        "Queue an attribute token for each of the given 'tokens'."
        self.queued.extend(Attribute(word) for word in tokens)

    def in_tag(self, span):

        """
        Return a tag token for the first word in 'span', queuing attribute
        tokens for any words following it.
        """

        words = span.text.split()
        following = words[1:]
        self._queue_attributes(following)
        return Tag(words[0])

    def at_end_of_tag(self, span):
        "Return a tag closing token, ignoring the content of 'span'."
        return TagClose()

    def after_attribute_value(self, span):

        """
        Queue attribute tokens for every word in 'span', returning the token
        at the front of the queue.
        """

        self._queue_attributes(span.text.split())
        return self.queued.popleft()

    def at_attribute_value(self, span):

        """
        Return an unquoted attribute value token for the first word in
        'span', queuing attribute tokens for any words following it.
        """

        words = span.text.split()
        following = words[1:]
        self._queue_attributes(following)
        return AttributeValue(words[0], "")

    # Handlers for quoted values, preserving the span text verbatim.

    def in_dq_attribute_value(self, span):
        "Return a double-quoted attribute value token for 'span'."
        return AttributeValue(span.text, '"')

    def in_sq_attribute_value(self, span):
        "Return a single-quoted attribute value token for 'span'."
        return AttributeValue(span.text, "'")
1.139 +
1.140 +
1.141 +
1.142 +# Tokenising.
1.143 +
class Tokeniser:

    """
    An iterator yielding document tokens for a piece of text.

    Spans are obtained from a lexer and converted to tokens by a visitor.
    Tokens queued by the visitor are yielded before further spans are read.
    """

    def __init__(self, text):
        "Initialise the tokeniser with a lexer for 'text' and a new visitor."
        self.lexer = Lexer(text)
        self.visitor = Visitor()

    def __iter__(self):
        "Return this object as its own iterator."
        return self

    def next(self):

        """
        Return the next token.

        Any exception raised by the lexer's 'next' method is not caught here
        and so reaches the caller (presumably signalling the end of input -
        to be confirmed against the lexer).
        """

        # Tokens already produced by the visitor take priority.

        if self.visitor.queued:
            return self.visitor.queued.popleft()

        # Skip spans reporting themselves as empty.

        while 1:
            span = self.lexer.next()
            if not span.empty():
                break

        return self.visitor.visit(span)

    # Support the Python 3 iterator protocol alongside the Python 2 method,
    # since __iter__ returns this object and thus requires it.

    __next__ = next
1.162 +
1.163 +# vim: tabstop=4 expandtab shiftwidth=4