1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/moinformat/utils/htmlparse/lex.py Sat Jul 01 00:43:48 2023 +0200
1.3 @@ -0,0 +1,234 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Lexical partitioning of HTML document content.
1.8 +
1.9 +Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT
1.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
1.19 +details.
1.20 +
1.21 +You should have received a copy of the GNU General Public License along with
1.22 +this program. If not, see <http://www.gnu.org/licenses/>.
1.23 +"""
1.24 +
1.25 +# Lexical analysis state transition handler functions.
1.26 +
def tag_or_similar(text, pos):

    """
    Return the state for the construct starting at 'pos' in 'text', where 'pos'
    is the position of the "<" token that caused the transition.

    A construct starting with "<![" is an inclusion, one starting with "<!--"
    is a comment, any other construct starting with "<!" is a directive, and
    anything else is a tag.
    """

    # Anything not introduced by "<!" is treated as a plain tag.

    if not text.startswith("<!", pos):
        return IN_TAG

    # The "<" is at pos and the "!" at pos + 1, so the characters that
    # distinguish the kind of construct start at pos + 2.

    if text.startswith("[", pos + 2):
        return IN_INCLUSION
    elif text.startswith("--", pos + 2):
        return IN_COMMENT
    else:
        return IN_DIRECTIVE
1.40 +
def at_attribute_value(text, pos):

    """
    Transition handler selecting the state for the span following an "="
    token. The 'text' and 'pos' arguments are part of the handler signature
    but are not consulted here.
    """

    return AT_ATTRIBUTE_VALUE
1.43 +
def in_dq_attribute_value(text, pos):

    """
    Transition handler selecting the state for the span following an opening
    double quote. The 'text' and 'pos' arguments are part of the handler
    signature but are not consulted here.
    """

    return IN_DQ_ATTRIBUTE_VALUE
1.46 +
def in_sq_attribute_value(text, pos):

    """
    Transition handler selecting the state for the span following an opening
    single quote. The 'text' and 'pos' arguments are part of the handler
    signature but are not consulted here.
    """

    return IN_SQ_ATTRIBUTE_VALUE
1.49 +
def after_attribute_value(text, pos):

    """
    Transition handler selecting the state for the span following the closing
    quote of an attribute value. The 'text' and 'pos' arguments are part of the
    handler signature but are not consulted here.
    """

    return AFTER_ATTRIBUTE_VALUE
1.52 +
def end_of_standalone_tag(text, pos):

    """
    Transition handler selecting the state for the span following a "/>"
    token. The 'text' and 'pos' arguments are part of the handler signature
    but are not consulted here.
    """

    return AT_END_OF_TAG
1.55 +
def end_of_tag(text, pos):

    """
    Transition handler selecting the state for the span following the end of a
    tag, comment, directive or inclusion: content between tags. The 'text' and
    'pos' arguments are part of the handler signature but are not consulted
    here.
    """

    return BETWEEN_TAGS
1.58 +
1.59 +
1.60 +
1.61 +# Lexical analysis states/spans.
1.62 +
class Span:

    """
    A lexical span: a portion of text labelled by the class used to
    instantiate it.
    """

    def __init__(self, text):
        self.text = text

    def empty(self):

        "Return whether this span holds no text at all."

        if self.text:
            return False
        return True

    def __repr__(self):

        # Show the concrete span class together with the retained text.

        name = self.__class__.__name__
        return "%s(%r)" % (name, self.text)
1.72 +
class AT_END_OF_TAG(Span):

    """
    The state selected after a "/>" token. Its only transition has no token
    and therefore applies immediately, leading to the end_of_tag handler.
    """

    transitions = [
        (None, "", end_of_tag),
        ]

    def empty(self):

        "Never regard this span as empty, regardless of the text it holds."

        return False

    def visit(self, visitor):
        return visitor.at_end_of_tag(self)
1.81 +
class BETWEEN_TAGS(Span):

    """
    Content outside any tag. A "<" token ends the span, with tag_or_similar
    deciding which kind of construct follows.
    """

    transitions = [
        ("<", "", tag_or_similar),
        ]

    def visit(self, visitor):
        return visitor.between_tags(self)
1.87 +
class IN_TAG(Span):

    """
    The interior of a tag up to the first "=" token or the end of the tag,
    whichever occurs first.
    """

    transitions = [
        ("=",  "", at_attribute_value),
        ("/>", "", end_of_standalone_tag),
        (">",  "", end_of_tag),
        ]

    def visit(self, visitor):
        return visitor.in_tag(self)
1.97 +
class IN_COMMENT(Span):

    """
    The interior of a comment, ended by "-->". The "--" portion of the
    terminating token is added to the end of the span text.
    """

    transitions = [
        ("-->", "--", end_of_tag),
        ]

    def visit(self, visitor):
        return visitor.in_comment(self)
1.103 +
class IN_DIRECTIVE(Span):

    "The interior of a directive, ended by the first \">\" token."

    transitions = [
        (">", "", end_of_tag),
        ]

    def visit(self, visitor):
        return visitor.in_directive(self)
1.109 +
class IN_INCLUSION(Span):

    """
    The interior of an inclusion, ended by "]]>". The "]]" portion of the
    terminating token is added to the end of the span text.
    """

    transitions = [
        ("]]>", "]]", end_of_tag),
        ]

    def visit(self, visitor):
        return visitor.in_inclusion(self)
1.115 +
class AFTER_ATTRIBUTE_VALUE(Span):

    """
    The part of a tag following a quoted attribute value, up to the next "="
    token or the end of the tag, whichever occurs first.
    """

    transitions = [
        ("=",  "", at_attribute_value),
        ("/>", "", end_of_standalone_tag),
        (">",  "", end_of_tag),
        ]

    def empty(self):

        "Return whether this span holds nothing other than whitespace."

        remaining = self.text.strip()
        return not remaining

    def visit(self, visitor):
        return visitor.after_attribute_value(self)
1.128 +
class AT_ATTRIBUTE_VALUE(Span):

    """
    The part of a tag following a "=" token, up to an opening quote, another
    "=" token or the end of the tag, whichever occurs first.
    """

    transitions = [
        ("=",  "", at_attribute_value),
        ('"',  "", in_dq_attribute_value),
        ("'",  "", in_sq_attribute_value),
        ("/>", "", end_of_standalone_tag),
        (">",  "", end_of_tag),
        ]

    def empty(self):

        "Return whether this span holds nothing other than whitespace."

        remaining = self.text.strip()
        return not remaining

    def visit(self, visitor):
        return visitor.at_attribute_value(self)
1.143 +
class IN_DQ_ATTRIBUTE_VALUE(Span):

    "The content of a double-quoted attribute value, ended by the next '\"'."

    transitions = [
        ('"', "", after_attribute_value),
        ]

    def visit(self, visitor):
        return visitor.in_dq_attribute_value(self)
1.149 +
class IN_SQ_ATTRIBUTE_VALUE(Span):

    "The content of a single-quoted attribute value, ended by the next \"'\"."

    transitions = [
        ("'", "", after_attribute_value),
        ]

    def visit(self, visitor):
        return visitor.in_sq_attribute_value(self)
1.155 +
1.156 +
1.157 +
1.158 +# Utility functions.
1.159 +
def find_one(text, pos, choices):

    """
    Search 'text' from 'pos' for the earliest occurrence of any token in
    'choices', a list of (token string, extra string, state) tuples.

    Each token string marks the start of the next span, each extra string is
    the portion of the token to be appended to the current span upon matching,
    and each state applies to the next span. A token of None matches
    immediately at 'pos' without any searching.

    Return a tuple of the form (state, position of the occurrence, extra
    string, position of the text following the occurrence). Where several
    tokens occur at the same position, the one listed first is chosen. Where
    no token occurs, every element of the tuple is None.
    """

    best = (None, None, None, None)

    for token, extra, state in choices:

        # An absent token represents an unconditional transition.

        if token is None:
            return state, pos, extra, pos

        start = text.find(token, pos)

        if start == -1:
            continue

        # Retain the first match or any strictly earlier one.

        if best[0] is None or start < best[1]:
            best = (state, start, extra, start + len(token))

    return best
1.193 +
1.194 +
1.195 +
1.196 +# Lexical partitioning.
1.197 +
class Lexer:

    """
    An iterator partitioning 'text' into lexical spans. Each span is an
    instance of the state class in effect for the portion of text it holds.

    Both the Python 2 ('next') and Python 3 ('__next__') iterator protocols are
    supported.
    """

    def __init__(self, text):
        self.text = text
        self.state = BETWEEN_TAGS

        # The position of the next unconsumed character, or None once all
        # input has been consumed.

        self.pos = 0

    def _end_of_input(self):

        """
        Return the remaining text as a final span in the current state and mark
        the input as exhausted.
        """

        start = self.pos
        self.pos = None
        return self._span(self.text[start:])

    def _span(self, text):

        "Return a span of the current state holding the given 'text'."

        return self.state(text)

    def __iter__(self):
        return self

    def next(self):

        """
        Return the next span, raising StopIteration once the final span has
        already been returned.
        """

        if self.pos is None:
            raise StopIteration

        # Obtain details of a state transition: a handler function to determine
        # the next state, the start position of the token causing the
        # transition, any portion of the token belonging to the current span,
        # and the position following the token.

        handler, pos, extra, next_pos = find_one(self.text, self.pos, self.state.transitions)

        # Without any transition, the rest of the text forms the final span.

        if handler is None:
            return self._end_of_input()

        # Obtain the lexical span and update the state and position.

        span = self._span(self.text[self.pos:pos] + extra)

        self.state = handler(self.text, pos)
        self.pos = next_pos

        return span

    # Python 3 looks for __next__ instead of next when iterating.

    __next__ = next
1.236 +
1.237 +# vim: tabstop=4 expandtab shiftwidth=4