paul@341 | 1 | #!/usr/bin/env python |
paul@341 | 2 | |
paul@341 | 3 | """ |
paul@341 | 4 | Lexical partitioning of HTML document content. |
paul@341 | 5 | |
paul@341 | 6 | Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk> |
paul@341 | 7 | |
paul@341 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@341 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@341 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@341 | 11 | version. |
paul@341 | 12 | |
paul@341 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT |
paul@341 | 14 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
paul@341 | 15 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
paul@341 | 16 | details. |
paul@341 | 17 | |
paul@341 | 18 | You should have received a copy of the GNU General Public License along with |
paul@341 | 19 | this program. If not, see <http://www.gnu.org/licenses/>. |
paul@341 | 20 | """ |
paul@341 | 21 | |
paul@341 | 22 | # Lexical analysis state transition handler functions. |
paul@341 | 23 | |
def tag_or_similar(text, pos):

    """
    Return the state for the construct introduced by the "<" token found at
    'pos' in 'text'.

    A construct starting with "<![" yields IN_INCLUSION, one starting with
    "<!--" yields IN_COMMENT, any other construct starting with "<!" yields
    IN_DIRECTIVE, and anything else yields IN_TAG.
    """

    # The "<" occupies 'pos' and the "!" occupies 'pos' + 1, so the characters
    # distinguishing inclusions and comments start at 'pos' + 2. Testing the
    # complete prefixes from 'pos' avoids any offset arithmetic.

    if text.startswith("<![", pos):
        return IN_INCLUSION
    elif text.startswith("<!--", pos):
        return IN_COMMENT
    elif text.startswith("<!", pos):
        return IN_DIRECTIVE
    else:
        return IN_TAG
paul@341 | 37 | |
def at_attribute_value(text, pos):

    """
    Return the state applying after an "=" token: the position where an
    attribute value is expected. The 'text' and 'pos' arguments are accepted
    for uniformity with other handlers but are not consulted.
    """

    return AT_ATTRIBUTE_VALUE
paul@341 | 40 | |
def in_dq_attribute_value(text, pos):

    """
    Return the state applying after an opening double quote: the interior of
    a double-quoted attribute value. The 'text' and 'pos' arguments are not
    consulted.
    """

    return IN_DQ_ATTRIBUTE_VALUE
paul@341 | 43 | |
def in_sq_attribute_value(text, pos):

    """
    Return the state applying after an opening single quote: the interior of
    a single-quoted attribute value. The 'text' and 'pos' arguments are not
    consulted.
    """

    return IN_SQ_ATTRIBUTE_VALUE
paul@341 | 46 | |
def after_attribute_value(text, pos):

    """
    Return the state applying after the quote closing an attribute value. The
    'text' and 'pos' arguments are not consulted.
    """

    return AFTER_ATTRIBUTE_VALUE
paul@341 | 49 | |
def end_of_standalone_tag(text, pos):

    """
    Return the state applying after a "/>" token, this being an intermediate
    state marking the end of a standalone tag. The 'text' and 'pos' arguments
    are not consulted.
    """

    return AT_END_OF_TAG
paul@341 | 52 | |
def end_of_tag(text, pos):

    """
    Return the state applying once a tag, comment, directive or inclusion has
    been closed: the region between tags. The 'text' and 'pos' arguments are
    not consulted.
    """

    return BETWEEN_TAGS
paul@341 | 55 | |
paul@341 | 56 | |
paul@341 | 57 | |
paul@341 | 58 | # Lexical analysis states/spans. |
paul@341 | 59 | |
class Span:

    """
    A portion of text associated with a lexical state. Subclasses represent
    the individual states.
    """

    def __init__(self, text):

        "Initialise the span with the given 'text'."

        self.text = text

    def empty(self):

        "Return whether the span has no text at all."

        if self.text:
            return False
        return True

    def __repr__(self):

        "Return a representation showing the state name and the span text."

        state_name = self.__class__.__name__
        return "%s(%r)" % (state_name, self.text)
paul@341 | 69 | |
class AT_END_OF_TAG(Span):

    """
    The state entered upon encountering the "/>" token ending a standalone
    tag.
    """

    # A token of None makes the transition unconditional: find_one yields the
    # current position without consuming any text.

    transitions = [
        (None, "", end_of_tag),
        ]

    def empty(self):

        """
        Return false always. Spans in this state are produced by the
        unconditional transition and thus carry no text, so the inherited test
        would always regard them as empty.
        """

        return False

    def visit(self, visitor):

        "Invoke the 'at_end_of_tag' method of 'visitor' with this span."

        return visitor.at_end_of_tag(self)
paul@341 | 78 | |
class BETWEEN_TAGS(Span):

    """
    The state applying to text outside tags and similar constructs, ending at
    the next "<" token.
    """

    # Each transition is a (token, extra, handler) tuple: the handler chooses
    # the next state by inspecting the text at the token.

    transitions = [
        ("<", "", tag_or_similar),
        ]

    def visit(self, visitor):

        "Invoke the 'between_tags' method of 'visitor' with this span."

        return visitor.between_tags(self)
paul@341 | 84 | |
class IN_TAG(Span):

    """
    The state applying inside a tag before any attribute value has been
    encountered.
    """

    # Each transition is a (token, extra, handler) tuple. The earliest
    # occurring token in the text determines the transition taken.

    transitions = [
        ("=", "", at_attribute_value),      # start of an attribute value
        ("/>", "", end_of_standalone_tag),  # end of a standalone tag
        (">", "", end_of_tag),              # end of the tag
        ]

    def visit(self, visitor):

        "Invoke the 'in_tag' method of 'visitor' with this span."

        return visitor.in_tag(self)
paul@341 | 94 | |
class IN_COMMENT(Span):

    "The state applying inside a comment, ending at the next '-->' token."

    # The extra string causes the "--" of the closing token to be appended to
    # the text of the comment span.

    transitions = [
        ("-->", "--", end_of_tag),
        ]

    def visit(self, visitor):

        "Invoke the 'in_comment' method of 'visitor' with this span."

        return visitor.in_comment(self)
paul@341 | 100 | |
class IN_DIRECTIVE(Span):

    "The state applying inside a directive, ending at the next '>' token."

    transitions = [
        (">", "", end_of_tag),
        ]

    def visit(self, visitor):

        "Invoke the 'in_directive' method of 'visitor' with this span."

        return visitor.in_directive(self)
paul@341 | 106 | |
class IN_INCLUSION(Span):

    "The state applying inside an inclusion, ending at the next ']]>' token."

    # The extra string causes the "]]" of the closing token to be appended to
    # the text of the inclusion span.

    transitions = [
        ("]]>", "]]", end_of_tag),
        ]

    def visit(self, visitor):

        "Invoke the 'in_inclusion' method of 'visitor' with this span."

        return visitor.in_inclusion(self)
paul@341 | 112 | |
class AFTER_ATTRIBUTE_VALUE(Span):

    """
    The state applying inside a tag after the closing quote of an attribute
    value.
    """

    # Each transition is a (token, extra, handler) tuple. The earliest
    # occurring token in the text determines the transition taken.

    transitions = [
        ("=", "", at_attribute_value),      # start of another attribute value
        ("/>", "", end_of_standalone_tag),  # end of a standalone tag
        (">", "", end_of_tag),              # end of the tag
        ]

    def empty(self):

        "Return whether the span has no text other than whitespace."

        remaining = self.text.strip()
        return not remaining

    def visit(self, visitor):

        "Invoke the 'after_attribute_value' method of 'visitor' with this span."

        return visitor.after_attribute_value(self)
paul@341 | 125 | |
class AT_ATTRIBUTE_VALUE(Span):

    """
    The state applying inside a tag after an "=" token, where an attribute
    value is expected.
    """

    # Each transition is a (token, extra, handler) tuple. The earliest
    # occurring token in the text determines the transition taken.

    transitions = [
        ("=", "", at_attribute_value),      # another "=" token
        ('"', "", in_dq_attribute_value),   # start of a double-quoted value
        ("'", "", in_sq_attribute_value),   # start of a single-quoted value
        ("/>", "", end_of_standalone_tag),  # end of a standalone tag
        (">", "", end_of_tag),              # end of the tag
        ]

    def empty(self):

        "Return whether the span has no text other than whitespace."

        remaining = self.text.strip()
        return not remaining

    def visit(self, visitor):

        "Invoke the 'at_attribute_value' method of 'visitor' with this span."

        return visitor.at_attribute_value(self)
paul@341 | 140 | |
class IN_DQ_ATTRIBUTE_VALUE(Span):

    """
    The state applying inside a double-quoted attribute value, ending at the
    next double quote.
    """

    transitions = [
        ('"', "", after_attribute_value),
        ]

    def visit(self, visitor):

        "Invoke the 'in_dq_attribute_value' method of 'visitor' with this span."

        return visitor.in_dq_attribute_value(self)
paul@341 | 146 | |
class IN_SQ_ATTRIBUTE_VALUE(Span):

    """
    The state applying inside a single-quoted attribute value, ending at the
    next single quote.
    """

    transitions = [
        ("'", "", after_attribute_value),
        ]

    def visit(self, visitor):

        "Invoke the 'in_sq_attribute_value' method of 'visitor' with this span."

        return visitor.in_sq_attribute_value(self)
paul@341 | 152 | |
paul@341 | 153 | |
paul@341 | 154 | |
paul@341 | 155 | # Utility functions. |
paul@341 | 156 | |
def find_one(text, pos, choices):

    """
    Search 'text' from 'pos' for whichever of the given 'choices' occurs
    earliest, each choice being a (token string, extra string, state) tuple.

    The token string marks the start of the next span, the extra string is the
    portion of the token to be appended to the current span upon matching, and
    the state applies to the next span.

    A choice whose token is None matches unconditionally at 'pos' as soon as
    it is reached, consuming no text.

    Return a tuple of the form (state, position of the occurrence, extra
    string, position of the text following the occurrence). Where nothing
    matches, every element of the tuple is None.
    """

    # The best match so far: (state, start, extra, end).

    best = (None, None, None, None)

    for token, extra, state in choices:

        # Unconditional transition: no searching and no text consumed.

        if token is None:
            return state, pos, extra, pos

        index = text.find(token, pos)

        if index == -1:
            continue

        # Only a strictly earlier occurrence replaces the best match, so the
        # first listed choice wins where positions are equal.

        if best[0] is None or index < best[1]:
            best = (state, index, extra, index + len(token))

    return best
paul@341 | 190 | |
paul@341 | 191 | |
paul@341 | 192 | |
paul@341 | 193 | # Lexical partitioning. |
paul@341 | 194 | |
class Lexer:

    """
    An iterator partitioning text into lexical spans, each span being an
    instance of the state class applying to its portion of the text.
    """

    def __init__(self, text):

        "Initialise the lexer with the given 'text', starting between tags."

        self.text = text
        self.state = BETWEEN_TAGS
        self.pos = 0

    def _end_of_input(self):

        """
        Return a span for the remaining text and mark the input as exhausted
        so that the next iteration step terminates.
        """

        start = self.pos
        self.pos = None
        return self._span(self.text[start:])

    def _span(self, text):

        "Return a span for 'text' employing the current state."

        return self.state(text)

    def __iter__(self):

        "Return this object as its own iterator."

        return self

    def next(self):

        """
        Return the next span from the text, raising StopIteration when the
        input has been exhausted.
        """

        if self.pos is None:
            raise StopIteration

        # Obtain details of a state transition: a handler function to determine
        # the next state, and the start and end positions of the token causing
        # the transition.

        handler, pos, extra, next_pos = find_one(self.text, self.pos, self.state.transitions)

        # Without any transition, the rest of the text belongs to this state.

        if handler is None:
            return self._end_of_input()

        # Obtain the lexical span and update the state and position.

        span = self._span(self.text[self.pos:pos] + extra)

        self.state = handler(self.text, pos)
        self.pos = next_pos

        return span

    # Support the Python 3 iterator protocol in addition to the Python 2
    # protocol provided by the 'next' method.

    __next__ = next
paul@341 | 233 | |
paul@341 | 234 | # vim: tabstop=4 expandtab shiftwidth=4 |