1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/moinformat/utils/htmlparse/lex.py	Sat Jul 01 00:43:48 2023 +0200
     1.3 @@ -0,0 +1,234 @@
     1.4 +#!/usr/bin/env python
     1.5 +
     1.6 +"""
     1.7 +Lexical partitioning of HTML document content.
     1.8 +
     1.9 +Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
    1.10 +
    1.11 +This program is free software; you can redistribute it and/or modify it under
    1.12 +the terms of the GNU General Public License as published by the Free Software
    1.13 +Foundation; either version 3 of the License, or (at your option) any later
    1.14 +version.
    1.15 +
    1.16 +This program is distributed in the hope that it will be useful, but WITHOUT
    1.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    1.18 +FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
    1.19 +details.
    1.20 +
    1.21 +You should have received a copy of the GNU General Public License along with
    1.22 +this program.  If not, see <http://www.gnu.org/licenses/>.
    1.23 +"""
    1.24 +
    1.25 +# Lexical analysis state transition handler functions.
    1.26 +
    1.27 +def tag_or_similar(text, pos):
    1.28 +
    1.29 +    # Consult the text positions following the position indicated.
    1.30 +
    1.31 +    if text[pos:pos+2] == "<!":
    1.32 +        if text[pos+3:pos+4] == "[":
    1.33 +            return IN_INCLUSION
    1.34 +        elif text[pos+3:pos+5] == "--":
    1.35 +            return IN_COMMENT
    1.36 +        else:
    1.37 +            return IN_DIRECTIVE
    1.38 +    else:
    1.39 +        return IN_TAG
    1.40 +
    1.41 +def at_attribute_value(text, pos):
    1.42 +    return AT_ATTRIBUTE_VALUE
    1.43 +
    1.44 +def in_dq_attribute_value(text, pos):
    1.45 +    return IN_DQ_ATTRIBUTE_VALUE
    1.46 +
    1.47 +def in_sq_attribute_value(text, pos):
    1.48 +    return IN_SQ_ATTRIBUTE_VALUE
    1.49 +
    1.50 +def after_attribute_value(text, pos):
    1.51 +    return AFTER_ATTRIBUTE_VALUE
    1.52 +
    1.53 +def end_of_standalone_tag(text, pos):
    1.54 +    return AT_END_OF_TAG
    1.55 +
    1.56 +def end_of_tag(text, pos):
    1.57 +    return BETWEEN_TAGS
    1.58 +
    1.59 +
    1.60 +
    1.61 +# Lexical analysis states/spans.
    1.62 +
    1.63 +class Span:
    1.64 +    def __init__(self, text):
    1.65 +        self.text = text
    1.66 +
    1.67 +    def empty(self):
    1.68 +        return not self.text
    1.69 +
    1.70 +    def __repr__(self):
    1.71 +        return "%s(%r)" % (self.__class__.__name__, self.text)
    1.72 +
    1.73 +class AT_END_OF_TAG(Span):
    1.74 +    transitions = [(None, "", end_of_tag)]
    1.75 +
    1.76 +    def empty(self):
    1.77 +        return False
    1.78 +
    1.79 +    def visit(self, visitor):
    1.80 +        return visitor.at_end_of_tag(self)
    1.81 +
    1.82 +class BETWEEN_TAGS(Span):
    1.83 +    transitions = [("<", "", tag_or_similar)]
    1.84 +
    1.85 +    def visit(self, visitor):
    1.86 +        return visitor.between_tags(self)
    1.87 +
    1.88 +class IN_TAG(Span):
    1.89 +    transitions = [
    1.90 +        ("=", "", at_attribute_value),
    1.91 +        ("/>", "", end_of_standalone_tag),
    1.92 +        (">", "", end_of_tag),
    1.93 +        ]
    1.94 +
    1.95 +    def visit(self, visitor):
    1.96 +        return visitor.in_tag(self)
    1.97 +
    1.98 +class IN_COMMENT(Span):
    1.99 +    transitions = [("-->", "--", end_of_tag)]
   1.100 +
   1.101 +    def visit(self, visitor):
   1.102 +        return visitor.in_comment(self)
   1.103 +
   1.104 +class IN_DIRECTIVE(Span):
   1.105 +    transitions = [(">", "", end_of_tag)]
   1.106 +
   1.107 +    def visit(self, visitor):
   1.108 +        return visitor.in_directive(self)
   1.109 +
   1.110 +class IN_INCLUSION(Span):
   1.111 +    transitions = [("]]>", "]]", end_of_tag)]
   1.112 +
   1.113 +    def visit(self, visitor):
   1.114 +        return visitor.in_inclusion(self)
   1.115 +
   1.116 +class AFTER_ATTRIBUTE_VALUE(Span):
   1.117 +    transitions = [
   1.118 +        ("=", "", at_attribute_value),
   1.119 +        ("/>", "", end_of_standalone_tag),
   1.120 +        (">", "", end_of_tag),
   1.121 +        ]
   1.122 +
   1.123 +    def empty(self):
   1.124 +        return not self.text.strip()
   1.125 +
   1.126 +    def visit(self, visitor):
   1.127 +        return visitor.after_attribute_value(self)
   1.128 +
   1.129 +class AT_ATTRIBUTE_VALUE(Span):
   1.130 +    transitions = [
   1.131 +        ("=", "", at_attribute_value),
   1.132 +        ('"', "", in_dq_attribute_value),
   1.133 +        ("'", "", in_sq_attribute_value),
   1.134 +        ("/>", "", end_of_standalone_tag),
   1.135 +        (">", "", end_of_tag),
   1.136 +        ]
   1.137 +
   1.138 +    def empty(self):
   1.139 +        return not self.text.strip()
   1.140 +
   1.141 +    def visit(self, visitor):
   1.142 +        return visitor.at_attribute_value(self)
   1.143 +
   1.144 +class IN_DQ_ATTRIBUTE_VALUE(Span):
   1.145 +    transitions = [('"', "", after_attribute_value)]
   1.146 +
   1.147 +    def visit(self, visitor):
   1.148 +        return visitor.in_dq_attribute_value(self)
   1.149 +
   1.150 +class IN_SQ_ATTRIBUTE_VALUE(Span):
   1.151 +    transitions = [("'", "", after_attribute_value)]
   1.152 +
   1.153 +    def visit(self, visitor):
   1.154 +        return visitor.in_sq_attribute_value(self)
   1.155 +
   1.156 +
   1.157 +
   1.158 +# Utility functions.
   1.159 +
   1.160 +def find_one(text, pos, choices):
   1.161 +
   1.162 +    """
   1.163 +    Find in 'text' from 'pos' the earliest occurring instance of one of the
   1.164 +    given 'choices', these being a list of (token string, extra string, state)
   1.165 +    tuples.
   1.166 +
   1.167 +    The token string is a token marking the start of the next span, the extra
   1.168 +    string is the portion of the token to be added to the end of the current
   1.169 +    span upon matching, and the state applies to the next span.
   1.170 +
   1.171 +    The associated state, the position of the occurrence, and the position of
   1.172 +    the text following the occurrence are returned as a tuple.
   1.173 +    """
   1.174 +
   1.175 +    next_state = None
   1.176 +    first_pos = None
   1.177 +    first_extra = None
   1.178 +    next_pos = None
   1.179 +
   1.180 +    for token, extra, state in choices:
   1.181 +        if token is None:
   1.182 +            return state, pos, extra, pos
   1.183 +
   1.184 +        found_pos = text.find(token, pos)
   1.185 +
   1.186 +        if found_pos != -1 and (next_state is None or found_pos < first_pos):
   1.187 +            next_state = state
   1.188 +            first_pos = found_pos
   1.189 +            first_extra = extra
   1.190 +            next_pos = found_pos + len(token)
   1.191 +
   1.192 +    return next_state, first_pos, first_extra, next_pos
   1.193 +
   1.194 +
   1.195 +
   1.196 +# Lexical partitioning.
   1.197 +
   1.198 +class Lexer:
   1.199 +    def __init__(self, text):
   1.200 +        self.text = text
   1.201 +        self.state = BETWEEN_TAGS
   1.202 +        self.pos = 0
   1.203 +
   1.204 +    def _end_of_input(self):
   1.205 +        start = self.pos
   1.206 +        self.pos = None
   1.207 +        return self._span(self.text[start:])
   1.208 +
   1.209 +    def _span(self, text):
   1.210 +        return self.state(text)
   1.211 +
   1.212 +    def __iter__(self):
   1.213 +        return self
   1.214 +
   1.215 +    def next(self):
   1.216 +        if self.pos is None:
   1.217 +            raise StopIteration
   1.218 +
   1.219 +        # Obtain details of a state transition: a handler function to determine
   1.220 +        # the next state, and the start and end positions of the token causing
   1.221 +        # the transition.
   1.222 +
   1.223 +        handler, pos, extra, next_pos = find_one(self.text, self.pos, self.state.transitions)
   1.224 +
   1.225 +        if handler is None:
   1.226 +            return self._end_of_input()
   1.227 +
   1.228 +        # Obtain the lexical span and update the state and position.
   1.229 +
   1.230 +        span = self._span(self.text[self.pos:pos] + extra)
   1.231 +
   1.232 +        self.state = handler(self.text, pos)
   1.233 +        self.pos = next_pos
   1.234 +
   1.235 +        return span
   1.236 +
   1.237 +# vim: tabstop=4 expandtab shiftwidth=4