Lichen

Annotated pyparser/parser.py

775:c8ba74a474eb
2017-03-25 Paul Boddie Fixed method name.
paul@437 1
"""
paul@437 2
A CPython inspired RPython parser.
paul@437 3
"""
paul@437 4
paul@437 5
paul@437 6
class Grammar(object):
    """
    Base Grammar object.

    Pass this to ParserGenerator.build_grammar to fill it with useful values for
    the Parser.

    Tables read by Parser in this file:

    * dfas: list indexed by ``symbol id - 256``; each entry is a
      ``(states, first)`` pair, where ``states[i]`` is ``(arcs, is_accepting)``
      and ``arcs`` is a sequence of ``(label_index, next_state)`` pairs.
    * labels: maps a label index to a symbol/token id.
    * keyword_ids: maps a keyword's text to its label index.
    * token_ids: maps a token type to its label index.
    * start: id of the start symbol; -1 means "not set".

    NOTE(review): Parser.classify also reads ``KEYWORD_TOKEN`` from the
    grammar; it is not defined here, so presumably a subclass provides it.
    """

    def __init__(self):
        # symbol_ids, symbol_names and symbol_to_label are not read by the
        # Parser in this file (see _freeze_, which discards them);
        # presumably they are only needed while the grammar is being built.
        self.symbol_ids = {}
        self.symbol_names = {}
        self.symbol_to_label = {}
        self.keyword_ids = {}
        self.dfas = []
        self.labels = [0]
        self.token_ids = {}
        self.start = -1

    def shared_copy(self):
        """Return a new grammar of the same class sharing this one's tables.

        The tables are shared by reference, not copied, so mutating one of
        them in place affects both grammars; rebind an attribute on the copy
        to diverge.  symbol_to_label is not shared: the copy keeps the empty
        dictionary created by __init__.
        """
        new = self.__class__()
        new.symbol_ids = self.symbol_ids
        # This was previously assigned to the misspelled "symbols_names",
        # which left the copy with an empty symbol_names table.
        new.symbol_names = self.symbol_names
        new.keyword_ids = self.keyword_ids
        new.dfas = self.dfas
        new.labels = self.labels
        new.token_ids = self.token_ids
        # Carry the start symbol over so Parser.prepare() can use the copy
        # without an explicit start argument.
        new.start = self.start
        return new

    def _freeze_(self):
        """Drop the attributes not used in parsing and return True.

        Calling this more than once is harmless.
        """
        # Remove each attribute independently so that one which is already
        # missing does not prevent the others from being removed.
        for name in ("symbol_to_label", "symbol_names", "symbol_ids"):
            try:
                delattr(self, name)
            except AttributeError:
                pass
        return True
paul@437 43
paul@437 44
paul@437 45
class Node(object):
    """Abstract base class of parse tree nodes.

    A node only stores its ``type``.  Subclasses supply equality, child
    access and position information; the defaults here describe a node
    with no value and no children.
    """

    __slots__ = ("type", )

    def __init__(self, type):
        self.type = type

    def __eq__(self, other):
        # Equality is only meaningful for concrete node classes.
        raise NotImplementedError("abstract base class")

    def __ne__(self, other):
        # Defined explicitly in terms of __eq__ so that != is always the
        # negation of ==.
        return not self == other

    def get_value(self):
        """Return the node's value; None unless a subclass overrides this."""
        return None

    def get_child(self, i):
        """Return the child at index i (implemented by subclasses)."""
        raise NotImplementedError("abstract base class")

    def num_children(self):
        """Return the number of children; 0 unless overridden."""
        return 0

    def append_child(self, child):
        """Add child as the last child (implemented by subclasses)."""
        raise NotImplementedError("abstract base class")

    def get_lineno(self):
        """Return the node's line number (implemented by subclasses)."""
        raise NotImplementedError("abstract base class")

    def get_column(self):
        """Return the node's column (implemented by subclasses)."""
        raise NotImplementedError("abstract base class")
paul@437 75
paul@437 76
paul@437 77
class Terminal(Node):
    """Leaf node holding a token's type, value and source position."""

    __slots__ = ("value", "lineno", "column")

    def __init__(self, type, value, lineno, column):
        Node.__init__(self, type)
        self.value = value
        self.lineno = lineno
        self.column = column

    def __repr__(self):
        return "Terminal(type=%s, value=%r)" % (self.type, self.value)

    def __eq__(self, other):
        # For tests.
        # Only the exact class, the type and the value are compared; the
        # line and column are deliberately left out of the comparison.
        return (type(self) == type(other) and
                self.type == other.type and
                self.value == other.value)

    def get_value(self):
        """Return the token's value."""
        return self.value

    def get_lineno(self):
        """Return the line number recorded for the token."""
        return self.lineno

    def get_column(self):
        """Return the column recorded for the token."""
        return self.column
paul@437 102
paul@437 103
paul@437 104
class AbstractNonterminal(Node):
    """Shared behaviour of interior nodes, written in terms of the child API.

    Subclasses choose how children are stored and provide get_child() and
    num_children().
    """

    __slots__ = ()

    def get_lineno(self):
        """Return the line number of the first child."""
        return self.get_child(0).get_lineno()

    def get_column(self):
        """Return the column of the first child."""
        return self.get_child(0).get_column()

    def __eq__(self, other):
        # For tests.
        # Any two AbstractNonterminal instances compare equal when their
        # types match and their children are pairwise equal, regardless of
        # which concrete subclass stores the children.
        if not isinstance(other, AbstractNonterminal):
            return False
        if self.type != other.type:
            return False
        if self.num_children() != other.num_children():
            return False
        for i in range(self.num_children()):
            if self.get_child(i) != other.get_child(i):
                return False
        return True
paul@437 126
paul@437 127
paul@437 128
class Nonterminal(AbstractNonterminal):
    """Interior node whose children are kept in a list."""

    __slots__ = ("_children", )

    def __init__(self, type, children):
        Node.__init__(self, type)
        # The list is stored as given, not copied; append_child() mutates
        # this same list object.
        self._children = children

    def __repr__(self):
        return "Nonterminal(type=%s, children=%r)" % (self.type, self._children)

    def get_child(self, i):
        """Return the child at index i, using list indexing."""
        return self._children[i]

    def num_children(self):
        """Return the number of children."""
        return len(self._children)

    def append_child(self, child):
        """Append child to the list of children."""
        self._children.append(child)
paul@437 145
paul@437 146
paul@437 147
class Nonterminal1(AbstractNonterminal):
    """Interior node with exactly one child, stored without a list.

    Parser.pop() swaps a finished single-child Nonterminal for one of
    these as a more compact representation.
    """

    __slots__ = ("_child", )

    def __init__(self, type, child):
        Node.__init__(self, type)
        self._child = child

    def __repr__(self):
        # The text intentionally mirrors Nonterminal's representation of a
        # one-element child list.
        return "Nonterminal(type=%s, children=[%r])" % (self.type, self._child)

    def get_child(self, i):
        """Return the only child; i must be 0 or -1."""
        assert i == 0 or i == -1
        return self._child

    def num_children(self):
        """Return 1: there is always exactly one child."""
        return 1

    def append_child(self, child):
        # The node is created only once its child is known, so nothing is
        # expected to append to it afterwards.
        assert 0, "should be unreachable"
paul@437 165
paul@437 166
paul@438 167
class NonterminalEnc(Nonterminal1):
    """Single-child interior node that additionally carries an encoding."""

    def __init__(self, type, child, encoding):
        Nonterminal1.__init__(self, type, child)
        self.encoding = encoding

    def __repr__(self):
        return "NonterminalEnc(type=%s, child=%r, encoding=%r)" % (self.type, self._child, self.encoding)
paul@438 174
paul@437 175
paul@437 176
class ParseError(Exception):
    """Raised by Parser when a token cannot be accepted.

    Attributes:
        msg: short description; the Parser in this file uses "invalid token",
            "bad input" and "too much input".
        token_type, value: type and value of the offending token.
        lineno, column: position passed in for the offending token.
        line: the ``line`` argument passed alongside the token.
        expected: id of the single symbol that would have been accepted, or
            -1 when no single expectation is known.
    """

    def __init__(self, msg, token_type, value, lineno, column, line,
                 expected=-1):
        self.msg = msg
        self.token_type = token_type
        self.value = value
        self.lineno = lineno
        self.column = column
        self.line = line
        self.expected = expected

    def __str__(self):
        # NOTE: the text says "ParserError" although the class is named
        # ParseError; it is left unchanged because callers may rely on it.
        return "ParserError(%s, %r)" % (self.token_type, self.value)
paul@437 190
paul@437 191
paul@437 192
class Parser(object):
    """Table-driven parser which builds a tree of Terminal/Nonterminal nodes.

    The parser keeps a stack of ``(dfa, state_index, node)`` entries, one
    per nonterminal currently being recognised.  Call prepare() once, then
    feed tokens to add_token() until it returns True; the finished tree is
    then available as ``root``.
    """

    def __init__(self, grammar):
        self.grammar = grammar
        self.root = None
        self.stack = None

    def prepare(self, start=-1):
        """Setup the parser for parsing.

        Takes the starting symbol as an argument.  When it is left at -1 the
        grammar's own start symbol is used.
        """
        if start == -1:
            start = self.grammar.start
        self.root = None
        current_node = Nonterminal(start, [])
        self.stack = []
        # Symbol ids are offset by 256 relative to their position in the
        # list of DFAs.
        self.stack.append((self.grammar.dfas[start - 256], 0, current_node))

    def add_token(self, token_type, value, lineno, column, line):
        """Feed one token to the parser.

        Returns True when the token completed the parse (the stack became
        empty and ``root`` was set) and False when more tokens are needed.

        Raises ParseError with "invalid token" (from classify), "bad input"
        when no arc accepts the token in a non-accepting state, or
        "too much input" when the start symbol has already been completed.
        """
        label_index = self.classify(token_type, value, lineno, column, line)
        sym_id = 0 # for the annotator
        while True:
            dfa, state_index, node = self.stack[-1]
            states, first = dfa
            arcs, is_accepting = states[state_index]
            for i, next_state in arcs:
                sym_id = self.grammar.labels[i]
                if label_index == i:
                    # The token matches this arc's label directly, so shift
                    # it as a terminal.
                    self.shift(next_state, token_type, value, lineno, column)
                    state = states[next_state]
                    # While the only possible action is to accept, pop nodes off
                    # the stack.
                    while state[1] and not state[0]:
                        self.pop()
                        if not self.stack:
                            # Parsing is done.
                            return True
                        dfa, state_index, node = self.stack[-1]
                        state = dfa[0][state_index]
                    return False
                elif sym_id >= 256:
                    # Ids from 256 upwards have a DFA of their own, i.e. the
                    # arc is labelled with a nonterminal.
                    sub_node_dfa = self.grammar.dfas[sym_id - 256]
                    # Check if this token can start a child node.
                    if label_index in sub_node_dfa[1]:
                        self.push(sub_node_dfa, next_state, sym_id, lineno,
                                  column)
                        # Leave the for loop (skipping its else clause) and
                        # retry the token against the newly pushed DFA.
                        break
            else:
                # We failed to find any arcs to another state, so unless this
                # state is accepting, it's invalid input.
                if is_accepting:
                    # Finish the current node, then retry the token in the
                    # parent on the next pass of the outer loop.
                    self.pop()
                    if not self.stack:
                        raise ParseError("too much input", token_type, value,
                                         lineno, column, line)
                else:
                    # If only one possible input would satisfy, attach it to the
                    # error.
                    if len(arcs) == 1:
                        # sym_id still holds the label of that single arc.
                        expected = sym_id
                    else:
                        expected = -1
                    raise ParseError("bad input", token_type, value, lineno,
                                     column, line, expected)

    def classify(self, token_type, value, lineno, column, line):
        """Find the label for a token.

        Keyword tokens are looked up by value first; if that fails, or for
        any other token, the token type is looked up.  Raises ParseError
        ("invalid token") when the token type has no label.
        """
        if token_type == self.grammar.KEYWORD_TOKEN:
            label_index = self.grammar.keyword_ids.get(value, -1)
            if label_index != -1:
                return label_index
        label_index = self.grammar.token_ids.get(token_type, -1)
        if label_index == -1:
            raise ParseError("invalid token", token_type, value, lineno, column,
                             line)
        return label_index

    def shift(self, next_state, token_type, value, lineno, column):
        """Shift a terminal and prepare for the next state.

        A Terminal is created for the token and appended to the node on top
        of the stack, whose DFA is moved to next_state.
        """
        dfa, state, node = self.stack[-1]
        new_node = Terminal(token_type, value, lineno, column)
        node.append_child(new_node)
        self.stack[-1] = (dfa, next_state, node)

    def push(self, next_dfa, next_state, node_type, lineno, column):
        """Push a nonterminal and adjust the current state.

        The entry on top of the stack is moved to next_state, the state it
        resumes in once the child is complete, and a new empty Nonterminal
        of node_type is pushed with next_dfa in state 0.  lineno and column
        are accepted but not used.
        """
        dfa, state, node = self.stack[-1]
        new_node = Nonterminal(node_type, [])
        self.stack[-1] = (dfa, next_state, node)
        self.stack.append((next_dfa, 0, new_node))

    def pop(self):
        """Pop an entry off the stack and make its node a child of the last.

        When the popped entry was the last one, its node becomes ``root``
        instead.
        """
        dfa, state, node = self.stack.pop()
        if self.stack:
            # we are now done with node, so we can store it more efficiently if
            # it has just one child
            if node.num_children() == 1:
                node = Nonterminal1(node.type, node.get_child(0))
            self.stack[-1][2].append_child(node)
        else:
            self.root = node