Lichen
pyparser/metaparser.py

Revision 940:6ddce984649b
2021-10-30 Paul Boddie: Fixed expected result in comment.

"""
Makes a parser from a grammar source.

Inspired by Guido van Rossum's pgen2.
"""

import StringIO
import tokenize
import token

from pyparser import parser

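# The grammar source accepted by ParserGenerator below uses pgen's
# EBNF-like notation, as handled by parse_rule and its helpers further
# down: one "NAME: ..." rule per line, '|' separating alternatives,
# '[...]' marking optional parts, '(...)' grouping, and '*' or '+' as
# repeaters.  A hypothetical example of valid input:
#
#   expr: term (('+' | '-') term)*
#   term: NAME | NUMBER
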
class PgenError(Exception):

    def __init__(self, msg, location=None):
        Exception.__init__(self, msg)
        self.location = location

class NFA(object):

    def __init__(self):
        self.arcs = []

    def arc(self, to_state, label=None):
        self.arcs.append((label, to_state))

    def find_unlabeled_states(self, into):
        if self in into:
            return
        into.add(self)
        for label, state in self.arcs:
            if label is None:
                state.find_unlabeled_states(into)

class DFA(object):

    def __init__(self, nfa_set, final_state):
        self.nfas = nfa_set
        self.is_final = final_state in nfa_set
        self.arcs = {}

    def arc(self, next, label):
        self.arcs[label] = next

    def unify_state(self, old, new):
        for label, state in self.arcs.iteritems():
            if state is old:
                self.arcs[label] = new

    def __repr__(self):
        return "<DFA arcs=%r>" % self.arcs

    def __eq__(self, other):
        if not isinstance(other, DFA):
            # This shouldn't really happen.
            return NotImplemented
        if other.is_final != self.is_final:
            return False
        if len(self.arcs) != len(other.arcs):
            return False
        for label, state in self.arcs.iteritems():
            try:
                other_state = other.arcs[label]
            except KeyError:
                return False
            else:
                if other_state is not state:
                    return False
        return True

def nfa_to_dfa(start, end):
    """Convert an NFA into a list of DFA states.

    Each DFA is initially a set of NFA states without labels.  We start with the
    DFA for the start NFA.  Then we add labeled arcs to it pointing to another
    set of NFAs (the next state).  Finally, we do the same thing to every DFA
    that is found and return the list of states.
    """
    base_nfas = set()
    start.find_unlabeled_states(base_nfas)
    state_stack = [DFA(base_nfas, end)]
    for state in state_stack:
        arcs = {}
        for nfa in state.nfas:
            for label, sub_nfa in nfa.arcs:
                if label is not None:
                    sub_nfa.find_unlabeled_states(arcs.setdefault(label, set()))
        for label, nfa_set in arcs.iteritems():
            for st in state_stack:
                if st.nfas == nfa_set:
                    break
            else:
                st = DFA(nfa_set, end)
                state_stack.append(st)
            state.arc(st, label)
    return state_stack

def simplify_dfa(dfa):
    # Repeatedly merge states that compare equal until a fixed point is
    # reached; unify_state redirects arcs from the discarded duplicate.
    changed = True
    while changed:
        changed = False
        for i, state in enumerate(dfa):
            for j in xrange(i + 1, len(dfa)):
                other_state = dfa[j]
                if state == other_state:
                    del dfa[j]
                    for sub_state in dfa:
                        sub_state.unify_state(other_state, state)
                    changed = True
                    break

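# A minimal sketch (hypothetical states and labels) of how the classes
# above cooperate; parse_rule performs these same steps for each grammar
# rule, e.g. one equivalent to "r: 'a' 'b'":
#
#   start, mid, end = NFA(), NFA(), NFA()
#   start.arc(mid, "'a'")          # labeled arc: consume 'a'
#   mid.arc(end, "'b'")            # labeled arc: consume 'b'
#   states = nfa_to_dfa(start, end)
#   simplify_dfa(states)
#   # states[0] is the initial DFA state; chasing arcs by label leads
#   # to a state whose is_final flag is set.
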
class ParserGenerator(object):
    """NOT_RPYTHON"""
    # The docstring above is a PyPy convention marking code that is not
    # meant to be translated by RPython.

    def __init__(self, grammar_source):
        self.start_symbol = None
        self.dfas = {}
        stream = StringIO.StringIO(grammar_source)
        self.token_stream = tokenize.generate_tokens(stream.readline)
        self.parse()
        self.first = {}
        self.add_first_sets()

    def build_grammar(self, grammar_cls):
        gram = grammar_cls()
        gram.start = self.start_symbol
        names = self.dfas.keys()
        names.sort()
        names.remove(self.start_symbol)
        names.insert(0, self.start_symbol)
        # First, build symbol and id mappings.
        for name in names:
            i = 256 + len(gram.symbol_ids)
            gram.symbol_ids[name] = i
            gram.symbol_names[i] = name
        # Then, iterate through again and finalize labels.
        for name in names:
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in state.arcs.iteritems():
                    arcs.append((self.make_label(gram, label), dfa.index(next)))
                states.append((arcs, state.is_final))
            gram.dfas.append((states, self.make_first(gram, name)))
            assert len(gram.dfas) - 1 == gram.symbol_ids[name] - 256
        gram.start = gram.symbol_ids[self.start_symbol]
        return gram

    def make_label(self, gram, label):
        label_index = len(gram.labels)
        if label[0].isalpha():
            # Either a symbol or a token.
            if label in gram.symbol_ids:
                if label in gram.symbol_to_label:
                    return gram.symbol_to_label[label]
                else:
                    gram.labels.append(gram.symbol_ids[label])
                    gram.symbol_to_label[label] = label_index
                    return label_index
            elif label.isupper():
                token_index = gram.TOKENS[label]
                if token_index in gram.token_ids:
                    return gram.token_ids[token_index]
                else:
                    gram.labels.append(token_index)
                    gram.token_ids[token_index] = label_index
                    return label_index
            else:
                # Probably a rule without a definition.
                raise PgenError("no such rule: %r" % (label,))
        else:
            # A keyword or operator.
            value = label.strip("\"'")
            if value[0].isalpha():
                if value in gram.keyword_ids:
                    return gram.keyword_ids[value]
                else:
                    gram.labels.append(gram.KEYWORD_TOKEN)
                    gram.keyword_ids[value] = label_index
                    return label_index
            else:
                try:
                    token_index = gram.OPERATOR_MAP[value]
                except KeyError:
                    raise PgenError("no such operator: %r" % (value,))
                if token_index in gram.token_ids:
                    return gram.token_ids[token_index]
                else:
                    gram.labels.append(token_index)
                    gram.token_ids[token_index] = label_index
                    return label_index

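    # For illustration with a hypothetical grammar: make_label(gram, "expr")
    # records a symbol label, make_label(gram, "NAME") records a token
    # label via gram.TOKENS, and make_label(gram, "'if'") and
    # make_label(gram, "'+'") record a keyword and an operator label
    # respectively.
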
    def make_first(self, gram, name):
        original_firsts = self.first[name]
        firsts = dict()
        for label in original_firsts:
            firsts[self.make_label(gram, label)] = None
        return firsts

    def add_first_sets(self):
        for name, dfa in self.dfas.iteritems():
            if name not in self.first:
                self.get_first(name, dfa)

    def get_first(self, name, dfa):
        # A None entry marks the rule as being processed, so that a rule
        # referring back to itself in first position is caught as
        # recursion below.
        self.first[name] = None
        state = dfa[0]
        all_labels = set()
        overlap_check = {}
        for label, sub_state in state.arcs.iteritems():
            if label in self.dfas:
                if label in self.first:
                    new_labels = self.first[label]
                    if new_labels is None:
                        raise PgenError("recursion in rule: %r" % (name,))
                else:
                    new_labels = self.get_first(label, self.dfas[label])
                all_labels.update(new_labels)
                overlap_check[label] = new_labels
            else:
                all_labels.add(label)
                overlap_check[label] = set((label,))
        inverse = {}
        for label, their_first in overlap_check.iteritems():
            for sub_label in their_first:
                if sub_label in inverse:
                    raise PgenError("ambiguous symbol with label %s"
                                    % (label,))
                inverse[sub_label] = label
        self.first[name] = all_labels
        return all_labels

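    # For example (hypothetical rules): with "term: NAME | NUMBER" the
    # first set of "term" is {"NAME", "NUMBER"}.  A rule referring to
    # itself in first position raises the "recursion" error above, and
    # alternatives whose first sets overlap raise the "ambiguous symbol"
    # error, since the parser must choose an arc with one token of
    # lookahead.
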
    def expect(self, token_type, value=None):
        if token_type != self.type:
            expected = token.tok_name[token_type]
            got = token.tok_name[self.type]
            raise PgenError("expected token %s but got %s" % (expected, got),
                            self.location)
        current_value = self.value
        if value is not None:
            if value != current_value:
                msg = "expected %r but got %r" % (value, current_value)
                raise PgenError(msg, self.location)
        self.advance_token()
        return current_value

    def test_token(self, token_type, value):
        if self.type == token_type and self.value == value:
            return True
        return False

    def advance_token(self):
        data = self.token_stream.next()
        # Ignore comments and non-logical newlines.
        while data[0] in (tokenize.NL, tokenize.COMMENT):
            data = self.token_stream.next()
        self.type, self.value = data[:2]
        self.location = data[2:]

    def parse(self):
        self.advance_token()
        while self.type != token.ENDMARKER:
            # Skip over newlines between rules.
            while self.type == token.NEWLINE:
                self.advance_token()
            name, start_state, end_state = self.parse_rule()
            dfa = nfa_to_dfa(start_state, end_state)
            simplify_dfa(dfa)
            self.dfas[name] = dfa
            if self.start_symbol is None:
                self.start_symbol = name

    def parse_rule(self):
        # RULE: NAME ':' ALTERNATIVES
        name = self.expect(token.NAME)
        self.expect(token.OP, ":")
        start_state, end_state = self.parse_alternatives()
        self.expect(token.NEWLINE)
        return name, start_state, end_state

    def parse_alternatives(self):
        # ALTERNATIVES: ITEMS ('|' ITEMS)*
        first_state, end_state = self.parse_items()
        if self.test_token(token.OP, "|"):
            # Link all alternatives into an enclosing set of states.
            enclosing_start_state = NFA()
            enclosing_end_state = NFA()
            enclosing_start_state.arc(first_state)
            end_state.arc(enclosing_end_state)
            while self.test_token(token.OP, "|"):
                self.advance_token()
                sub_start_state, sub_end_state = self.parse_items()
                enclosing_start_state.arc(sub_start_state)
                sub_end_state.arc(enclosing_end_state)
            first_state = enclosing_start_state
            end_state = enclosing_end_state
        return first_state, end_state

    def parse_items(self):
        # ITEMS: ITEM+
        first_state, end_state = self.parse_item()
        while self.type in (token.STRING, token.NAME) or \
                           self.test_token(token.OP, "(") or \
                           self.test_token(token.OP, "["):
            sub_first_state, new_end_state = self.parse_item()
            end_state.arc(sub_first_state)
            end_state = new_end_state
        return first_state, end_state

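    # For an alternative-bearing rule such as the hypothetical
    # "r: 'a' | 'b'", parse_alternatives returns an enclosing pair of
    # states with unlabeled arcs fanning out to each alternative's NFA
    # and back in again, which nfa_to_dfa later collapses.
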
    def parse_item(self):
        # ITEM: '[' ALTERNATIVES ']' | ATOM ['+' | '*']
        if self.test_token(token.OP, "["):
            self.advance_token()
            start_state, end_state = self.parse_alternatives()
            self.expect(token.OP, "]")
            # Bypass the rule if this is optional.
            start_state.arc(end_state)
            return start_state, end_state
        else:
            atom_state, next_state = self.parse_atom()
            # Check for a repeater.
            if self.type == token.OP and self.value in ("+", "*"):
                next_state.arc(atom_state)
                repeat = self.value
                self.advance_token()
                if repeat == "*":
                    # Optionally repeated
                    return atom_state, atom_state
                else:
                    # Required
                    return atom_state, next_state
            else:
                return atom_state, next_state

    def parse_atom(self):
        # ATOM: '(' ALTERNATIVES ')' | NAME | STRING
        if self.test_token(token.OP, "("):
            self.advance_token()
            rule = self.parse_alternatives()
            self.expect(token.OP, ")")
            return rule
        elif self.type in (token.NAME, token.STRING):
            atom_state = NFA()
            next_state = NFA()
            atom_state.arc(next_state, self.value)
            self.advance_token()
            return atom_state, next_state
        else:
            invalid = token.tok_name[self.type]
            raise PgenError("unexpected token: %s" % (invalid,),
                            self.location)
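
# A minimal usage sketch.  This is hypothetical: grammar_cls must be a
# parser.Grammar subclass providing the TOKENS, OPERATOR_MAP and
# KEYWORD_TOKEN attributes that make_label consults (as the Python
# grammar class built on pyparser does).
#
#   pgen = ParserGenerator(grammar_source)
#   gram = pgen.build_grammar(grammar_cls)
#   # gram.start now holds the numeric symbol id of the first rule.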