paul@437 | 1 | """ |
paul@437 | 2 | Makes a parser from a grammar source. |
paul@437 | 3 | |
paul@437 | 4 | Inspired by Guido van Rossum's pgen2. |
paul@437 | 5 | """ |
paul@437 | 6 | |
paul@437 | 7 | import StringIO |
paul@437 | 8 | import tokenize |
paul@437 | 9 | import token |
paul@437 | 10 | |
paul@437 | 11 | from pyparser import parser |
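
# Typical use, sketched here as a comment (parser.Grammar's exact interface
# lives in pyparser.parser; the attribute set build_grammar assumes is
# exercised by the runnable sketch in the __main__ block at the end of this
# file):
#
#     pgen = ParserGenerator(grammar_source)
#     gram = pgen.build_grammar(parser.Grammar)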


class PgenError(Exception):

    def __init__(self, msg, location=None):
        Exception.__init__(self, msg)
        self.location = location


class NFA(object):

    def __init__(self):
        self.arcs = []

    def arc(self, to_state, label=None):
        # A label of None marks an epsilon (no input) transition.
        self.arcs.append((label, to_state))

    def find_unlabeled_states(self, into):
        # Collect this state's epsilon closure into the given set.
        if self in into:
            return
        into.add(self)
        for label, state in self.arcs:
            if label is None:
                state.find_unlabeled_states(into)
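
# For illustration (not executed): the alternatives "a" | "b" correspond to
# an NFA fragment in which unlabeled arcs act as epsilon transitions; this
# mirrors what parse_alternatives below builds:
#
#     start, end = NFA(), NFA()
#     for literal in ("a", "b"):
#         inner_start, inner_end = NFA(), NFA()
#         inner_start.arc(inner_end, literal)  # labeled arc consumes a token
#         start.arc(inner_start)               # epsilon into the alternative
#         inner_end.arc(end)                   # epsilon back out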


class DFA(object):

    def __init__(self, nfa_set, final_state):
        # A DFA state is a set of NFA states; it is final if it contains the
        # end state of the rule's NFA.
        self.nfas = nfa_set
        self.is_final = final_state in nfa_set
        self.arcs = {}

    def arc(self, next, label):
        self.arcs[label] = next

    def unify_state(self, old, new):
        # Redirect any arc pointing at old to point at new (used when
        # simplify_dfa merges two equal states).
        for label, state in self.arcs.iteritems():
            if state is old:
                self.arcs[label] = new

    def __repr__(self):
        return "<DFA arcs=%r>" % self.arcs

    def __eq__(self, other):
        if not isinstance(other, DFA):
            # This shouldn't really happen.
            return NotImplemented
        if other.is_final != self.is_final:
            return False
        if len(self.arcs) != len(other.arcs):
            return False
        for label, state in self.arcs.iteritems():
            try:
                other_state = other.arcs[label]
            except KeyError:
                return False
            else:
                if other_state is not state:
                    return False
        return True


def nfa_to_dfa(start, end):
    """Convert an NFA to a DFA (a list of DFA states).

    Each DFA state is initially a set of NFA states without labeled arcs
    between them.  We start with the DFA state for the start NFA's epsilon
    closure, then add labeled arcs from it to the closures they lead to (the
    next states).  The same is done for every newly discovered DFA state,
    and the complete list of states is returned.
    """
    base_nfas = set()
    start.find_unlabeled_states(base_nfas)
    state_stack = [DFA(base_nfas, end)]
    for state in state_stack:
        # state_stack grows while we iterate over it; the loop acts as a
        # worklist of DFA states still to be processed.
        arcs = {}
        for nfa in state.nfas:
            for label, sub_nfa in nfa.arcs:
                if label is not None:
                    sub_nfa.find_unlabeled_states(arcs.setdefault(label, set()))
        for label, nfa_set in arcs.iteritems():
            # Reuse an existing DFA state for this NFA set if there is one.
            for st in state_stack:
                if st.nfas == nfa_set:
                    break
            else:
                st = DFA(nfa_set, end)
                state_stack.append(st)
            state.arc(st, label)
    return state_stack
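
# Continuing the "a" | "b" illustration above: nfa_to_dfa first takes the
# epsilon closure of `start` (start plus both inner_start states), then adds
# one labeled arc per distinct label ("a" and "b"), each leading to a closure
# that contains `end` and is therefore final:
#
#     states = nfa_to_dfa(start, end)
#     simplify_dfa(states)  # merges the two identical final states into one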


def simplify_dfa(dfa):
    # Repeatedly merge equal states (same finality, same labeled arcs to the
    # same targets) until no more merges apply.  This is a simple fixed-point
    # unification rather than a full DFA minimization, but it suffices here.
    changed = True
    while changed:
        changed = False
        for i, state in enumerate(dfa):
            for j in xrange(i + 1, len(dfa)):
                other_state = dfa[j]
                if state == other_state:
                    del dfa[j]
                    for sub_state in dfa:
                        sub_state.unify_state(other_state, state)
                    changed = True
                    break


class ParserGenerator(object):
    """NOT_RPYTHON"""

    def __init__(self, grammar_source):
        self.start_symbol = None
        self.dfas = {}
        stream = StringIO.StringIO(grammar_source)
        self.token_stream = tokenize.generate_tokens(stream.readline)
        self.parse()
        self.first = {}
        self.add_first_sets()

    def build_grammar(self, grammar_cls):
        gram = grammar_cls()
        gram.start = self.start_symbol
        names = self.dfas.keys()
        names.sort()
        # The start symbol always gets the first symbol id (256).
        names.remove(self.start_symbol)
        names.insert(0, self.start_symbol)
        # First, build symbol and id mappings.
        for name in names:
            i = 256 + len(gram.symbol_ids)
            gram.symbol_ids[name] = i
            gram.symbol_names[i] = name
        # Then, iterate through again and finalize labels.
        for name in names:
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in state.arcs.iteritems():
                    arcs.append((self.make_label(gram, label), dfa.index(next)))
                states.append((arcs, state.is_final))
            gram.dfas.append((states, self.make_first(gram, name)))
            assert len(gram.dfas) - 1 == gram.symbol_ids[name] - 256
        gram.start = gram.symbol_ids[self.start_symbol]
        return gram
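
    # For a two-rule grammar  eval_input: expr NEWLINE  /  expr: NUMBER,
    # build_grammar assigns symbol ids 256 (eval_input, the start symbol)
    # and 257 (expr), and gram.dfas[0] and gram.dfas[1] hold their state
    # tables in the same order.  A runnable sketch along these lines lives
    # in the __main__ block at the end of the file.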

    def make_label(self, gram, label):
        # label_index is the index this label will get if it turns out to be
        # new; already-registered labels return their existing index instead.
        label_index = len(gram.labels)
        if label[0].isalpha():
            # Either a symbol or a token.
            if label in gram.symbol_ids:
                if label in gram.symbol_to_label:
                    return gram.symbol_to_label[label]
                else:
                    gram.labels.append(gram.symbol_ids[label])
                    gram.symbol_to_label[label] = label_index
                    return label_index
            elif label.isupper():
                token_index = gram.TOKENS[label]
                if token_index in gram.token_ids:
                    return gram.token_ids[token_index]
                else:
                    gram.labels.append(token_index)
                    gram.token_ids[token_index] = label_index
                    return label_index
            else:
                # Probably a rule without a definition.
                raise PgenError("no such rule: %r" % (label,))
        else:
            # A keyword or operator.
            value = label.strip("\"'")
            if value[0].isalpha():
                if value in gram.keyword_ids:
                    return gram.keyword_ids[value]
                else:
                    gram.labels.append(gram.KEYWORD_TOKEN)
                    gram.keyword_ids[value] = label_index
                    return label_index
            else:
                try:
                    token_index = gram.OPERATOR_MAP[value]
                except KeyError:
                    raise PgenError("no such operator: %r" % (value,))
                if token_index in gram.token_ids:
                    return gram.token_ids[token_index]
                else:
                    gram.labels.append(token_index)
                    gram.token_ids[token_index] = label_index
                    return label_index
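
    # For example, make_label(gram, "NAME") registers the NAME token once and
    # returns its label index; make_label(gram, "'if'") strips the quotes and
    # registers the keyword "if" under gram.KEYWORD_TOKEN.  (Illustrative
    # only; "if" need not be a keyword of the grammar actually fed in.)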

    def make_first(self, gram, name):
        original_firsts = self.first[name]
        firsts = {}
        for label in original_firsts:
            firsts[self.make_label(gram, label)] = None
        return firsts

    def add_first_sets(self):
        for name, dfa in self.dfas.iteritems():
            if name not in self.first:
                self.get_first(name, dfa)

    def get_first(self, name, dfa):
        # None marks a computation in progress, so left recursion in the
        # grammar is caught below instead of looping forever.
        self.first[name] = None
        state = dfa[0]
        all_labels = set()
        overlap_check = {}
        for label, sub_state in state.arcs.iteritems():
            if label in self.dfas:
                if label in self.first:
                    new_labels = self.first[label]
                    if new_labels is None:
                        raise PgenError("recursion in rule: %r" % (name,))
                else:
                    new_labels = self.get_first(label, self.dfas[label])
                all_labels.update(new_labels)
                overlap_check[label] = new_labels
            else:
                all_labels.add(label)
                overlap_check[label] = set((label,))
        # Two alternatives of the same rule must not start with the same
        # label, or the parser could not decide which one to follow.
        inverse = {}
        for label, their_first in overlap_check.iteritems():
            for sub_label in their_first:
                if sub_label in inverse:
                    raise PgenError("ambiguous symbol with label %s"
                                    % (label,))
                inverse[sub_label] = label
        self.first[name] = all_labels
        return all_labels
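
    # For example, given  expr: term ('+' term)*  and  term: NUMBER, the
    # start state of expr's DFA has a single arc labeled "term"; "term"
    # names another rule, so its first set is computed recursively and
    # self.first["expr"] becomes set(["NUMBER"]).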

    def expect(self, token_type, value=None):
        if token_type != self.type:
            expected = token.tok_name[token_type]
            got = token.tok_name[self.type]
            raise PgenError("expected token %s but got %s" % (expected, got),
                            self.location)
        current_value = self.value
        if value is not None:
            if value != current_value:
                msg = "expected %r but got %r" % (value, current_value)
                raise PgenError(msg, self.location)
        self.advance_token()
        return current_value

    def test_token(self, token_type, value):
        return self.type == token_type and self.value == value

    def advance_token(self):
        data = self.token_stream.next()
        # Ignore comments and non-logical newlines.
        while data[0] in (tokenize.NL, tokenize.COMMENT):
            data = self.token_stream.next()
        self.type, self.value = data[:2]
        self.location = data[2:]

    def parse(self):
        self.advance_token()
        while self.type != token.ENDMARKER:
            # Skip over blank lines between rules.
            while self.type == token.NEWLINE:
                self.advance_token()
            name, start_state, end_state = self.parse_rule()
            dfa = nfa_to_dfa(start_state, end_state)
            simplify_dfa(dfa)
            self.dfas[name] = dfa
            # The first rule in the grammar is the start symbol.
            if self.start_symbol is None:
                self.start_symbol = name

    def parse_rule(self):
        # RULE: NAME ':' ALTERNATIVES
        name = self.expect(token.NAME)
        self.expect(token.OP, ":")
        start_state, end_state = self.parse_alternatives()
        self.expect(token.NEWLINE)
        return name, start_state, end_state

    def parse_alternatives(self):
        # ALTERNATIVES: ITEMS ('|' ITEMS)*
        first_state, end_state = self.parse_items()
        if self.test_token(token.OP, "|"):
            # Link all alternatives into an enclosing set of states.
            enclosing_start_state = NFA()
            enclosing_end_state = NFA()
            enclosing_start_state.arc(first_state)
            end_state.arc(enclosing_end_state)
            while self.test_token(token.OP, "|"):
                self.advance_token()
                sub_start_state, sub_end_state = self.parse_items()
                enclosing_start_state.arc(sub_start_state)
                sub_end_state.arc(enclosing_end_state)
            first_state = enclosing_start_state
            end_state = enclosing_end_state
        return first_state, end_state

    def parse_items(self):
        # ITEMS: ITEM+
        first_state, end_state = self.parse_item()
        while (self.type in (token.STRING, token.NAME) or
               self.test_token(token.OP, "(") or
               self.test_token(token.OP, "[")):
            sub_first_state, new_end_state = self.parse_item()
            end_state.arc(sub_first_state)
            end_state = new_end_state
        return first_state, end_state

    def parse_item(self):
        # ITEM: '[' ALTERNATIVES ']' | ATOM ['+' | '*']
        if self.test_token(token.OP, "["):
            self.advance_token()
            start_state, end_state = self.parse_alternatives()
            self.expect(token.OP, "]")
            # Bypass the rule if this is optional.
            start_state.arc(end_state)
            return start_state, end_state
        else:
            atom_state, next_state = self.parse_atom()
            # Check for a repeater.
            if self.type == token.OP and self.value in ("+", "*"):
                next_state.arc(atom_state)
                repeat = self.value
                self.advance_token()
                if repeat == "*":
                    # Zero or more: the atom's entry state doubles as the
                    # exit, so the loop can be skipped entirely.
                    return atom_state, atom_state
                else:
                    # One or more: the atom must be passed through at least
                    # once.
                    return atom_state, next_state
            else:
                return atom_state, next_state

    def parse_atom(self):
        # ATOM: '(' ALTERNATIVES ')' | NAME | STRING
        if self.test_token(token.OP, "("):
            self.advance_token()
            rule = self.parse_alternatives()
            self.expect(token.OP, ")")
            return rule
        elif self.type in (token.NAME, token.STRING):
            atom_state = NFA()
            next_state = NFA()
            atom_state.arc(next_state, self.value)
            self.advance_token()
            return atom_state, next_state
        else:
            invalid = token.tok_name[self.type]
            raise PgenError("unexpected token: %s" % (invalid,),
                            self.location)
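

if __name__ == "__main__":
    # A minimal, self-contained smoke test.  The stub below is NOT
    # pyparser.parser.Grammar; it only provides the attributes build_grammar
    # and make_label actually touch, so the module can be exercised alone.
    class _StubGrammar(object):
        KEYWORD_TOKEN = token.NAME
        TOKENS = {"NAME": token.NAME, "NUMBER": token.NUMBER,
                  "STRING": token.STRING, "NEWLINE": token.NEWLINE,
                  "ENDMARKER": token.ENDMARKER}
        # Only the operators the sample grammar below uses.
        OPERATOR_MAP = {"+": token.PLUS}

        def __init__(self):
            self.symbol_ids = {}
            self.symbol_names = {}
            self.symbol_to_label = {}
            self.keyword_ids = {}
            self.token_ids = {}
            self.labels = []
            self.dfas = []
            self.start = -1

    source = ("eval_input: expr NEWLINE* ENDMARKER\n"
              "expr: NUMBER ('+' NUMBER)*\n")
    gram = ParserGenerator(source).build_grammar(_StubGrammar)
    print gram.symbol_ids  # {'eval_input': 256, 'expr': 257} (order varies)
    print gram.labels      # symbol ids and token numbers, one per label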