paul@437 | 1 | """ |
paul@437 | 2 | A CPython inspired RPython parser. |
paul@437 | 3 | """ |
paul@437 | 4 | |
paul@437 | 5 | |
paul@437 | 6 | class Grammar(object): |
paul@437 | 7 | """ |
paul@437 | 8 | Base Grammar object. |
paul@437 | 9 | |
paul@437 | 10 | Pass this to ParserGenerator.build_grammar to fill it with useful values for |
paul@437 | 11 | the Parser. |
paul@437 | 12 | """ |
paul@437 | 13 | |
paul@437 | 14 | def __init__(self): |
paul@437 | 15 | self.symbol_ids = {} |
paul@437 | 16 | self.symbol_names = {} |
paul@437 | 17 | self.symbol_to_label = {} |
paul@437 | 18 | self.keyword_ids = {} |
paul@437 | 19 | self.dfas = [] |
paul@437 | 20 | self.labels = [0] |
paul@437 | 21 | self.token_ids = {} |
paul@437 | 22 | self.start = -1 |
paul@437 | 23 | |
paul@437 | 24 | def shared_copy(self): |
paul@437 | 25 | new = self.__class__() |
paul@437 | 26 | new.symbol_ids = self.symbol_ids |
paul@437 | 27 | new.symbols_names = self.symbol_names |
paul@437 | 28 | new.keyword_ids = self.keyword_ids |
paul@437 | 29 | new.dfas = self.dfas |
paul@437 | 30 | new.labels = self.labels |
paul@437 | 31 | new.token_ids = self.token_ids |
paul@437 | 32 | return new |
paul@437 | 33 | |
paul@437 | 34 | def _freeze_(self): |
paul@437 | 35 | # Remove some attributes not used in parsing. |
paul@437 | 36 | try: |
paul@437 | 37 | del self.symbol_to_label |
paul@437 | 38 | del self.symbol_names |
paul@437 | 39 | del self.symbol_ids |
paul@437 | 40 | except AttributeError: |
paul@437 | 41 | pass |
paul@437 | 42 | return True |
paul@437 | 43 | |
paul@437 | 44 | |
paul@437 | 45 | class Node(object): |
paul@437 | 46 | |
paul@437 | 47 | __slots__ = ("type", ) |
paul@437 | 48 | |
paul@437 | 49 | def __init__(self, type): |
paul@437 | 50 | self.type = type |
paul@437 | 51 | |
paul@437 | 52 | def __eq__(self, other): |
paul@437 | 53 | raise NotImplementedError("abstract base class") |
paul@437 | 54 | |
paul@437 | 55 | def __ne__(self, other): |
paul@437 | 56 | return not self == other |
paul@437 | 57 | |
paul@437 | 58 | def get_value(self): |
paul@437 | 59 | return None |
paul@437 | 60 | |
paul@437 | 61 | def get_child(self, i): |
paul@437 | 62 | raise NotImplementedError("abstract base class") |
paul@437 | 63 | |
paul@437 | 64 | def num_children(self): |
paul@437 | 65 | return 0 |
paul@437 | 66 | |
paul@437 | 67 | def append_child(self, child): |
paul@437 | 68 | raise NotImplementedError("abstract base class") |
paul@437 | 69 | |
paul@437 | 70 | def get_lineno(self): |
paul@437 | 71 | raise NotImplementedError("abstract base class") |
paul@437 | 72 | |
paul@437 | 73 | def get_column(self): |
paul@437 | 74 | raise NotImplementedError("abstract base class") |
paul@437 | 75 | |
paul@437 | 76 | |
paul@437 | 77 | class Terminal(Node): |
paul@437 | 78 | __slots__ = ("value", "lineno", "column") |
paul@437 | 79 | def __init__(self, type, value, lineno, column): |
paul@437 | 80 | Node.__init__(self, type) |
paul@437 | 81 | self.value = value |
paul@437 | 82 | self.lineno = lineno |
paul@437 | 83 | self.column = column |
paul@437 | 84 | |
paul@437 | 85 | def __repr__(self): |
paul@437 | 86 | return "Terminal(type=%s, value=%r)" % (self.type, self.value) |
paul@437 | 87 | |
paul@437 | 88 | def __eq__(self, other): |
paul@437 | 89 | # For tests. |
paul@437 | 90 | return (type(self) == type(other) and |
paul@437 | 91 | self.type == other.type and |
paul@437 | 92 | self.value == other.value) |
paul@437 | 93 | |
paul@437 | 94 | def get_value(self): |
paul@437 | 95 | return self.value |
paul@437 | 96 | |
paul@437 | 97 | def get_lineno(self): |
paul@437 | 98 | return self.lineno |
paul@437 | 99 | |
paul@437 | 100 | def get_column(self): |
paul@437 | 101 | return self.column |
paul@437 | 102 | |
paul@437 | 103 | |
paul@437 | 104 | class AbstractNonterminal(Node): |
paul@437 | 105 | __slots__ = () |
paul@437 | 106 | |
paul@437 | 107 | def get_lineno(self): |
paul@437 | 108 | return self.get_child(0).get_lineno() |
paul@437 | 109 | |
paul@437 | 110 | def get_column(self): |
paul@437 | 111 | return self.get_child(0).get_column() |
paul@437 | 112 | |
paul@437 | 113 | def __eq__(self, other): |
paul@437 | 114 | # For tests. |
paul@437 | 115 | # grumble, annoying |
paul@437 | 116 | if not isinstance(other, AbstractNonterminal): |
paul@437 | 117 | return False |
paul@437 | 118 | if self.type != other.type: |
paul@437 | 119 | return False |
paul@437 | 120 | if self.num_children() != other.num_children(): |
paul@437 | 121 | return False |
paul@437 | 122 | for i in range(self.num_children()): |
paul@437 | 123 | if self.get_child(i) != other.get_child(i): |
paul@437 | 124 | return False |
paul@437 | 125 | return True |
paul@437 | 126 | |
paul@437 | 127 | |
paul@437 | 128 | class Nonterminal(AbstractNonterminal): |
paul@437 | 129 | __slots__ = ("_children", ) |
paul@437 | 130 | def __init__(self, type, children): |
paul@437 | 131 | Node.__init__(self, type) |
paul@437 | 132 | self._children = children |
paul@437 | 133 | |
paul@437 | 134 | def __repr__(self): |
paul@437 | 135 | return "Nonterminal(type=%s, children=%r)" % (self.type, self._children) |
paul@437 | 136 | |
paul@437 | 137 | def get_child(self, i): |
paul@437 | 138 | return self._children[i] |
paul@437 | 139 | |
paul@437 | 140 | def num_children(self): |
paul@437 | 141 | return len(self._children) |
paul@437 | 142 | |
paul@437 | 143 | def append_child(self, child): |
paul@437 | 144 | self._children.append(child) |
paul@437 | 145 | |
paul@437 | 146 | |
paul@437 | 147 | class Nonterminal1(AbstractNonterminal): |
paul@437 | 148 | __slots__ = ("_child", ) |
paul@437 | 149 | def __init__(self, type, child): |
paul@437 | 150 | Node.__init__(self, type) |
paul@437 | 151 | self._child = child |
paul@437 | 152 | |
paul@437 | 153 | def __repr__(self): |
paul@437 | 154 | return "Nonterminal(type=%s, children=[%r])" % (self.type, self._child) |
paul@437 | 155 | |
paul@437 | 156 | def get_child(self, i): |
paul@437 | 157 | assert i == 0 or i == -1 |
paul@437 | 158 | return self._child |
paul@437 | 159 | |
paul@437 | 160 | def num_children(self): |
paul@437 | 161 | return 1 |
paul@437 | 162 | |
paul@437 | 163 | def append_child(self, child): |
paul@437 | 164 | assert 0, "should be unreachable" |
paul@437 | 165 | |
paul@437 | 166 | |
paul@438 | 167 | class NonterminalEnc(Nonterminal1): |
paul@438 | 168 | def __init__(self, type, child, encoding): |
paul@438 | 169 | Nonterminal1.__init__(self, type, child) |
paul@438 | 170 | self.encoding = encoding |
paul@438 | 171 | |
paul@438 | 172 | def __repr__(self): |
paul@438 | 173 | return "NonterminalEnc(type=%s, child=%r, encoding=%r)" % (self.type, self._child, self.encoding) |
paul@438 | 174 | |
paul@437 | 175 | |
paul@437 | 176 | class ParseError(Exception): |
paul@437 | 177 | |
paul@437 | 178 | def __init__(self, msg, token_type, value, lineno, column, line, |
paul@437 | 179 | expected=-1): |
paul@437 | 180 | self.msg = msg |
paul@437 | 181 | self.token_type = token_type |
paul@437 | 182 | self.value = value |
paul@437 | 183 | self.lineno = lineno |
paul@437 | 184 | self.column = column |
paul@437 | 185 | self.line = line |
paul@437 | 186 | self.expected = expected |
paul@437 | 187 | |
paul@437 | 188 | def __str__(self): |
paul@437 | 189 | return "ParserError(%s, %r)" % (self.token_type, self.value) |
paul@437 | 190 | |
paul@437 | 191 | |
paul@437 | 192 | class Parser(object): |
paul@437 | 193 | |
paul@437 | 194 | def __init__(self, grammar): |
paul@437 | 195 | self.grammar = grammar |
paul@437 | 196 | self.root = None |
paul@437 | 197 | self.stack = None |
paul@437 | 198 | |
paul@437 | 199 | def prepare(self, start=-1): |
paul@437 | 200 | """Setup the parser for parsing. |
paul@437 | 201 | |
paul@437 | 202 | Takes the starting symbol as an argument. |
paul@437 | 203 | """ |
paul@437 | 204 | if start == -1: |
paul@437 | 205 | start = self.grammar.start |
paul@437 | 206 | self.root = None |
paul@437 | 207 | current_node = Nonterminal(start, []) |
paul@437 | 208 | self.stack = [] |
paul@437 | 209 | self.stack.append((self.grammar.dfas[start - 256], 0, current_node)) |
paul@437 | 210 | |
paul@437 | 211 | def add_token(self, token_type, value, lineno, column, line): |
paul@437 | 212 | label_index = self.classify(token_type, value, lineno, column, line) |
paul@437 | 213 | sym_id = 0 # for the annotator |
paul@437 | 214 | while True: |
paul@437 | 215 | dfa, state_index, node = self.stack[-1] |
paul@437 | 216 | states, first = dfa |
paul@437 | 217 | arcs, is_accepting = states[state_index] |
paul@437 | 218 | for i, next_state in arcs: |
paul@437 | 219 | sym_id = self.grammar.labels[i] |
paul@437 | 220 | if label_index == i: |
paul@437 | 221 | # We matched a non-terminal. |
paul@437 | 222 | self.shift(next_state, token_type, value, lineno, column) |
paul@437 | 223 | state = states[next_state] |
paul@437 | 224 | # While the only possible action is to accept, pop nodes off |
paul@437 | 225 | # the stack. |
paul@437 | 226 | while state[1] and not state[0]: |
paul@437 | 227 | self.pop() |
paul@437 | 228 | if not self.stack: |
paul@437 | 229 | # Parsing is done. |
paul@437 | 230 | return True |
paul@437 | 231 | dfa, state_index, node = self.stack[-1] |
paul@437 | 232 | state = dfa[0][state_index] |
paul@437 | 233 | return False |
paul@437 | 234 | elif sym_id >= 256: |
paul@437 | 235 | sub_node_dfa = self.grammar.dfas[sym_id - 256] |
paul@437 | 236 | # Check if this token can start a child node. |
paul@437 | 237 | if label_index in sub_node_dfa[1]: |
paul@437 | 238 | self.push(sub_node_dfa, next_state, sym_id, lineno, |
paul@437 | 239 | column) |
paul@437 | 240 | break |
paul@437 | 241 | else: |
paul@437 | 242 | # We failed to find any arcs to another state, so unless this |
paul@437 | 243 | # state is accepting, it's invalid input. |
paul@437 | 244 | if is_accepting: |
paul@437 | 245 | self.pop() |
paul@437 | 246 | if not self.stack: |
paul@437 | 247 | raise ParseError("too much input", token_type, value, |
paul@437 | 248 | lineno, column, line) |
paul@437 | 249 | else: |
paul@437 | 250 | # If only one possible input would satisfy, attach it to the |
paul@437 | 251 | # error. |
paul@437 | 252 | if len(arcs) == 1: |
paul@437 | 253 | expected = sym_id |
paul@437 | 254 | else: |
paul@437 | 255 | expected = -1 |
paul@437 | 256 | raise ParseError("bad input", token_type, value, lineno, |
paul@437 | 257 | column, line, expected) |
paul@437 | 258 | |
paul@437 | 259 | def classify(self, token_type, value, lineno, column, line): |
paul@437 | 260 | """Find the label for a token.""" |
paul@437 | 261 | if token_type == self.grammar.KEYWORD_TOKEN: |
paul@437 | 262 | label_index = self.grammar.keyword_ids.get(value, -1) |
paul@437 | 263 | if label_index != -1: |
paul@437 | 264 | return label_index |
paul@437 | 265 | label_index = self.grammar.token_ids.get(token_type, -1) |
paul@437 | 266 | if label_index == -1: |
paul@437 | 267 | raise ParseError("invalid token", token_type, value, lineno, column, |
paul@437 | 268 | line) |
paul@437 | 269 | return label_index |
paul@437 | 270 | |
paul@437 | 271 | def shift(self, next_state, token_type, value, lineno, column): |
paul@437 | 272 | """Shift a non-terminal and prepare for the next state.""" |
paul@437 | 273 | dfa, state, node = self.stack[-1] |
paul@437 | 274 | new_node = Terminal(token_type, value, lineno, column) |
paul@437 | 275 | node.append_child(new_node) |
paul@437 | 276 | self.stack[-1] = (dfa, next_state, node) |
paul@437 | 277 | |
paul@437 | 278 | def push(self, next_dfa, next_state, node_type, lineno, column): |
paul@437 | 279 | """Push a terminal and adjust the current state.""" |
paul@437 | 280 | dfa, state, node = self.stack[-1] |
paul@437 | 281 | new_node = Nonterminal(node_type, []) |
paul@437 | 282 | self.stack[-1] = (dfa, next_state, node) |
paul@437 | 283 | self.stack.append((next_dfa, 0, new_node)) |
paul@437 | 284 | |
paul@437 | 285 | def pop(self): |
paul@437 | 286 | """Pop an entry off the stack and make its node a child of the last.""" |
paul@437 | 287 | dfa, state, node = self.stack.pop() |
paul@437 | 288 | if self.stack: |
paul@437 | 289 | # we are now done with node, so we can store it more efficiently if |
paul@437 | 290 | # it has just one child |
paul@437 | 291 | if node.num_children() == 1: |
paul@437 | 292 | node = Nonterminal1(node.type, node.get_child(0)) |
paul@437 | 293 | self.stack[-1][2].append_child(node) |
paul@437 | 294 | else: |
paul@437 | 295 | self.root = node |