Lichen

pyparser/parser.py

934:2989aab1b4f7
6 months ago Paul Boddie Renamed the utf8string class to unicode, eliminating the unicode function. This means that the simple case of merely returning an object if it is already a Unicode object no longer occurs when using the unicode callable, but such behaviour might be better supported with more general customised instantiation functionality.
     1 """     2 A CPython inspired RPython parser.     3 """     4      5      6 class Grammar(object):     7     """     8     Base Grammar object.     9     10     Pass this to ParserGenerator.build_grammar to fill it with useful values for    11     the Parser.    12     """    13     14     def __init__(self):    15         self.symbol_ids = {}    16         self.symbol_names = {}    17         self.symbol_to_label = {}    18         self.keyword_ids = {}    19         self.dfas = []    20         self.labels = [0]    21         self.token_ids = {}    22         self.start = -1    23     24     def shared_copy(self):    25         new = self.__class__()    26         new.symbol_ids = self.symbol_ids    27         new.symbols_names = self.symbol_names    28         new.keyword_ids = self.keyword_ids    29         new.dfas = self.dfas    30         new.labels = self.labels    31         new.token_ids = self.token_ids    32         return new    33     34     def _freeze_(self):    35         # Remove some attributes not used in parsing.    36         try:    37             del self.symbol_to_label    38             del self.symbol_names    39             del self.symbol_ids    40         except AttributeError:    41             pass    42         return True    43     44     45 class Node(object):    46     47     __slots__ = ("type", )    48     49     def __init__(self, type):    50         self.type = type    51     52     def __eq__(self, other):    53         raise NotImplementedError("abstract base class")    54     55     def __ne__(self, other):    56         return not self == other    57     58     def get_value(self):    59         return None    60     61     def get_child(self, i):    62         raise NotImplementedError("abstract base class")    63     64     def num_children(self):    65         return 0    66     67     def append_child(self, child):    68         raise NotImplementedError("abstract base class")    69     70     def get_lineno(self):    71         raise NotImplementedError("abstract base class")    72     73     def get_column(self):    74         raise NotImplementedError("abstract base class")    75     76     77 class Terminal(Node):    78     __slots__ = ("value", "lineno", "column")    79     def __init__(self, type, value, lineno, column):    80         Node.__init__(self, type)    81         self.value = value    82         self.lineno = lineno    83         self.column = column    84     85     def __repr__(self):    86         return "Terminal(type=%s, value=%r)" % (self.type, self.value)    87     88     def __eq__(self, other):    89         # For tests.    90         return (type(self) == type(other) and    91                 self.type == other.type and    92                 self.value == other.value)    93     94     def get_value(self):    95         return self.value    96     97     def get_lineno(self):    98         return self.lineno    99    100     def get_column(self):   101         return self.column   102    103    104 class AbstractNonterminal(Node):   105     __slots__ = ()   106    107     def get_lineno(self):   108         return self.get_child(0).get_lineno()   109    110     def get_column(self):   111         return self.get_child(0).get_column()   112    113     def __eq__(self, other):   114         # For tests.   115         # grumble, annoying   116         if not isinstance(other, AbstractNonterminal):   117             return False   118         if self.type != other.type:   119             return False   120         if self.num_children() != other.num_children():   121             return False   122         for i in range(self.num_children()):   123             if self.get_child(i) != other.get_child(i):   124                 return False   125         return True   126    127    128 class Nonterminal(AbstractNonterminal):   129     __slots__ = ("_children", )   130     def __init__(self, type, children):   131         Node.__init__(self, type)   132         self._children = children   133    134     def __repr__(self):   135         return "Nonterminal(type=%s, children=%r)" % (self.type, self._children)   136    137     def get_child(self, i):   138         return self._children[i]   139    140     def num_children(self):   141         return len(self._children)   142    143     def append_child(self, child):   144         self._children.append(child)   145    146    147 class Nonterminal1(AbstractNonterminal):   148     __slots__ = ("_child", )   149     def __init__(self, type, child):   150         Node.__init__(self, type)   151         self._child = child   152    153     def __repr__(self):   154         return "Nonterminal(type=%s, children=[%r])" % (self.type, self._child)   155    156     def get_child(self, i):   157         assert i == 0 or i == -1   158         return self._child   159    160     def num_children(self):   161         return 1   162    163     def append_child(self, child):   164         assert 0, "should be unreachable"   165    166    167 class NonterminalEnc(Nonterminal1):   168     def __init__(self, type, child, encoding):   169         Nonterminal1.__init__(self, type, child)   170         self.encoding = encoding   171    172     def __repr__(self):   173         return "NonterminalEnc(type=%s, child=%r, encoding=%r)" % (self.type, self._child, self.encoding)   174    175    176 class ParseError(Exception):   177    178     def __init__(self, msg, token_type, value, lineno, column, line,   179                  expected=-1):   180         self.msg = msg   181         self.token_type = token_type   182         self.value = value   183         self.lineno = lineno   184         self.column = column   185         self.line = line   186         self.expected = expected   187    188     def __str__(self):   189         return "ParserError(%s, %r)" % (self.token_type, self.value)   190    191    192 class Parser(object):   193    194     def __init__(self, grammar):   195         self.grammar = grammar   196         self.root = None   197         self.stack = None   198    199     def prepare(self, start=-1):   200         """Setup the parser for parsing.   201    202         Takes the starting symbol as an argument.   203         """   204         if start == -1:   205             start = self.grammar.start   206         self.root = None   207         current_node = Nonterminal(start, [])   208         self.stack = []   209         self.stack.append((self.grammar.dfas[start - 256], 0, current_node))   210    211     def add_token(self, token_type, value, lineno, column, line):   212         label_index = self.classify(token_type, value, lineno, column, line)   213         sym_id = 0 # for the annotator   214         while True:   215             dfa, state_index, node = self.stack[-1]   216             states, first = dfa   217             arcs, is_accepting = states[state_index]   218             for i, next_state in arcs:   219                 sym_id = self.grammar.labels[i]   220                 if label_index == i:   221                     # We matched a non-terminal.   222                     self.shift(next_state, token_type, value, lineno, column)   223                     state = states[next_state]   224                     # While the only possible action is to accept, pop nodes off   225                     # the stack.   226                     while state[1] and not state[0]:   227                         self.pop()   228                         if not self.stack:   229                             # Parsing is done.   230                             return True   231                         dfa, state_index, node = self.stack[-1]   232                         state = dfa[0][state_index]   233                     return False   234                 elif sym_id >= 256:   235                     sub_node_dfa = self.grammar.dfas[sym_id - 256]   236                     # Check if this token can start a child node.   237                     if label_index in sub_node_dfa[1]:   238                         self.push(sub_node_dfa, next_state, sym_id, lineno,   239                                   column)   240                         break   241             else:   242                 # We failed to find any arcs to another state, so unless this   243                 # state is accepting, it's invalid input.   244                 if is_accepting:   245                     self.pop()   246                     if not self.stack:   247                         raise ParseError("too much input", token_type, value,   248                                          lineno, column, line)   249                 else:   250                     # If only one possible input would satisfy, attach it to the   251                     # error.   252                     if len(arcs) == 1:   253                         expected = sym_id   254                     else:   255                         expected = -1   256                     raise ParseError("bad input", token_type, value, lineno,   257                                      column, line, expected)   258    259     def classify(self, token_type, value, lineno, column, line):   260         """Find the label for a token."""   261         if token_type == self.grammar.KEYWORD_TOKEN:   262             label_index = self.grammar.keyword_ids.get(value, -1)   263             if label_index != -1:   264                 return label_index   265         label_index = self.grammar.token_ids.get(token_type, -1)   266         if label_index == -1:   267             raise ParseError("invalid token", token_type, value, lineno, column,   268                              line)   269         return label_index   270    271     def shift(self, next_state, token_type, value, lineno, column):   272         """Shift a non-terminal and prepare for the next state."""   273         dfa, state, node = self.stack[-1]   274         new_node = Terminal(token_type, value, lineno, column)   275         node.append_child(new_node)   276         self.stack[-1] = (dfa, next_state, node)   277    278     def push(self, next_dfa, next_state, node_type, lineno, column):   279         """Push a terminal and adjust the current state."""   280         dfa, state, node = self.stack[-1]   281         new_node = Nonterminal(node_type, [])   282         self.stack[-1] = (dfa, next_state, node)   283         self.stack.append((next_dfa, 0, new_node))   284    285     def pop(self):   286         """Pop an entry off the stack and make its node a child of the last."""   287         dfa, state, node = self.stack.pop()   288         if self.stack:   289             # we are now done with node, so we can store it more efficiently if   290             # it has just one child   291             if node.num_children() == 1:   292                 node = Nonterminal1(node.type, node.get_child(0))   293             self.stack[-1][2].append_child(node)   294         else:   295             self.root = node