Lichen

pyparser/pyparse.py

1037:2ef21d25c6e5
5 months ago Paul Boddie Merged changes from the default branch. trailing-data
     1 from pyparser import parser, pytokenizer, pygram, error     2 from pyparser import consts     3      4 def recode_to_utf8(bytes, encoding):     5     text = bytes.decode(encoding)     6     if not isinstance(text, unicode):     7         raise error.SyntaxError("codec did not return a unicode object")     8     recoded = text.encode("utf-8")     9     return recoded    10     11 def _normalize_encoding(encoding):    12     """returns normalized name for <encoding>    13     14     see dist/src/Parser/tokenizer.c 'get_normal_name()'    15     for implementation details / reference    16     17     NOTE: for now, parser.suite() raises a MemoryError when    18           a bad encoding is used. (SF bug #979739)    19     """    20     if encoding is None:    21         return None    22     # lower() + '_' / '-' conversion    23     encoding = encoding.replace('_', '-').lower()    24     if encoding == 'utf-8' or encoding.startswith('utf-8-'):    25         return 'utf-8'    26     for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:    27         if (encoding == variant or    28             encoding.startswith(variant + '-')):    29             return 'iso-8859-1'    30     return encoding    31     32 def _check_for_encoding(s):    33     eol = s.find('\n')    34     if eol < 0:    35         return _check_line_for_encoding(s)[0]    36     enc, again = _check_line_for_encoding(s[:eol])    37     if enc or not again:    38         return enc    39     eol2 = s.find('\n', eol + 1)    40     if eol2 < 0:    41         return _check_line_for_encoding(s[eol + 1:])[0]    42     return _check_line_for_encoding(s[eol + 1:eol2])[0]    43     44     45 def _check_line_for_encoding(line):    46     """returns the declared encoding or None"""    47     i = 0    48     for i in range(len(line)):    49         if line[i] == '#':    50             break    51         if line[i] not in ' \t\014':    52             return None, False  # Not a comment, don't read the second line.    53     return pytokenizer.match_encoding_declaration(line[i:]), True    54     55     56 class CompileInfo(object):    57     """Stores information about the source being compiled.    58     59     * filename: The filename of the source.    60     * mode: The parse mode to use. ('exec', 'eval', or 'single')    61     * flags: Parser and compiler flags.    62     * encoding: The source encoding.    63     """    64     65     def __init__(self, filename, mode="exec", flags=0):    66         self.filename = filename    67         self.mode = mode    68         self.encoding = None    69         self.flags = flags    70     71     72 _targets = {    73 'eval' : pygram.syms["eval_input"],    74 'single' : pygram.syms["single_input"],    75 'exec' : pygram.syms["file_input"],    76 }    77     78 class PythonParser(parser.Parser):    79     80     def __init__(self, grammar=pygram.python_grammar):    81         parser.Parser.__init__(self, grammar)    82     83     def parse_source(self, textsrc, compile_info):    84         """Main entry point for parsing Python source.    85     86         Everything from decoding the source to tokenizing to building the parse    87         tree is handled here.    88         """    89         # Detect source encoding.    90         enc = None    91         if textsrc.startswith("\xEF\xBB\xBF"):    92             textsrc = textsrc[3:]    93             enc = 'utf-8'    94             # If an encoding is explicitly given check that it is utf-8.    95             decl_enc = _check_for_encoding(textsrc)    96             if decl_enc and decl_enc != "utf-8":    97                 raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,    98                                         filename=compile_info.filename)    99         elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:   100             enc = 'utf-8'   101             if _check_for_encoding(textsrc) is not None:   102                 raise error.SyntaxError("coding declaration in unicode string",   103                                         filename=compile_info.filename)   104         else:   105             enc = _normalize_encoding(_check_for_encoding(textsrc))   106             if enc is not None and enc != 'utf-8':   107                 try:   108                     textsrc = recode_to_utf8(textsrc, enc)   109                 except LookupError as e:   110                     # if the codec is not found, LookupError is raised.   111                     raise error.SyntaxError("Unknown encoding: %s" % enc,   112                                             filename=compile_info.filename)   113                 # Transform unicode errors into SyntaxError   114                 except UnicodeDecodeError as e:   115                     message = str(e)   116                     raise error.SyntaxError(message)   117    118         flags = compile_info.flags   119    120         # The tokenizer is very picky about how it wants its input.   121         source_lines = textsrc.splitlines(True)   122         if source_lines and not source_lines[-1].endswith("\n"):   123             source_lines[-1] += '\n'   124         if textsrc and textsrc[-1] == "\n":   125             flags &= ~consts.PyCF_DONT_IMPLY_DEDENT   126    127         self.prepare(_targets[compile_info.mode])   128         tp = 0   129         try:   130             try:   131                 # Note: we no longer pass the CO_FUTURE_* to the tokenizer,   132                 # which is expected to work independently of them.  It's   133                 # certainly the case for all futures in Python <= 2.7.   134                 tokens = pytokenizer.generate_tokens(source_lines, flags)   135    136                 self.grammar = pygram.python_grammar   137    138                 for tp, value, lineno, column, line in tokens:   139                     if self.add_token(tp, value, lineno, column, line):   140                         break   141             except error.TokenError as e:   142                 e.filename = compile_info.filename   143                 raise   144             except parser.ParseError as e:   145                 # Catch parse errors, pretty them up and reraise them as a   146                 # SyntaxError.   147                 new_err = error.IndentationError   148                 if tp == pygram.tokens["INDENT"]:   149                     msg = "unexpected indent"   150                 elif e.expected == pygram.tokens["INDENT"]:   151                     msg = "expected an indented block"   152                 else:   153                     new_err = error.SyntaxError   154                     msg = "invalid syntax"   155                 raise new_err(msg, e.lineno, e.column, e.line,   156                               compile_info.filename)   157             else:   158                 tree = self.root   159         finally:   160             # Avoid hanging onto the tree.   161             self.root = None   162         if enc is not None:   163             compile_info.encoding = enc   164             # Wrap the tree in a special encoding declaration for parser module   165             # compatibility.   166             tree = parser.NonterminalEnc(pygram.syms["encoding_decl"], tree, enc)   167         return tree   168    169 def parse(filename):   170     """returns the parsed contents of <filename>"""   171     info = CompileInfo(filename)   172     f = open(filename)   173     try:   174         return PythonParser().parse_source(f.read(), info)   175     finally:   176         f.close()   177    178 def suite(text):   179     """returns the parsed form of the given program <text>"""   180     info = CompileInfo("<stdin>")   181     return PythonParser().parse_source(text, info)   182    183 def expr(text):   184     """returns the parsed form of the given expression <text>"""   185     info = CompileInfo("<stdin>", "single")   186     return PythonParser().parse_source(text, info)   187    188 def st2tuple(tree, line_info=True, col_info=False):   189     """returns <tree> in tuple form for the compiler package"""   190     if isinstance(tree, parser.AbstractNonterminal):   191         l = [tree.type]   192         for i in range(0, tree.num_children()):   193             l.append(st2tuple(tree.get_child(i)))   194         if isinstance(tree, parser.NonterminalEnc):   195             l.append(tree.encoding)   196         return tuple(l)   197     elif isinstance(tree, parser.Terminal):   198         l = [tree.type, tree.value]   199         if line_info:   200             l.append(tree.get_lineno())   201         if col_info:   202             l.append(tree.get_column())   203         return tuple(l)   204     else:   205         raise TypeError, tree