Lichen

pyparser/pyparse.py

934:2989aab1b4f7
13 months ago Paul Boddie Renamed the utf8string class to unicode, eliminating the unicode function. This means that the simple case of merely returning an object if it is already a Unicode object no longer occurs when using the unicode callable, but such behaviour might be better supported with more general customised instantiation functionality.
     1 from pyparser import parser, pytokenizer, pygram, error     2 from pyparser import consts     3      4 def recode_to_utf8(bytes, encoding):     5     text = bytes.decode(encoding)     6     if not isinstance(text, unicode):     7         raise error.SyntaxError("codec did not return a unicode object")     8     recoded = text.encode("utf-8")     9     return recoded    10     11 def _normalize_encoding(encoding):    12     """returns normalized name for <encoding>    13     14     see dist/src/Parser/tokenizer.c 'get_normal_name()'    15     for implementation details / reference    16     17     NOTE: for now, parser.suite() raises a MemoryError when    18           a bad encoding is used. (SF bug #979739)    19     """    20     if encoding is None:    21         return None    22     # lower() + '_' / '-' conversion    23     encoding = encoding.replace('_', '-').lower()    24     if encoding == 'utf-8' or encoding.startswith('utf-8-'):    25         return 'utf-8'    26     for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:    27         if (encoding == variant or    28             encoding.startswith(variant + '-')):    29             return 'iso-8859-1'    30     return encoding    31     32 def _check_for_encoding(s):    33     eol = s.find('\n')    34     if eol < 0:    35         return _check_line_for_encoding(s)[0]    36     enc, again = _check_line_for_encoding(s[:eol])    37     if enc or not again:    38         return enc    39     eol2 = s.find('\n', eol + 1)    40     if eol2 < 0:    41         return _check_line_for_encoding(s[eol + 1:])[0]    42     return _check_line_for_encoding(s[eol + 1:eol2])[0]    43     44     45 def _check_line_for_encoding(line):    46     """returns the declared encoding or None"""    47     i = 0    48     for i in range(len(line)):    49         if line[i] == '#':    50             break    51         if line[i] not in ' \t\014':    52             return None, False  # Not a comment, don't read the second line.    53     return pytokenizer.match_encoding_declaration(line[i:]), True    54     55     56 class CompileInfo(object):    57     """Stores information about the source being compiled.    58     59     * filename: The filename of the source.    60     * mode: The parse mode to use. ('exec', 'eval', or 'single')    61     * flags: Parser and compiler flags.    62     * encoding: The source encoding.    63     """    64     65     def __init__(self, filename, mode="exec", flags=0):    66         self.filename = filename    67         self.mode = mode    68         self.encoding = None    69         self.flags = flags    70     71     72 _targets = {    73 'eval' : pygram.syms["eval_input"],    74 'single' : pygram.syms["single_input"],    75 'exec' : pygram.syms["file_input"],    76 }    77     78 class PythonParser(parser.Parser):    79     80     def __init__(self, grammar=pygram.python_grammar):    81         parser.Parser.__init__(self, grammar)    82     83     def parse_source(self, textsrc, compile_info):    84         """Main entry point for parsing Python source.    85     86         Everything from decoding the source to tokenizing to building the parse    87         tree is handled here.    88         """    89         # Detect source encoding.    90         enc = None    91         if textsrc.startswith("\xEF\xBB\xBF"):    92             textsrc = textsrc[3:]    93             enc = 'utf-8'    94             # If an encoding is explicitly given check that it is utf-8.    95             decl_enc = _check_for_encoding(textsrc)    96             if decl_enc and decl_enc != "utf-8":    97                 raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,    98                                         filename=compile_info.filename)    99         elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:   100             enc = 'utf-8'   101             if _check_for_encoding(textsrc) is not None:   102                 raise error.SyntaxError("coding declaration in unicode string",   103                                         filename=compile_info.filename)   104         else:   105             enc = _normalize_encoding(_check_for_encoding(textsrc))   106             if enc is not None and enc != 'utf-8':   107                 try:   108                     textsrc = recode_to_utf8(textsrc, enc)   109                 except LookupError as e:   110                     # if the codec is not found, LookupError is raised.   111                     raise error.SyntaxError("Unknown encoding: %s" % enc,   112                                             filename=compile_info.filename)   113                 # Transform unicode errors into SyntaxError   114                 except UnicodeDecodeError as e:   115                     message = str(e)   116                     raise error.SyntaxError(message)   117    118         flags = compile_info.flags   119    120         # The tokenizer is very picky about how it wants its input.   121         source_lines = textsrc.splitlines(True)   122         if source_lines and not source_lines[-1].endswith("\n"):   123             source_lines[-1] += '\n'   124         if textsrc and textsrc[-1] == "\n":   125             flags &= ~consts.PyCF_DONT_IMPLY_DEDENT   126    127         self.prepare(_targets[compile_info.mode])   128         tp = 0   129         try:   130             try:   131                 # Note: we no longer pass the CO_FUTURE_* to the tokenizer,   132                 # which is expected to work independently of them.  It's   133                 # certainly the case for all futures in Python <= 2.7.   134                 tokens = pytokenizer.generate_tokens(source_lines, flags)   135    136                 self.grammar = pygram.python_grammar   137    138                 for tp, value, lineno, column, line in tokens:   139                     if self.add_token(tp, value, lineno, column, line):   140                         break   141             except error.TokenError as e:   142                 e.filename = compile_info.filename   143                 raise   144             except parser.ParseError as e:   145                 # Catch parse errors, pretty them up and reraise them as a   146                 # SyntaxError.   147                 new_err = error.IndentationError   148                 if tp == pygram.tokens["INDENT"]:   149                     msg = "unexpected indent"   150                 elif e.expected == pygram.tokens["INDENT"]:   151                     msg = "expected an indented block"   152                 else:   153                     new_err = error.SyntaxError   154                     msg = "invalid syntax"   155                 raise new_err(msg, e.lineno, e.column, e.line,   156                               compile_info.filename)   157             else:   158                 tree = self.root   159         finally:   160             # Avoid hanging onto the tree.   161             self.root = None   162         if enc is not None:   163             compile_info.encoding = enc   164             # Wrap the tree in a special encoding declaration for parser module   165             # compatibility.   166             tree = parser.NonterminalEnc(pygram.syms["encoding_decl"], tree, enc)   167         return tree   168    169 def parse(filename):   170     """returns the parsed contents of <filename>"""   171     info = CompileInfo(filename)   172     f = open(filename)   173     try:   174         return PythonParser().parse_source(f.read(), info)   175     finally:   176         f.close()   177    178 def suite(text):   179     """returns the parsed form of the given program <text>"""   180     info = CompileInfo("<stdin>")   181     return PythonParser().parse_source(text, info)   182    183 def expr(text):   184     """returns the parsed form of the given expression <text>"""   185     info = CompileInfo("<stdin>", "single")   186     return PythonParser().parse_source(text, info)   187    188 def st2tuple(tree, line_info=True, col_info=False):   189     """returns <tree> in tuple form for the compiler package"""   190     if isinstance(tree, parser.AbstractNonterminal):   191         l = [tree.type]   192         for i in range(0, tree.num_children()):   193             l.append(st2tuple(tree.get_child(i)))   194         if isinstance(tree, parser.NonterminalEnc):   195             l.append(tree.encoding)   196         return tuple(l)   197     elif isinstance(tree, parser.Terminal):   198         l = [tree.type, tree.value]   199         if line_info:   200             l.append(tree.get_lineno())   201         if col_info:   202             l.append(tree.get_column())   203         return tuple(l)   204     else:   205         raise TypeError, tree