paul@437 | 1 | from pyparser import parser, pytokenizer, pygram, error |
paul@437 | 2 | from pyparser import consts |
paul@437 | 3 | |
def recode_to_utf8(bytes, encoding):
    """Decode ``bytes`` with ``encoding`` and return the text re-encoded as UTF-8.

    Raises error.SyntaxError when the codec yields something other than a
    unicode object.
    """
    decoded = bytes.decode(encoding)
    if not isinstance(decoded, unicode):
        raise error.SyntaxError("codec did not return a unicode object")
    return decoded.encode("utf-8")
paul@437 | 10 | |
paul@437 | 11 | def _normalize_encoding(encoding): |
paul@437 | 12 | """returns normalized name for <encoding> |
paul@437 | 13 | |
paul@437 | 14 | see dist/src/Parser/tokenizer.c 'get_normal_name()' |
paul@437 | 15 | for implementation details / reference |
paul@437 | 16 | |
paul@437 | 17 | NOTE: for now, parser.suite() raises a MemoryError when |
paul@437 | 18 | a bad encoding is used. (SF bug #979739) |
paul@437 | 19 | """ |
paul@437 | 20 | if encoding is None: |
paul@437 | 21 | return None |
paul@437 | 22 | # lower() + '_' / '-' conversion |
paul@437 | 23 | encoding = encoding.replace('_', '-').lower() |
paul@437 | 24 | if encoding == 'utf-8' or encoding.startswith('utf-8-'): |
paul@437 | 25 | return 'utf-8' |
paul@437 | 26 | for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']: |
paul@437 | 27 | if (encoding == variant or |
paul@437 | 28 | encoding.startswith(variant + '-')): |
paul@437 | 29 | return 'iso-8859-1' |
paul@437 | 30 | return encoding |
paul@437 | 31 | |
def _check_for_encoding(s):
    """Return the encoding declared in the first two lines of ``s``, or None.

    The coding cookie may sit on line one or, when line one is a pure
    comment/blank line, on line two.
    """
    first_eol = s.find('\n')
    if first_eol < 0:
        # Single unterminated line: only it can carry the cookie.
        return _check_line_for_encoding(s)[0]
    enc, check_next = _check_line_for_encoding(s[:first_eol])
    if enc or not check_next:
        return enc
    second_eol = s.find('\n', first_eol + 1)
    if second_eol < 0:
        second_line = s[first_eol + 1:]
    else:
        second_line = s[first_eol + 1:second_eol]
    return _check_line_for_encoding(second_line)[0]
paul@437 | 43 | |
paul@437 | 44 | |
def _check_line_for_encoding(line):
    """returns the declared encoding or None

    The second element of the returned pair tells the caller whether the
    following line may still carry the declaration.
    """
    pos = 0
    for pos, char in enumerate(line):
        if char == '#':
            break
        if char not in ' \t\014':
            # Not a comment line, so don't read the second line.
            return None, False
    return pytokenizer.match_encoding_declaration(line[pos:]), True
paul@437 | 54 | |
paul@437 | 55 | |
class CompileInfo(object):
    """Bundle of metadata about the source being compiled.

    Attributes:
      filename: name of the file the source came from.
      mode: parse mode, one of 'exec', 'eval' or 'single'.
      flags: parser and compiler flags.
      encoding: the source encoding (None until detected).
    """

    def __init__(self, filename, mode="exec", flags=0):
        self.filename = filename
        self.mode = mode
        self.flags = flags
        # Filled in during parsing, once the encoding is known.
        self.encoding = None
paul@437 | 70 | |
paul@437 | 71 | |
# Maps each compile mode to the grammar start symbol the parser begins from.
_targets = {
    'eval' : pygram.syms["eval_input"],
    'single' : pygram.syms["single_input"],
    'exec' : pygram.syms["file_input"],
    }
paul@437 | 77 | |
class PythonParser(parser.Parser):
    """Parser front-end: decodes, tokenizes and parses Python source text."""

    def __init__(self, grammar=pygram.python_grammar):
        parser.Parser.__init__(self, grammar)

    def parse_source(self, textsrc, compile_info):
        """Main entry point for parsing Python source.

        Everything from decoding the source to tokenizing to building the parse
        tree is handled here.
        """
        # Detect source encoding.
        enc = None
        if textsrc.startswith("\xEF\xBB\xBF"):
            # A UTF-8 byte order mark: strip it and pin the encoding.
            textsrc = textsrc[3:]
            enc = 'utf-8'
            # If an encoding is explicitly given check that it is utf-8.
            decl_enc = _check_for_encoding(textsrc)
            if decl_enc and decl_enc != "utf-8":
                raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,
                                        filename=compile_info.filename)
        elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
            enc = 'utf-8'
            if _check_for_encoding(textsrc) is not None:
                raise error.SyntaxError("coding declaration in unicode string",
                                        filename=compile_info.filename)
        else:
            enc = _normalize_encoding(_check_for_encoding(textsrc))
            if enc is not None and enc not in ('utf-8', 'iso-8859-1'):
                try:
                    textsrc = recode_to_utf8(textsrc, enc)
                except LookupError as e:
                    # if the codec is not found, LookupError is raised.
                    raise error.SyntaxError("Unknown encoding: %s" % enc,
                                            filename=compile_info.filename)
                # Transform unicode errors into SyntaxError
                except UnicodeDecodeError as e:
                    message = str(e)
                    raise error.SyntaxError(message)

        flags = compile_info.flags

        # The tokenizer is very picky about how it wants its input.
        source_lines = textsrc.splitlines(True)
        if source_lines and not source_lines[-1].endswith("\n"):
            source_lines[-1] += '\n'
        if textsrc and textsrc[-1] == "\n":
            flags &= ~consts.PyCF_DONT_IMPLY_DEDENT

        self.prepare(_targets[compile_info.mode])
        tp = 0
        try:
            try:
                # Note: we no longer pass the CO_FUTURE_* to the tokenizer,
                # which is expected to work independently of them. It's
                # certainly the case for all futures in Python <= 2.7.
                tokens = pytokenizer.generate_tokens(source_lines, flags)

                self.grammar = pygram.python_grammar

                for tp, value, lineno, column, line in tokens:
                    if self.add_token(tp, value, lineno, column, line):
                        break
            except error.TokenError as e:
                e.filename = compile_info.filename
                raise
            except parser.ParseError as e:
                # Catch parse errors, pretty them up and reraise them as a
                # SyntaxError.
                new_err = error.IndentationError
                if tp == pygram.tokens["INDENT"]:
                    msg = "unexpected indent"
                elif e.expected == pygram.tokens["INDENT"]:
                    msg = "expected an indented block"
                else:
                    new_err = error.SyntaxError
                    msg = "invalid syntax"
                raise new_err(msg, e.lineno, e.column, e.line,
                              compile_info.filename)
            else:
                tree = self.root
        finally:
            # Avoid hanging onto the tree.
            self.root = None
        if enc is not None:
            compile_info.encoding = enc
            # Wrap the tree in a special encoding declaration for parser module
            # compatibility.
            tree = parser.NonterminalEnc(pygram.syms["encoding_decl"], tree, enc)
        return tree
paul@437 | 168 | |
def parse(filename):
    """returns the parsed contents of <filename>"""
    info = CompileInfo(filename)
    with open(filename) as f:
        return PythonParser().parse_source(f.read(), info)
paul@437 | 177 | |
def suite(text):
    """returns the parsed form of the given program <text>"""
    return PythonParser().parse_source(text, CompileInfo("<stdin>"))
paul@437 | 182 | |
def expr(text):
    """returns the parsed form of the given expression <text>"""
    # An expression must be parsed from the grammar's eval_input start
    # symbol (_targets['eval']), matching CPython's parser.expr().  The
    # previous mode, "single", selected single_input, which is meant for
    # interactive statements, not bare expressions.
    info = CompileInfo("<stdin>", "eval")
    return PythonParser().parse_source(text, info)
paul@437 | 187 | |
def st2tuple(tree, line_info=True, col_info=False):
    """returns <tree> in tuple form for the compiler package

    * line_info: when true, append each terminal's line number.
    * col_info: when true, append each terminal's column.

    Raises TypeError for objects that are neither nonterminal nor terminal
    nodes.
    """
    if isinstance(tree, parser.AbstractNonterminal):
        l = [tree.type]
        for i in range(0, tree.num_children()):
            # Bug fix: propagate line_info/col_info into the recursion;
            # previously children silently reverted to the default flags,
            # so e.g. st2tuple(tree, line_info=False) still emitted line
            # numbers for every terminal below the root.
            l.append(st2tuple(tree.get_child(i), line_info, col_info))
        if isinstance(tree, parser.NonterminalEnc):
            # The encoding_decl wrapper also carries the source encoding.
            l.append(tree.encoding)
        return tuple(l)
    elif isinstance(tree, parser.Terminal):
        l = [tree.type, tree.value]
        if line_info:
            l.append(tree.get_lineno())
        if col_info:
            l.append(tree.get_column())
        return tuple(l)
    else:
        # Call-form raise (identical behavior to the old Py2-only
        # "raise TypeError, tree" statement syntax).
        raise TypeError(tree)