Lichen

Annotated pyparser/pyparse.py

627:05ad7964265c
2017-02-27 Paul Boddie Merged convenience macro changes.
paul@437 1
from pyparser import parser, pytokenizer, pygram, error
paul@437 2
from pyparser import consts
paul@437 3
paul@537 4
def recode_to_utf8(bytes, encoding):
    """Decode the byte string *bytes* using *encoding* and return the text
    re-encoded as UTF-8.

    Raises error.SyntaxError if the codec produces anything other than a
    unicode object.
    """
    decoded = bytes.decode(encoding)
    if isinstance(decoded, unicode):
        return decoded.encode("utf-8")
    raise error.SyntaxError("codec did not return a unicode object")
paul@537 10
paul@437 11
def _normalize_encoding(encoding):
paul@437 12
    """returns normalized name for <encoding>
paul@437 13
paul@437 14
    see dist/src/Parser/tokenizer.c 'get_normal_name()'
paul@437 15
    for implementation details / reference
paul@437 16
paul@437 17
    NOTE: for now, parser.suite() raises a MemoryError when
paul@437 18
          a bad encoding is used. (SF bug #979739)
paul@437 19
    """
paul@437 20
    if encoding is None:
paul@437 21
        return None
paul@437 22
    # lower() + '_' / '-' conversion
paul@437 23
    encoding = encoding.replace('_', '-').lower()
paul@437 24
    if encoding == 'utf-8' or encoding.startswith('utf-8-'):
paul@437 25
        return 'utf-8'
paul@437 26
    for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:
paul@437 27
        if (encoding == variant or
paul@437 28
            encoding.startswith(variant + '-')):
paul@437 29
            return 'iso-8859-1'
paul@437 30
    return encoding
paul@437 31
paul@437 32
def _check_for_encoding(s):
    """Return the encoding declared in the first two lines of <s>, or None."""
    first_nl = s.find('\n')
    if first_nl < 0:
        # Single-line source: only one candidate line to inspect.
        return _check_line_for_encoding(s)[0]
    declared, check_next = _check_line_for_encoding(s[:first_nl])
    if declared or not check_next:
        return declared
    # First line was blank or a comment without a declaration; a coding
    # cookie may still appear on the second line.
    second_nl = s.find('\n', first_nl + 1)
    if second_nl < 0:
        second_line = s[first_nl + 1:]
    else:
        second_line = s[first_nl + 1:second_nl]
    return _check_line_for_encoding(second_line)[0]
paul@437 43
paul@437 44
paul@437 45
def _check_line_for_encoding(line):
    """Return (declared encoding or None, whether to inspect the next line)."""
    pos = 0
    for pos, char in enumerate(line):
        if char == '#':
            break
        if char not in ' \t\014':
            # Real code before any comment: no declaration here, and the
            # caller must not read the second line either.
            return None, False
    return pytokenizer.match_encoding_declaration(line[pos:]), True
paul@437 54
paul@437 55
paul@437 56
class CompileInfo(object):

    """Describes one unit of source code being compiled.

    Attributes:

    * filename: The filename of the source.
    * mode: The parse mode to use. ('exec', 'eval', or 'single')
    * flags: Parser and compiler flags.
    * encoding: The source encoding.
    """

    def __init__(self, filename, mode="exec", flags=0):
        self.filename = filename
        self.mode = mode
        self.flags = flags
        # Filled in during parsing; None until an encoding is detected.
        self.encoding = None
paul@437 70
paul@437 71
paul@437 72
# Maps each CompileInfo.mode value to the grammar start symbol the parser
# is prepared with in PythonParser.parse_source.
_targets = {
paul@439 73
'eval' : pygram.syms["eval_input"],
paul@439 74
'single' : pygram.syms["single_input"],
paul@439 75
'exec' : pygram.syms["file_input"],
paul@437 76
}
paul@437 77
paul@437 78
class PythonParser(parser.Parser):
paul@437 79
paul@437 80
    # Only __init__ and parse_source are defined here; the parsing machinery
    # (prepare, add_token, root) is inherited from parser.Parser.
    def __init__(self, grammar=pygram.python_grammar):
paul@437 81
        parser.Parser.__init__(self, grammar)
paul@437 82
paul@437 83
    def parse_source(self, textsrc, compile_info):
paul@437 84
        """Main entry point for parsing Python source.
paul@437 85
paul@437 86
        Everything from decoding the source to tokenizing to building the parse
paul@437 87
        tree is handled here.
paul@437 88
        """
paul@437 89
        # Detect source encoding.
paul@437 90
        # (textsrc is handled as a byte string here: the UTF-8 BOM below is
        # compared bytewise -- Python 2 semantics.)
        enc = None
paul@437 91
        if textsrc.startswith("\xEF\xBB\xBF"):
paul@437 92
            textsrc = textsrc[3:]
paul@437 93
            enc = 'utf-8'
paul@437 94
            # If an encoding is explicitly given check that it is utf-8.
paul@437 95
            decl_enc = _check_for_encoding(textsrc)
paul@437 96
            if decl_enc and decl_enc != "utf-8":
paul@437 97
                raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,
paul@437 98
                                        filename=compile_info.filename)
paul@437 99
        # PyCF_SOURCE_IS_UTF8: the error message below suggests the text
        # originated as an already-decoded unicode string, so any coding
        # cookie is rejected rather than honoured.
        elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
paul@437 100
            enc = 'utf-8'
paul@437 101
            if _check_for_encoding(textsrc) is not None:
paul@437 102
                raise error.SyntaxError("coding declaration in unicode string",
paul@437 103
                                        filename=compile_info.filename)
paul@437 104
        # Otherwise honour any declared coding cookie, recoding the source to
        # UTF-8 so the tokenizer only ever sees one encoding.
        else:
paul@437 105
            enc = _normalize_encoding(_check_for_encoding(textsrc))
paul@537 106
            if enc is not None and enc != 'utf-8':
paul@537 107
                try:
paul@537 108
                    textsrc = recode_to_utf8(textsrc, enc)
paul@537 109
                except LookupError as e:
paul@537 110
                    # if the codec is not found, LookupError is raised.
paul@537 111
                    raise error.SyntaxError("Unknown encoding: %s" % enc,
paul@537 112
                                            filename=compile_info.filename)
paul@537 113
                # Transform unicode errors into SyntaxError
paul@537 114
                except UnicodeDecodeError as e:
paul@537 115
                    message = str(e)
paul@537 116
                    raise error.SyntaxError(message)
paul@437 117
paul@437 118
        flags = compile_info.flags
paul@437 119
paul@437 120
        # The tokenizer is very picky about how it wants its input.
paul@437 121
        source_lines = textsrc.splitlines(True)
paul@437 122
        if source_lines and not source_lines[-1].endswith("\n"):
paul@437 123
            source_lines[-1] += '\n'
paul@437 124
        # NOTE(review): clearing PyCF_DONT_IMPLY_DEDENT when the text ends in
        # a newline appears to mirror CPython's handling of complete input --
        # confirm against consts definitions.
        if textsrc and textsrc[-1] == "\n":
paul@437 125
            flags &= ~consts.PyCF_DONT_IMPLY_DEDENT
paul@437 126
paul@437 127
        # Select the grammar start symbol matching the requested parse mode.
        self.prepare(_targets[compile_info.mode])
paul@437 128
        # tp tracks the type of the token most recently fed to the parser;
        # the ParseError handler below consults it.
        tp = 0
paul@437 129
        try:
paul@437 130
            try:
paul@437 131
                # Note: we no longer pass the CO_FUTURE_* to the tokenizer,
paul@437 132
                # which is expected to work independently of them.  It's
paul@437 133
                # certainly the case for all futures in Python <= 2.7.
paul@437 134
                tokens = pytokenizer.generate_tokens(source_lines, flags)
paul@437 135
paul@437 136
                self.grammar = pygram.python_grammar
paul@437 137
paul@437 138
                # Feed tokens until add_token reports the tree is complete.
                for tp, value, lineno, column, line in tokens:
paul@437 139
                    if self.add_token(tp, value, lineno, column, line):
paul@437 140
                        break
paul@437 141
            except error.TokenError as e:
paul@437 142
                # Token errors already carry position details; attach the
                # filename and let them propagate unchanged.
                e.filename = compile_info.filename
paul@437 143
                raise
paul@437 144
            except parser.ParseError as e:
paul@437 145
                # Catch parse errors, pretty them up and reraise them as a
paul@437 146
                # SyntaxError.
paul@437 147
                new_err = error.IndentationError
paul@439 148
                if tp == pygram.tokens["INDENT"]:
paul@437 149
                    msg = "unexpected indent"
paul@439 150
                elif e.expected == pygram.tokens["INDENT"]:
paul@437 151
                    msg = "expected an indented block"
paul@437 152
                else:
paul@437 153
                    new_err = error.SyntaxError
paul@437 154
                    msg = "invalid syntax"
paul@437 155
                raise new_err(msg, e.lineno, e.column, e.line,
paul@437 156
                              compile_info.filename)
paul@437 157
            else:
paul@437 158
                tree = self.root
paul@437 159
        finally:
paul@437 160
            # Avoid hanging onto the tree.
paul@437 161
            self.root = None
paul@437 162
        if enc is not None:
paul@437 163
            compile_info.encoding = enc
paul@438 164
            # Wrap the tree in a special encoding declaration for parser module
paul@438 165
            # compatibility.
paul@439 166
            tree = parser.NonterminalEnc(pygram.syms["encoding_decl"], tree, enc)
paul@437 167
        return tree
paul@437 168
paul@437 169
def parse(filename):
    """returns the parsed contents of <filename>"""
    stream = open(filename)
    try:
        source = stream.read()
    finally:
        stream.close()
    return PythonParser().parse_source(source, CompileInfo(filename))
paul@437 177
paul@437 178
def suite(text):
    """returns the parsed form of the given program <text>"""
    parser_obj = PythonParser()
    return parser_obj.parse_source(text, CompileInfo("<stdin>"))
paul@437 182
paul@437 183
def expr(text):
    """returns the parsed form of the given expression <text>

    Parses with the 'eval' mode (eval_input start symbol) so that <text> is
    treated as a single expression, matching the behaviour of the standard
    library's parser.expr().  The previous 'single' mode selected
    single_input, which parses an interactive statement and therefore also
    accepted non-expression statements.
    """
    info = CompileInfo("<stdin>", "eval")
    return PythonParser().parse_source(text, info)
paul@437 187
paul@437 188
def st2tuple(tree, line_info=True, col_info=False):
    """returns <tree> in tuple form for the compiler package

    Nonterminals become (type, child-tuples..., [encoding]); terminals become
    (type, value, [lineno], [column]) according to the flags.  Raises
    TypeError for any other object.
    """
    if isinstance(tree, parser.AbstractNonterminal):
        l = [tree.type]
        for i in range(0, tree.num_children()):
            # Bug fix: propagate line_info/col_info into the recursion so the
            # flags actually reach the Terminal leaves; previously the
            # defaults were silently reinstated at every level.
            l.append(st2tuple(tree.get_child(i), line_info, col_info))
        if isinstance(tree, parser.NonterminalEnc):
            # The encoding declaration wrapper carries the source encoding as
            # a trailing element, as the parser module does.
            l.append(tree.encoding)
        return tuple(l)
    elif isinstance(tree, parser.Terminal):
        l = [tree.type, tree.value]
        if line_info:
            l.append(tree.get_lineno())
        if col_info:
            l.append(tree.get_column())
        return tuple(l)
    else:
        # Call form (valid in Python 2 and 3) instead of the old
        # "raise TypeError, tree" statement syntax.
        raise TypeError(tree)