Lichen

Annotated pyparser/pytokenizer.py

792:d70932955645
2017-03-31, Paul Boddie: Fixed non-recognition of deferred references in non-module, non-function scopes.
from pyparser import automata
from pyparser.pygram import tokens
from pyparser.pytoken import python_opmap
from pyparser.error import TokenError, TokenIndentationError
from pyparser.pytokenize import tabsize, whiteSpaceDFA, \
    triple_quoted, endDFAs, single_quoted, pseudoDFA
from pyparser import consts

NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
NUMCHARS = '0123456789'
ALNUMCHARS = NAMECHARS + NUMCHARS
EXTENDED_ALNUMCHARS = ALNUMCHARS + '-.'
WHITESPACES = ' \t\n\r\v\f'

def match_encoding_declaration(comment):
    """returns the declared encoding or None

    This function is a replacement for:
    >>> py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)")
    >>> py_encoding.search(comment)
    """
    index = comment.find('coding')
    if index < 0:
        return None
    # guard (added): avoid an IndexError when the comment ends with the
    # bare word 'coding'
    if index + 6 >= len(comment):
        return None
    next_char = comment[index + 6]
    if next_char not in ':=':
        return None
    end_of_decl = comment[index + 7:]
    index = 0
    for char in end_of_decl:
        if char not in WHITESPACES:
            break
        index += 1
    else:
        return None
    encoding = ''
    for char in end_of_decl[index:]:
        if char in EXTENDED_ALNUMCHARS:
            encoding += char
        else:
            break
    if encoding != '':
        return encoding
    return None
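

# A usage sketch (added, not part of the original module): the parser above
# mirrors the regex shown in its docstring.
#
#     >>> match_encoding_declaration("# -*- coding: utf-8 -*-")
#     'utf-8'
#     >>> match_encoding_declaration("# plain comment") is None
#     True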

DUMMY_DFA = automata.DFA([], [])

def generate_tokens(lines, flags):
    """
    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
    the original function is not RPython (it uses yield). It was also
    modified to build a list of 5-tuples rather than to generate them; each
    tuple consists of

    * the token type
    * the token text
    * the line number (the real one, counting continuation lines)
    * the position on the line of the start of the token
    * the whole line as a string

    Original docstring::

        The generate_tokens() generator requires one argument, readline, which
        must be a callable object which provides the same interface as the
        readline() method of built-in file objects. Each call to the function
        should return one line of input as a string.

        The generator produces 5-tuples with these members: the token type; the
        token string; a 2-tuple (srow, scol) of ints specifying the row and
        column where the token begins in the source; a 2-tuple (erow, ecol) of
        ints specifying the row and column where the token ends in the source;
        and the line on which the token was found. The line passed is the
        logical line; continuation lines are included.
    """
    token_list = []
    lnum = parenlev = continued = 0
    namechars = NAMECHARS
    numchars = NUMCHARS
    contstr, needcont = '', 0
    contline = None
    indents = [0]
    last_comment = ''
    parenlevstart = (0, 0, "")

    # make the annotator happy
    endDFA = DUMMY_DFA
    # make the annotator happy
    line = ''
    pos = 0
    lines.append("")
    strstart = (0, 0, "")
    for line in lines:
        lnum = lnum + 1
        line = universal_newline(line)
        pos, max = 0, len(line)

        if contstr:
            if not line:
                raise TokenError(
                    "EOF while scanning triple-quoted string literal",
                    strstart[2], strstart[0], strstart[1]+1,
                    token_list, lnum-1)
            endmatch = endDFA.recognize(line)
            if endmatch >= 0:
                pos = end = endmatch
                tok = (tokens["STRING"], contstr + line[:end], strstart[0],
                       strstart[1], line)
                token_list.append(tok)
                last_comment = ''
                contstr, needcont = '', 0
                contline = None
            elif (needcont and not line.endswith('\\\n') and
                               not line.endswith('\\\r\n')):
                tok = (tokens["ERRORTOKEN"], contstr + line, strstart[0],
                       strstart[1], line)
                token_list.append(tok)
                last_comment = ''
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
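            # Note (added): a tab advances the column to the next multiple of
            # tabsize; with tabsize == 8, a tab at column 3 advances to 8 and
            # "\t\t" at the start of a line yields column 16.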
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':
                # skip comments or blank lines
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                token_list.append((tokens["INDENT"], line[:pos], lnum, 0, line))
                last_comment = ''
            while column < indents[-1]:
                indents = indents[:-1]
                token_list.append((tokens["DEDENT"], '', lnum, pos, line))
                last_comment = ''
            if column != indents[-1]:
                err = "unindent does not match any outer indentation level"
                raise TokenIndentationError(err, line, lnum, 0, token_list)

        else:                                  # continued statement
            if not line:
                if parenlev > 0:
                    lnum1, start1, line1 = parenlevstart
                    raise TokenError("parenthesis is never closed", line1,
                                     lnum1, start1 + 1, token_list, lnum)
                raise TokenError("EOF in multi-line statement", line,
                                 lnum, 0, token_list)
            continued = 0

        while pos < max:
            pseudomatch = pseudoDFA.recognize(line, pos)
            if pseudomatch >= 0:                            # scan for tokens
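                # Note (added): pseudoDFA matched the next pseudo-token
                # including any leading whitespace; whiteSpaceDFA finds where
                # that whitespace ends, i.e. where the token text starts.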
                # JDR: Modified
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                end = pseudomatch

                if start == end:
                    raise TokenError("Unknown character", line,
                                     lnum, start + 1, token_list)

                pos = end
                token, initial = line[start:end], line[start]
                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    token_list.append((tokens["NUMBER"], token, lnum, start, line))
                    last_comment = ''
                elif initial in '\r\n':
                    if parenlev <= 0:
                        tok = (tokens["NEWLINE"], last_comment, lnum, start, line)
                        token_list.append(tok)
                    last_comment = ''
                elif initial == '#':
                    # skip comment
                    last_comment = token
                elif token in triple_quoted:
                    endDFA = endDFAs[token]
                    endmatch = endDFA.recognize(line, pos)
                    if endmatch >= 0:                     # all on one line
                        pos = endmatch
                        token = line[start:pos]
                        tok = (tokens["STRING"], token, lnum, start, line)
                        token_list.append(tok)
                        last_comment = ''
                    else:
                        strstart = (lnum, start, line)
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start, line)
                        endDFA = (endDFAs[initial] or endDFAs[token[1]] or
                                   endDFAs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        tok = (tokens["STRING"], token, lnum, start, line)
                        token_list.append(tok)
                        last_comment = ''
                elif initial in namechars:                 # ordinary name
                    token_list.append((tokens["NAME"], token, lnum, start, line))
                    last_comment = ''
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        if parenlev == 0:
                            parenlevstart = (lnum, start, line)
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                        if parenlev < 0:
                            raise TokenError("unmatched '%s'" % initial, line,
                                             lnum, start + 1, token_list)
                    if token in python_opmap:
                        punct = python_opmap[token]
                    else:
                        punct = tokens["OP"]
                    token_list.append((punct, token, lnum, start, line))
                    last_comment = ''
            else:
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                if start < max and line[start] in single_quoted:
                    raise TokenError("EOL while scanning string literal",
                                     line, lnum, start+1, token_list)
                tok = (tokens["ERRORTOKEN"], line[pos], lnum, pos, line)
                token_list.append(tok)
                last_comment = ''
                pos = pos + 1

    lnum -= 1
    if not (flags & consts.PyCF_DONT_IMPLY_DEDENT):
        if token_list and token_list[-1][0] != tokens["NEWLINE"]:
            tok = (tokens["NEWLINE"], '', lnum, 0, '\n')
            token_list.append(tok)
        for indent in indents[1:]:                # pop remaining indent levels
            token_list.append((tokens["DEDENT"], '', lnum, pos, line))
    tok = (tokens["NEWLINE"], '', lnum, 0, '\n')
    token_list.append(tok)

    token_list.append((tokens["ENDMARKER"], '', lnum, pos, line))
    return token_list


def universal_newline(line):
    # show annotator that indexes below are non-negative
    line_len_m2 = len(line) - 2
    if line_len_m2 >= 0 and line[-2] == '\r' and line[-1] == '\n':
        return line[:line_len_m2] + '\n'
    line_len_m1 = len(line) - 1
    if line_len_m1 >= 0 and line[-1] == '\r':
        return line[:line_len_m1] + '\n'
    return line
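

# For illustration (added): universal_newline normalises Mac ('\r') and
# Windows ('\r\n') line endings to a single '\n'.
#
#     >>> universal_newline('x = 1\r\n')
#     'x = 1\n'
#     >>> universal_newline('x = 1\r')
#     'x = 1\n'
#     >>> universal_newline('x = 1\n')
#     'x = 1\n'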