# Lichen: annotated listing of pyparser/pytokenize.py
# Changeset 750:44ea0968a550
# 2017-03-20 Paul Boddie: Added the context identity to the AttrResult
# string representation.
# ______________________________________________________________________
"""Module pytokenize

THIS FILE WAS COPIED FROM pypy/module/parser/pytokenize.py AND ADAPTED
TO BE ANNOTABLE (Mainly made lists homogeneous)

This is a modified version of Ka-Ping Yee's tokenize module found in the
Python standard library.

The primary modification is the removal of the tokenizer's dependence on the
standard Python regular expression module, which is written in C.  The regular
expressions have been replaced with hand built DFA's using the
basil.util.automata module.

$Id: pytokenize.py,v 1.3 2003/10/03 16:31:53 jriehl Exp $
"""
# ______________________________________________________________________

from pyparser import automata

# Only "tokenize" is declared public, although this adapted copy no longer
# defines it here (see the PYPY MODIFICATION notes at the end of the file).
__all__ = [ "tokenize" ]

# ______________________________________________________________________
# Automatically generated DFA's

# Acceptance flags and transition tables for the main tokenizer DFA
# ("pseudoDFA" below).  Machine generated — do not edit by hand.
# Each state is a dict mapping an input character (or automata.DEFAULT)
# to the next state's index; accepts[i] is True when state i is accepting.
accepts = [True, True, True, True, True, True, True, True,
           True, True, False, True, True, True, True, False,
           False, False, True, False, False, True, False,
           False, True, False, True, False, True, False,
           False, True, False, False, True, True, True,
           False, False, True, False, False, False, True]
states = [
    # 0
    {'\t': 0, '\n': 13, '\x0c': 0,
     '\r': 14, ' ': 0, '!': 10, '"': 16,
     '#': 18, '%': 12, '&': 12, "'": 15,
     '(': 13, ')': 13, '*': 7, '+': 12,
     ',': 13, '-': 12, '.': 6, '/': 11,
     '0': 4, '1': 5, '2': 5, '3': 5,
     '4': 5, '5': 5, '6': 5, '7': 5,
     '8': 5, '9': 5, ':': 13, ';': 13,
     '<': 9, '=': 12, '>': 8, '@': 13,
     'A': 1, 'B': 2, 'C': 1, 'D': 1,
     'E': 1, 'F': 1, 'G': 1, 'H': 1,
     'I': 1, 'J': 1, 'K': 1, 'L': 1,
     'M': 1, 'N': 1, 'O': 1, 'P': 1,
     'Q': 1, 'R': 3, 'S': 1, 'T': 1,
     'U': 2, 'V': 1, 'W': 1, 'X': 1,
     'Y': 1, 'Z': 1, '[': 13, '\\': 17,
     ']': 13, '^': 12, '_': 1, '`': 13,
     'a': 1, 'b': 2, 'c': 1, 'd': 1,
     'e': 1, 'f': 1, 'g': 1, 'h': 1,
     'i': 1, 'j': 1, 'k': 1, 'l': 1,
     'm': 1, 'n': 1, 'o': 1, 'p': 1,
     'q': 1, 'r': 3, 's': 1, 't': 1,
     'u': 2, 'v': 1, 'w': 1, 'x': 1,
     'y': 1, 'z': 1, '{': 13, '|': 12,
     '}': 13, '~': 13},
    # 1
    {'0': 1, '1': 1, '2': 1, '3': 1,
     '4': 1, '5': 1, '6': 1, '7': 1,
     '8': 1, '9': 1, 'A': 1, 'B': 1,
     'C': 1, 'D': 1, 'E': 1, 'F': 1,
     'G': 1, 'H': 1, 'I': 1, 'J': 1,
     'K': 1, 'L': 1, 'M': 1, 'N': 1,
     'O': 1, 'P': 1, 'Q': 1, 'R': 1,
     'S': 1, 'T': 1, 'U': 1, 'V': 1,
     'W': 1, 'X': 1, 'Y': 1, 'Z': 1,
     '_': 1, 'a': 1, 'b': 1, 'c': 1,
     'd': 1, 'e': 1, 'f': 1, 'g': 1,
     'h': 1, 'i': 1, 'j': 1, 'k': 1,
     'l': 1, 'm': 1, 'n': 1, 'o': 1,
     'p': 1, 'q': 1, 'r': 1, 's': 1,
     't': 1, 'u': 1, 'v': 1, 'w': 1,
     'x': 1, 'y': 1, 'z': 1},
    # 2
    {'"': 16, "'": 15, '0': 1, '1': 1,
     '2': 1, '3': 1, '4': 1, '5': 1,
     '6': 1, '7': 1, '8': 1, '9': 1,
     'A': 1, 'B': 1, 'C': 1, 'D': 1,
     'E': 1, 'F': 1, 'G': 1, 'H': 1,
     'I': 1, 'J': 1, 'K': 1, 'L': 1,
     'M': 1, 'N': 1, 'O': 1, 'P': 1,
     'Q': 1, 'R': 3, 'S': 1, 'T': 1,
     'U': 1, 'V': 1, 'W': 1, 'X': 1,
     'Y': 1, 'Z': 1, '_': 1, 'a': 1,
     'b': 1, 'c': 1, 'd': 1, 'e': 1,
     'f': 1, 'g': 1, 'h': 1, 'i': 1,
     'j': 1, 'k': 1, 'l': 1, 'm': 1,
     'n': 1, 'o': 1, 'p': 1, 'q': 1,
     'r': 3, 's': 1, 't': 1, 'u': 1,
     'v': 1, 'w': 1, 'x': 1, 'y': 1,
     'z': 1},
    # 3
    {'"': 16, "'": 15, '0': 1, '1': 1,
     '2': 1, '3': 1, '4': 1, '5': 1,
     '6': 1, '7': 1, '8': 1, '9': 1,
     'A': 1, 'B': 1, 'C': 1, 'D': 1,
     'E': 1, 'F': 1, 'G': 1, 'H': 1,
     'I': 1, 'J': 1, 'K': 1, 'L': 1,
     'M': 1, 'N': 1, 'O': 1, 'P': 1,
     'Q': 1, 'R': 1, 'S': 1, 'T': 1,
     'U': 1, 'V': 1, 'W': 1, 'X': 1,
     'Y': 1, 'Z': 1, '_': 1, 'a': 1,
     'b': 1, 'c': 1, 'd': 1, 'e': 1,
     'f': 1, 'g': 1, 'h': 1, 'i': 1,
     'j': 1, 'k': 1, 'l': 1, 'm': 1,
     'n': 1, 'o': 1, 'p': 1, 'q': 1,
     'r': 1, 's': 1, 't': 1, 'u': 1,
     'v': 1, 'w': 1, 'x': 1, 'y': 1,
     'z': 1},
    # 4
    {'.': 24, '0': 21, '1': 21, '2': 21,
     '3': 21, '4': 21, '5': 21, '6': 21,
     '7': 21, '8': 23, '9': 23, 'B': 22,
     'E': 25, 'J': 13, 'L': 13, 'O': 20,
     'X': 19, 'b': 22, 'e': 25, 'j': 13,
     'l': 13, 'o': 20, 'x': 19},
    # 5
    {'.': 24, '0': 5, '1': 5, '2': 5,
     '3': 5, '4': 5, '5': 5, '6': 5,
     '7': 5, '8': 5, '9': 5, 'E': 25,
     'J': 13, 'L': 13, 'e': 25, 'j': 13,
     'l': 13},
    # 6
    {'0': 26, '1': 26, '2': 26, '3': 26,
     '4': 26, '5': 26, '6': 26, '7': 26,
     '8': 26, '9': 26},
    # 7
    {'*': 12, '=': 13},
    # 8
    {'=': 13, '>': 12},
    # 9
    {'<': 12, '=': 13, '>': 13},
    # 10
    {'=': 13},
    # 11
    {'/': 12, '=': 13},
    # 12
    {'=': 13},
    # 13
    {},
    # 14
    {'\n': 13},
    # 15
    {automata.DEFAULT: 30, '\n': 27,
     '\r': 27, "'": 28, '\\': 29},
    # 16
    {automata.DEFAULT: 33, '\n': 27,
     '\r': 27, '"': 31, '\\': 32},
    # 17
    {'\n': 13, '\r': 14},
    # 18
    {automata.DEFAULT: 18, '\n': 27, '\r': 27},
    # 19
    {'0': 34, '1': 34, '2': 34, '3': 34,
     '4': 34, '5': 34, '6': 34, '7': 34,
     '8': 34, '9': 34, 'A': 34, 'B': 34,
     'C': 34, 'D': 34, 'E': 34, 'F': 34,
     'a': 34, 'b': 34, 'c': 34, 'd': 34,
     'e': 34, 'f': 34},
    # 20
    {'0': 35, '1': 35, '2': 35, '3': 35,
     '4': 35, '5': 35, '6': 35, '7': 35},
    # 21
    {'.': 24, '0': 21, '1': 21, '2': 21,
     '3': 21, '4': 21, '5': 21, '6': 21,
     '7': 21, '8': 23, '9': 23, 'E': 25,
     'J': 13, 'L': 13, 'e': 25, 'j': 13,
     'l': 13},
    # 22
    {'0': 36, '1': 36},
    # 23
    {'.': 24, '0': 23, '1': 23, '2': 23,
     '3': 23, '4': 23, '5': 23, '6': 23,
     '7': 23, '8': 23, '9': 23, 'E': 25,
     'J': 13, 'e': 25, 'j': 13},
    # 24
    {'0': 24, '1': 24, '2': 24, '3': 24,
     '4': 24, '5': 24, '6': 24, '7': 24,
     '8': 24, '9': 24, 'E': 37, 'J': 13,
     'e': 37, 'j': 13},
    # 25
    {'+': 38, '-': 38, '0': 39, '1': 39,
     '2': 39, '3': 39, '4': 39, '5': 39,
     '6': 39, '7': 39, '8': 39, '9': 39},
    # 26
    {'0': 26, '1': 26, '2': 26, '3': 26,
     '4': 26, '5': 26, '6': 26, '7': 26,
     '8': 26, '9': 26, 'E': 37, 'J': 13,
     'e': 37, 'j': 13},
    # 27
    {},
    # 28
    {"'": 13},
    # 29
    {automata.DEFAULT: 40, '\n': 13, '\r': 14},
    # 30
    {automata.DEFAULT: 30, '\n': 27,
     '\r': 27, "'": 13, '\\': 29},
    # 31
    {'"': 13},
    # 32
    {automata.DEFAULT: 41, '\n': 13, '\r': 14},
    # 33
    {automata.DEFAULT: 33, '\n': 27,
     '\r': 27, '"': 13, '\\': 32},
    # 34
    {'0': 34, '1': 34, '2': 34, '3': 34,
     '4': 34, '5': 34, '6': 34, '7': 34,
     '8': 34, '9': 34, 'A': 34, 'B': 34,
     'C': 34, 'D': 34, 'E': 34, 'F': 34,
     'L': 13, 'a': 34, 'b': 34, 'c': 34,
     'd': 34, 'e': 34, 'f': 34, 'l': 13},
    # 35
    {'0': 35, '1': 35, '2': 35, '3': 35,
     '4': 35, '5': 35, '6': 35, '7': 35,
     'L': 13, 'l': 13},
    # 36
    {'0': 36, '1': 36, 'L': 13, 'l': 13},
    # 37
    {'+': 42, '-': 42, '0': 43, '1': 43,
     '2': 43, '3': 43, '4': 43, '5': 43,
     '6': 43, '7': 43, '8': 43, '9': 43},
    # 38
    {'0': 39, '1': 39, '2': 39, '3': 39,
     '4': 39, '5': 39, '6': 39, '7': 39,
     '8': 39, '9': 39},
    # 39
    {'0': 39, '1': 39, '2': 39, '3': 39,
     '4': 39, '5': 39, '6': 39, '7': 39,
     '8': 39, '9': 39, 'J': 13, 'j': 13},
    # 40
    {automata.DEFAULT: 40, '\n': 27,
     '\r': 27, "'": 13, '\\': 29},
    # 41
    {automata.DEFAULT: 41, '\n': 27,
     '\r': 27, '"': 13, '\\': 32},
    # 42
    {'0': 43, '1': 43, '2': 43, '3': 43,
     '4': 43, '5': 43, '6': 43, '7': 43,
     '8': 43, '9': 43},
    # 43
    {'0': 43, '1': 43, '2': 43, '3': 43,
     '4': 43, '5': 43, '6': 43, '7': 43,
     '8': 43, '9': 43, 'J': 13, 'j': 13},
    ]
pseudoDFA = automata.DFA(states, accepts)

# Non-greedy DFA matching the body and closing quotes of a
# triple-double-quoted string (machine generated).
accepts = [False, False, False, False, False, True]
states = [
    # 0
    {automata.DEFAULT: 0, '"': 1, '\\': 2},
    # 1
    {automata.DEFAULT: 4, '"': 3, '\\': 2},
    # 2
    {automata.DEFAULT: 4},
    # 3
    {automata.DEFAULT: 4, '"': 5, '\\': 2},
    # 4
    {automata.DEFAULT: 4, '"': 1, '\\': 2},
    # 5
    {automata.DEFAULT: 4, '"': 5, '\\': 2},
    ]
double3DFA = automata.NonGreedyDFA(states, accepts)

# Non-greedy DFA matching the body and closing quotes of a
# triple-single-quoted string (machine generated).
accepts = [False, False, False, False, False, True]
states = [
    # 0
    {automata.DEFAULT: 0, "'": 1, '\\': 2},
    # 1
    {automata.DEFAULT: 4, "'": 3, '\\': 2},
    # 2
    {automata.DEFAULT: 4},
    # 3
    {automata.DEFAULT: 4, "'": 5, '\\': 2},
    # 4
    {automata.DEFAULT: 4, "'": 1, '\\': 2},
    # 5
    {automata.DEFAULT: 4, "'": 5, '\\': 2},
    ]
single3DFA = automata.NonGreedyDFA(states, accepts)

# DFA matching the body and closing quote of a single-quoted ('...')
# string (machine generated).
accepts = [False, True, False, False]
states = [
    # 0
    {automata.DEFAULT: 0, "'": 1, '\\': 2},
    # 1
    {},
    # 2
    {automata.DEFAULT: 3},
    # 3
    {automata.DEFAULT: 3, "'": 1, '\\': 2},
    ]
singleDFA = automata.DFA(states, accepts)

# DFA matching the body and closing quote of a double-quoted ("...")
# string (machine generated).
accepts = [False, True, False, False]
states = [
    # 0
    {automata.DEFAULT: 0, '"': 1, '\\': 2},
    # 1
    {},
    # 2
    {automata.DEFAULT: 3},
    # 3
    {automata.DEFAULT: 3, '"': 1, '\\': 2},
    ]
doubleDFA = automata.DFA(states, accepts)

#_______________________________________________________________________
# End of automatically generated DFA's

# Maps a string-opening prefix/quote to the DFA that finds its end.
# Bare prefix letters map to None: they do not open a string by themselves.
endDFAs = {"'" : singleDFA,
           '"' : doubleDFA,
           'r' : None,
           'R' : None,
           'u' : None,
           'U' : None,
           'b' : None,
           'B' : None}

# Register every unicode/bytes x raw prefix combination for triple quotes.
for uniPrefix in ("", "u", "U", "b", "B"):
    for rawPrefix in ("", "r", "R"):
        prefix = uniPrefix + rawPrefix
        endDFAs[prefix + "'''"] = single3DFA
        endDFAs[prefix + '"""'] = double3DFA

# Single-state DFA consuming runs of tab, space and formfeed.
whiteSpaceStatesAccepts = [True]
whiteSpaceStates = [{'\t': 0, ' ': 0, '\x0c': 0}]
whiteSpaceDFA = automata.DFA(whiteSpaceStates, whiteSpaceStatesAccepts)

# ______________________________________________________________________
# COPIED:

# Lookup tables of every recognised string-opening token: each key is a
# prefix (possibly empty) followed by its quote characters, mapped to itself.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

# Number of columns a tab advances to the next multiple of.
tabsize = 8

# PYPY MODIFICATION: removed TokenError class as it's not needed here

# PYPY MODIFICATION: removed StopTokenizing class as it's not needed here

# PYPY MODIFICATION: removed printtoken() as it's not needed here

# PYPY MODIFICATION: removed tokenize() as it's not needed here

# PYPY MODIFICATION: removed tokenize_loop() as it's not needed here

# PYPY MODIFICATION: removed generate_tokens() as it was copied / modified
#                    in pythonlexer.py

# PYPY MODIFICATION: removed main() as it's not needed here

# ______________________________________________________________________
# End of pytokenize.py