Lichen

pyparser/pytokenize.py

changeset 660:fc5943513f3a
2017-03-05, Paul Boddie: Removed superfluous __TEST macro.
# ______________________________________________________________________
"""Module pytokenize

THIS FILE WAS COPIED FROM pypy/module/parser/pytokenize.py AND ADAPTED
TO BE ANNOTABLE (Mainly made lists homogeneous)

This is a modified version of Ka-Ping Yee's tokenize module found in the
Python standard library.

The primary modification is the removal of the tokenizer's dependence on the
standard Python regular expression module, which is written in C.  The regular
expressions have been replaced with hand built DFA's using the
basil.util.automata module.

$Id: pytokenize.py,v 1.3 2003/10/03 16:31:53 jriehl Exp $
"""
# ______________________________________________________________________

from pyparser import automata

__all__ = [ "tokenize" ]

# ______________________________________________________________________
# Automatically generated DFA's

accepts = [True, True, True, True, True, True, True, True,
           True, True, False, True, True, True, True, False,
           False, False, True, False, False, True, False,
           False, True, False, True, False, True, False,
           False, True, False, False, True, True, True,
           False, False, True, False, False, False, True]
states = [
    # 0
    {'\t': 0, '\n': 13, '\x0c': 0,
     '\r': 14, ' ': 0, '!': 10, '"': 16,
     '#': 18, '%': 12, '&': 12, "'": 15,
     '(': 13, ')': 13, '*': 7, '+': 12,
     ',': 13, '-': 12, '.': 6, '/': 11,
     '0': 4, '1': 5, '2': 5, '3': 5,
     '4': 5, '5': 5, '6': 5, '7': 5,
     '8': 5, '9': 5, ':': 13, ';': 13,
     '<': 9, '=': 12, '>': 8, '@': 13,
     'A': 1, 'B': 2, 'C': 1, 'D': 1,
     'E': 1, 'F': 1, 'G': 1, 'H': 1,
     'I': 1, 'J': 1, 'K': 1, 'L': 1,
     'M': 1, 'N': 1, 'O': 1, 'P': 1,
     'Q': 1, 'R': 3, 'S': 1, 'T': 1,
     'U': 2, 'V': 1, 'W': 1, 'X': 1,
     'Y': 1, 'Z': 1, '[': 13, '\\': 17,
     ']': 13, '^': 12, '_': 1, '`': 13,
     'a': 1, 'b': 2, 'c': 1, 'd': 1,
     'e': 1, 'f': 1, 'g': 1, 'h': 1,
     'i': 1, 'j': 1, 'k': 1, 'l': 1,
     'm': 1, 'n': 1, 'o': 1, 'p': 1,
     'q': 1, 'r': 3, 's': 1, 't': 1,
     'u': 2, 'v': 1, 'w': 1, 'x': 1,
     'y': 1, 'z': 1, '{': 13, '|': 12,
     '}': 13, '~': 13},
    # 1
    {'0': 1, '1': 1, '2': 1, '3': 1,
     '4': 1, '5': 1, '6': 1, '7': 1,
     '8': 1, '9': 1, 'A': 1, 'B': 1,
     'C': 1, 'D': 1, 'E': 1, 'F': 1,
     'G': 1, 'H': 1, 'I': 1, 'J': 1,
     'K': 1, 'L': 1, 'M': 1, 'N': 1,
     'O': 1, 'P': 1, 'Q': 1, 'R': 1,
     'S': 1, 'T': 1, 'U': 1, 'V': 1,
     'W': 1, 'X': 1, 'Y': 1, 'Z': 1,
     '_': 1, 'a': 1, 'b': 1, 'c': 1,
     'd': 1, 'e': 1, 'f': 1, 'g': 1,
     'h': 1, 'i': 1, 'j': 1, 'k': 1,
     'l': 1, 'm': 1, 'n': 1, 'o': 1,
     'p': 1, 'q': 1, 'r': 1, 's': 1,
     't': 1, 'u': 1, 'v': 1, 'w': 1,
     'x': 1, 'y': 1, 'z': 1},
    # 2
    {'"': 16, "'": 15, '0': 1, '1': 1,
     '2': 1, '3': 1, '4': 1, '5': 1,
     '6': 1, '7': 1, '8': 1, '9': 1,
     'A': 1, 'B': 1, 'C': 1, 'D': 1,
     'E': 1, 'F': 1, 'G': 1, 'H': 1,
     'I': 1, 'J': 1, 'K': 1, 'L': 1,
     'M': 1, 'N': 1, 'O': 1, 'P': 1,
     'Q': 1, 'R': 3, 'S': 1, 'T': 1,
     'U': 1, 'V': 1, 'W': 1, 'X': 1,
     'Y': 1, 'Z': 1, '_': 1, 'a': 1,
     'b': 1, 'c': 1, 'd': 1, 'e': 1,
     'f': 1, 'g': 1, 'h': 1, 'i': 1,
     'j': 1, 'k': 1, 'l': 1, 'm': 1,
     'n': 1, 'o': 1, 'p': 1, 'q': 1,
     'r': 3, 's': 1, 't': 1, 'u': 1,
     'v': 1, 'w': 1, 'x': 1, 'y': 1,
     'z': 1},
    # 3
    {'"': 16, "'": 15, '0': 1, '1': 1,
     '2': 1, '3': 1, '4': 1, '5': 1,
     '6': 1, '7': 1, '8': 1, '9': 1,
     'A': 1, 'B': 1, 'C': 1, 'D': 1,
     'E': 1, 'F': 1, 'G': 1, 'H': 1,
     'I': 1, 'J': 1, 'K': 1, 'L': 1,
     'M': 1, 'N': 1, 'O': 1, 'P': 1,
     'Q': 1, 'R': 1, 'S': 1, 'T': 1,
     'U': 1, 'V': 1, 'W': 1, 'X': 1,
     'Y': 1, 'Z': 1, '_': 1, 'a': 1,
     'b': 1, 'c': 1, 'd': 1, 'e': 1,
     'f': 1, 'g': 1, 'h': 1, 'i': 1,
     'j': 1, 'k': 1, 'l': 1, 'm': 1,
     'n': 1, 'o': 1, 'p': 1, 'q': 1,
     'r': 1, 's': 1, 't': 1, 'u': 1,
     'v': 1, 'w': 1, 'x': 1, 'y': 1,
     'z': 1},
    # 4
    {'.': 24, '0': 21, '1': 21, '2': 21,
     '3': 21, '4': 21, '5': 21, '6': 21,
     '7': 21, '8': 23, '9': 23, 'B': 22,
     'E': 25, 'J': 13, 'L': 13, 'O': 20,
     'X': 19, 'b': 22, 'e': 25, 'j': 13,
     'l': 13, 'o': 20, 'x': 19},
    # 5
    {'.': 24, '0': 5, '1': 5, '2': 5,
     '3': 5, '4': 5, '5': 5, '6': 5,
     '7': 5, '8': 5, '9': 5, 'E': 25,
     'J': 13, 'L': 13, 'e': 25, 'j': 13,
     'l': 13},
    # 6
    {'0': 26, '1': 26, '2': 26, '3': 26,
     '4': 26, '5': 26, '6': 26, '7': 26,
     '8': 26, '9': 26},
    # 7
    {'*': 12, '=': 13},
    # 8
    {'=': 13, '>': 12},
    # 9
    {'<': 12, '=': 13, '>': 13},
    # 10
    {'=': 13},
    # 11
    {'/': 12, '=': 13},
    # 12
    {'=': 13},
    # 13
    {},
    # 14
    {'\n': 13},
    # 15
    {automata.DEFAULT: 30, '\n': 27,
     '\r': 27, "'": 28, '\\': 29},
    # 16
    {automata.DEFAULT: 33, '\n': 27,
     '\r': 27, '"': 31, '\\': 32},
    # 17
    {'\n': 13, '\r': 14},
    # 18
    {automata.DEFAULT: 18, '\n': 27, '\r': 27},
    # 19
    {'0': 34, '1': 34, '2': 34, '3': 34,
     '4': 34, '5': 34, '6': 34, '7': 34,
     '8': 34, '9': 34, 'A': 34, 'B': 34,
     'C': 34, 'D': 34, 'E': 34, 'F': 34,
     'a': 34, 'b': 34, 'c': 34, 'd': 34,
     'e': 34, 'f': 34},
    # 20
    {'0': 35, '1': 35, '2': 35, '3': 35,
     '4': 35, '5': 35, '6': 35, '7': 35},
    # 21
    {'.': 24, '0': 21, '1': 21, '2': 21,
     '3': 21, '4': 21, '5': 21, '6': 21,
     '7': 21, '8': 23, '9': 23, 'E': 25,
     'J': 13, 'L': 13, 'e': 25, 'j': 13,
     'l': 13},
    # 22
    {'0': 36, '1': 36},
    # 23
    {'.': 24, '0': 23, '1': 23, '2': 23,
     '3': 23, '4': 23, '5': 23, '6': 23,
     '7': 23, '8': 23, '9': 23, 'E': 25,
     'J': 13, 'e': 25, 'j': 13},
    # 24
    {'0': 24, '1': 24, '2': 24, '3': 24,
     '4': 24, '5': 24, '6': 24, '7': 24,
     '8': 24, '9': 24, 'E': 37, 'J': 13,
     'e': 37, 'j': 13},
    # 25
    {'+': 38, '-': 38, '0': 39, '1': 39,
     '2': 39, '3': 39, '4': 39, '5': 39,
     '6': 39, '7': 39, '8': 39, '9': 39},
    # 26
    {'0': 26, '1': 26, '2': 26, '3': 26,
     '4': 26, '5': 26, '6': 26, '7': 26,
     '8': 26, '9': 26, 'E': 37, 'J': 13,
     'e': 37, 'j': 13},
    # 27
    {},
    # 28
    {"'": 13},
    # 29
    {automata.DEFAULT: 40, '\n': 13, '\r': 14},
    # 30
    {automata.DEFAULT: 30, '\n': 27,
     '\r': 27, "'": 13, '\\': 29},
    # 31
    {'"': 13},
    # 32
    {automata.DEFAULT: 41, '\n': 13, '\r': 14},
    # 33
    {automata.DEFAULT: 33, '\n': 27,
     '\r': 27, '"': 13, '\\': 32},
    # 34
    {'0': 34, '1': 34, '2': 34, '3': 34,
     '4': 34, '5': 34, '6': 34, '7': 34,
     '8': 34, '9': 34, 'A': 34, 'B': 34,
     'C': 34, 'D': 34, 'E': 34, 'F': 34,
     'L': 13, 'a': 34, 'b': 34, 'c': 34,
     'd': 34, 'e': 34, 'f': 34, 'l': 13},
    # 35
    {'0': 35, '1': 35, '2': 35, '3': 35,
     '4': 35, '5': 35, '6': 35, '7': 35,
     'L': 13, 'l': 13},
    # 36
    {'0': 36, '1': 36, 'L': 13, 'l': 13},
    # 37
    {'+': 42, '-': 42, '0': 43, '1': 43,
     '2': 43, '3': 43, '4': 43, '5': 43,
     '6': 43, '7': 43, '8': 43, '9': 43},
    # 38
    {'0': 39, '1': 39, '2': 39, '3': 39,
     '4': 39, '5': 39, '6': 39, '7': 39,
     '8': 39, '9': 39},
    # 39
    {'0': 39, '1': 39, '2': 39, '3': 39,
     '4': 39, '5': 39, '6': 39, '7': 39,
     '8': 39, '9': 39, 'J': 13, 'j': 13},
    # 40
    {automata.DEFAULT: 40, '\n': 27,
     '\r': 27, "'": 13, '\\': 29},
    # 41
    {automata.DEFAULT: 41, '\n': 27,
     '\r': 27, '"': 13, '\\': 32},
    # 42
    {'0': 43, '1': 43, '2': 43, '3': 43,
     '4': 43, '5': 43, '6': 43, '7': 43,
     '8': 43, '9': 43},
    # 43
    {'0': 43, '1': 43, '2': 43, '3': 43,
     '4': 43, '5': 43, '6': 43, '7': 43,
     '8': 43, '9': 43, 'J': 13, 'j': 13},
    ]
pseudoDFA = automata.DFA(states, accepts)

accepts = [False, False, False, False, False, True]
states = [
    # 0
    {automata.DEFAULT: 0, '"': 1, '\\': 2},
    # 1
    {automata.DEFAULT: 4, '"': 3, '\\': 2},
    # 2
    {automata.DEFAULT: 4},
    # 3
    {automata.DEFAULT: 4, '"': 5, '\\': 2},
    # 4
    {automata.DEFAULT: 4, '"': 1, '\\': 2},
    # 5
    {automata.DEFAULT: 4, '"': 5, '\\': 2},
    ]
double3DFA = automata.NonGreedyDFA(states, accepts)

accepts = [False, False, False, False, False, True]
states = [
    # 0
    {automata.DEFAULT: 0, "'": 1, '\\': 2},
    # 1
    {automata.DEFAULT: 4, "'": 3, '\\': 2},
    # 2
    {automata.DEFAULT: 4},
    # 3
    {automata.DEFAULT: 4, "'": 5, '\\': 2},
    # 4
    {automata.DEFAULT: 4, "'": 1, '\\': 2},
    # 5
    {automata.DEFAULT: 4, "'": 5, '\\': 2},
    ]
single3DFA = automata.NonGreedyDFA(states, accepts)

accepts = [False, True, False, False]
states = [
    # 0
    {automata.DEFAULT: 0, "'": 1, '\\': 2},
    # 1
    {},
    # 2
    {automata.DEFAULT: 3},
    # 3
    {automata.DEFAULT: 3, "'": 1, '\\': 2},
    ]
singleDFA = automata.DFA(states, accepts)

accepts = [False, True, False, False]
states = [
    # 0
    {automata.DEFAULT: 0, '"': 1, '\\': 2},
    # 1
    {},
    # 2
    {automata.DEFAULT: 3},
    # 3
    {automata.DEFAULT: 3, '"': 1, '\\': 2},
    ]
doubleDFA = automata.DFA(states, accepts)

#_______________________________________________________________________
# End of automatically generated DFA's

endDFAs = {"'" : singleDFA,
           '"' : doubleDFA,
           'r' : None,
           'R' : None,
           'u' : None,
           'U' : None,
           'b' : None,
           'B' : None}

for uniPrefix in ("", "u", "U", "b", "B"):
    for rawPrefix in ("", "r", "R"):
        prefix = uniPrefix + rawPrefix
        endDFAs[prefix + "'''"] = single3DFA
        endDFAs[prefix + '"""'] = double3DFA

whiteSpaceStatesAccepts = [True]
whiteSpaceStates = [{'\t': 0, ' ': 0, '\x0c': 0}]
whiteSpaceDFA = automata.DFA(whiteSpaceStates, whiteSpaceStatesAccepts)

# ______________________________________________________________________
# COPIED:

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

# PYPY MODIFICATION: removed TokenError class as it's not needed here

# PYPY MODIFICATION: removed StopTokenizing class as it's not needed here

# PYPY MODIFICATION: removed printtoken() as it's not needed here

# PYPY MODIFICATION: removed tokenize() as it's not needed here

# PYPY MODIFICATION: removed tokenize_loop() as it's not needed here

# PYPY MODIFICATION: removed generate_tokens() as it was copied / modified
#                    in pythonlexer.py

# PYPY MODIFICATION: removed main() as it's not needed here

# ______________________________________________________________________
# End of pytokenize.py
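
# ______________________________________________________________________
# Editorial note (not part of the original module): the tables above are
# plain dict-per-state transition maps keyed by input character, with
# accepts[i] marking whether state i is accepting.  A minimal sketch of how
# such a table can be driven over input is given by the hypothetical
# longest_match() helper below.  It returns the end position of the longest
# accepted prefix of `text` starting at `pos`, or -1 if nothing matches.
# The real matching is performed by the pyparser.automata classes; this
# walker only illustrates the idea and ignores automata.DEFAULT fallback
# transitions for brevity.

def longest_match(states, accepts, text, pos=0):
    state = 0
    last_accept = pos if accepts[0] else -1
    for i in range(pos, len(text)):
        transitions = states[state]
        char = text[i]
        if char not in transitions:
            break                       # no transition: stop scanning
        state = transitions[char]
        if accepts[state]:
            last_accept = i + 1         # remember the longest accepted prefix
    return last_accept

# For example, driving this walker over "123.4e-5 " with the pseudo-token
# tables above would consume the whole float literal and stop at the space.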
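
# ______________________________________________________________________
# Editorial note (not part of the original module): a sketch of how the
# prefix tables above might be consulted.  Once a tokenizer has matched an
# opening quote, possibly preceded by an r/R/u/U/b/B prefix, the token is
# looked up to choose the DFA that finds the end of the string literal.
# The select_string_dfa() helper below is hypothetical; the actual logic
# lives in the adapted generate_tokens() in pythonlexer.py, as noted in the
# modification comments above.

def select_string_dfa(token):
    if token in triple_quoted:
        return endDFAs[token]           # single3DFA or double3DFA
    if token in single_quoted:
        return endDFAs[token[-1]]       # singleDFA or doubleDFA
    return None                         # not a string opener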