# ______________________________________________________________________
"""Module pytokenize

THIS FILE WAS COPIED FROM pypy/module/parser/pytokenize.py AND ADAPTED
TO BE ANNOTABLE (Mainly made lists homogeneous)

This is a modified version of Ka-Ping Yee's tokenize module found in the
Python standard library.

The primary modification is the removal of the tokenizer's dependence on the
standard Python regular expression module, which is written in C. The regular
expressions have been replaced with hand built DFA's using the
basil.util.automata module.

$Id: pytokenize.py,v 1.3 2003/10/03 16:31:53 jriehl Exp $
"""
# ______________________________________________________________________

from pyparser import automata

__all__ = [ "tokenize" ]

# ______________________________________________________________________
# Automatically generated DFA's

accepts = [True, True, True, True, True, True, True, True,
           True, True, False, True, True, True, True, False,
           False, False, True, False, False, True, False,
           False, True, False, True, False, True, False,
           False, True, False, False, True, True, True,
           False, False, True, False, False, False, True]
states = [
    # 0
    {'\t': 0, '\n': 13, '\x0c': 0,
     '\r': 14, ' ': 0, '!': 10, '"': 16,
     '#': 18, '%': 12, '&': 12, "'": 15,
     '(': 13, ')': 13, '*': 7, '+': 12,
     ',': 13, '-': 12, '.': 6, '/': 11,
     '0': 4, '1': 5, '2': 5, '3': 5,
     '4': 5, '5': 5, '6': 5, '7': 5,
     '8': 5, '9': 5, ':': 13, ';': 13,
     '<': 9, '=': 12, '>': 8, '@': 13,
     'A': 1, 'B': 2, 'C': 1, 'D': 1,
     'E': 1, 'F': 1, 'G': 1, 'H': 1,
     'I': 1, 'J': 1, 'K': 1, 'L': 1,
     'M': 1, 'N': 1, 'O': 1, 'P': 1,
     'Q': 1, 'R': 3, 'S': 1, 'T': 1,
     'U': 2, 'V': 1, 'W': 1, 'X': 1,
     'Y': 1, 'Z': 1, '[': 13, '\\': 17,
     ']': 13, '^': 12, '_': 1, '`': 13,
     'a': 1, 'b': 2, 'c': 1, 'd': 1,
     'e': 1, 'f': 1, 'g': 1, 'h': 1,
     'i': 1, 'j': 1, 'k': 1, 'l': 1,
     'm': 1, 'n': 1, 'o': 1, 'p': 1,
     'q': 1, 'r': 3, 's': 1, 't': 1,
     'u': 2, 'v': 1, 'w': 1, 'x': 1,
     'y': 1, 'z': 1, '{': 13, '|': 12,
     '}': 13, '~': 13},
    # 1
    {'0': 1, '1': 1, '2': 1, '3': 1,
     '4': 1, '5': 1, '6': 1, '7': 1,
     '8': 1, '9': 1, 'A': 1, 'B': 1,
     'C': 1, 'D': 1, 'E': 1, 'F': 1,
     'G': 1, 'H': 1, 'I': 1, 'J': 1,
     'K': 1, 'L': 1, 'M': 1, 'N': 1,
     'O': 1, 'P': 1, 'Q': 1, 'R': 1,
     'S': 1, 'T': 1, 'U': 1, 'V': 1,
     'W': 1, 'X': 1, 'Y': 1, 'Z': 1,
     '_': 1, 'a': 1, 'b': 1, 'c': 1,
     'd': 1, 'e': 1, 'f': 1, 'g': 1,
     'h': 1, 'i': 1, 'j': 1, 'k': 1,
     'l': 1, 'm': 1, 'n': 1, 'o': 1,
     'p': 1, 'q': 1, 'r': 1, 's': 1,
     't': 1, 'u': 1, 'v': 1, 'w': 1,
     'x': 1, 'y': 1, 'z': 1},
    # 2
    {'"': 16, "'": 15, '0': 1, '1': 1,
     '2': 1, '3': 1, '4': 1, '5': 1,
     '6': 1, '7': 1, '8': 1, '9': 1,
     'A': 1, 'B': 1, 'C': 1, 'D': 1,
     'E': 1, 'F': 1, 'G': 1, 'H': 1,
     'I': 1, 'J': 1, 'K': 1, 'L': 1,
     'M': 1, 'N': 1, 'O': 1, 'P': 1,
     'Q': 1, 'R': 3, 'S': 1, 'T': 1,
     'U': 1, 'V': 1, 'W': 1, 'X': 1,
     'Y': 1, 'Z': 1, '_': 1, 'a': 1,
     'b': 1, 'c': 1, 'd': 1, 'e': 1,
     'f': 1, 'g': 1, 'h': 1, 'i': 1,
     'j': 1, 'k': 1, 'l': 1, 'm': 1,
     'n': 1, 'o': 1, 'p': 1, 'q': 1,
     'r': 3, 's': 1, 't': 1, 'u': 1,
     'v': 1, 'w': 1, 'x': 1, 'y': 1,
     'z': 1},
    # 3
    {'"': 16, "'": 15, '0': 1, '1': 1,
     '2': 1, '3': 1, '4': 1, '5': 1,
     '6': 1, '7': 1, '8': 1, '9': 1,
     'A': 1, 'B': 1, 'C': 1, 'D': 1,
     'E': 1, 'F': 1, 'G': 1, 'H': 1,
     'I': 1, 'J': 1, 'K': 1, 'L': 1,
     'M': 1, 'N': 1, 'O': 1, 'P': 1,
     'Q': 1, 'R': 1, 'S': 1, 'T': 1,
     'U': 1, 'V': 1, 'W': 1, 'X': 1,
     'Y': 1, 'Z': 1, '_': 1, 'a': 1,
     'b': 1, 'c': 1, 'd': 1, 'e': 1,
     'f': 1, 'g': 1, 'h': 1, 'i': 1,
     'j': 1, 'k': 1, 'l': 1, 'm': 1,
     'n': 1, 'o': 1, 'p': 1, 'q': 1,
     'r': 1, 's': 1, 't': 1, 'u': 1,
     'v': 1, 'w': 1, 'x': 1, 'y': 1,
     'z': 1},
    # 4
    {'.': 24, '0': 21, '1': 21, '2': 21,
     '3': 21, '4': 21, '5': 21, '6': 21,
     '7': 21, '8': 23, '9': 23, 'B': 22,
     'E': 25, 'J': 13, 'L': 13, 'O': 20,
     'X': 19, 'b': 22, 'e': 25, 'j': 13,
     'l': 13, 'o': 20, 'x': 19},
    # 5
    {'.': 24, '0': 5, '1': 5, '2': 5,
     '3': 5, '4': 5, '5': 5, '6': 5,
     '7': 5, '8': 5, '9': 5, 'E': 25,
     'J': 13, 'L': 13, 'e': 25, 'j': 13,
     'l': 13},
    # 6
    {'0': 26, '1': 26, '2': 26, '3': 26,
     '4': 26, '5': 26, '6': 26, '7': 26,
     '8': 26, '9': 26},
    # 7
    {'*': 12, '=': 13},
    # 8
    {'=': 13, '>': 12},
    # 9
    {'<': 12, '=': 13, '>': 13},
    # 10
    {'=': 13},
    # 11
    {'/': 12, '=': 13},
    # 12
    {'=': 13},
    # 13
    {},
    # 14
    {'\n': 13},
    # 15
    {automata.DEFAULT: 30, '\n': 27,
     '\r': 27, "'": 28, '\\': 29},
    # 16
    {automata.DEFAULT: 33, '\n': 27,
     '\r': 27, '"': 31, '\\': 32},
    # 17
    {'\n': 13, '\r': 14},
    # 18
    {automata.DEFAULT: 18, '\n': 27, '\r': 27},
    # 19
    {'0': 34, '1': 34, '2': 34, '3': 34,
     '4': 34, '5': 34, '6': 34, '7': 34,
     '8': 34, '9': 34, 'A': 34, 'B': 34,
     'C': 34, 'D': 34, 'E': 34, 'F': 34,
     'a': 34, 'b': 34, 'c': 34, 'd': 34,
     'e': 34, 'f': 34},
    # 20
    {'0': 35, '1': 35, '2': 35, '3': 35,
     '4': 35, '5': 35, '6': 35, '7': 35},
    # 21
    {'.': 24, '0': 21, '1': 21, '2': 21,
     '3': 21, '4': 21, '5': 21, '6': 21,
     '7': 21, '8': 23, '9': 23, 'E': 25,
     'J': 13, 'L': 13, 'e': 25, 'j': 13,
     'l': 13},
    # 22
    {'0': 36, '1': 36},
    # 23
    {'.': 24, '0': 23, '1': 23, '2': 23,
     '3': 23, '4': 23, '5': 23, '6': 23,
     '7': 23, '8': 23, '9': 23, 'E': 25,
     'J': 13, 'e': 25, 'j': 13},
    # 24
    {'0': 24, '1': 24, '2': 24, '3': 24,
     '4': 24, '5': 24, '6': 24, '7': 24,
     '8': 24, '9': 24, 'E': 37, 'J': 13,
     'e': 37, 'j': 13},
    # 25
    {'+': 38, '-': 38, '0': 39, '1': 39,
     '2': 39, '3': 39, '4': 39, '5': 39,
     '6': 39, '7': 39, '8': 39, '9': 39},
    # 26
    {'0': 26, '1': 26, '2': 26, '3': 26,
     '4': 26, '5': 26, '6': 26, '7': 26,
     '8': 26, '9': 26, 'E': 37, 'J': 13,
     'e': 37, 'j': 13},
    # 27
    {},
    # 28
    {"'": 13},
    # 29
    {automata.DEFAULT: 40, '\n': 13, '\r': 14},
    # 30
    {automata.DEFAULT: 30, '\n': 27,
     '\r': 27, "'": 13, '\\': 29},
    # 31
    {'"': 13},
    # 32
    {automata.DEFAULT: 41, '\n': 13, '\r': 14},
    # 33
    {automata.DEFAULT: 33, '\n': 27,
     '\r': 27, '"': 13, '\\': 32},
    # 34
    {'0': 34, '1': 34, '2': 34, '3': 34,
     '4': 34, '5': 34, '6': 34, '7': 34,
     '8': 34, '9': 34, 'A': 34, 'B': 34,
     'C': 34, 'D': 34, 'E': 34, 'F': 34,
     'L': 13, 'a': 34, 'b': 34, 'c': 34,
     'd': 34, 'e': 34, 'f': 34, 'l': 13},
    # 35
    {'0': 35, '1': 35, '2': 35, '3': 35,
     '4': 35, '5': 35, '6': 35, '7': 35,
     'L': 13, 'l': 13},
    # 36
    {'0': 36, '1': 36, 'L': 13, 'l': 13},
    # 37
    {'+': 42, '-': 42, '0': 43, '1': 43,
     '2': 43, '3': 43, '4': 43, '5': 43,
     '6': 43, '7': 43, '8': 43, '9': 43},
    # 38
    {'0': 39, '1': 39, '2': 39, '3': 39,
     '4': 39, '5': 39, '6': 39, '7': 39,
     '8': 39, '9': 39},
    # 39
    {'0': 39, '1': 39, '2': 39, '3': 39,
     '4': 39, '5': 39, '6': 39, '7': 39,
     '8': 39, '9': 39, 'J': 13, 'j': 13},
    # 40
    {automata.DEFAULT: 40, '\n': 27,
     '\r': 27, "'": 13, '\\': 29},
    # 41
    {automata.DEFAULT: 41, '\n': 27,
     '\r': 27, '"': 13, '\\': 32},
    # 42
    {'0': 43, '1': 43, '2': 43, '3': 43,
     '4': 43, '5': 43, '6': 43, '7': 43,
     '8': 43, '9': 43},
    # 43
    {'0': 43, '1': 43, '2': 43, '3': 43,
     '4': 43, '5': 43, '6': 43, '7': 43,
     '8': 43, '9': 43, 'J': 13, 'j': 13},
    ]
pseudoDFA = automata.DFA(states, accepts)

accepts = [False, False, False, False, False, True]
states = [
    # 0
    {automata.DEFAULT: 0, '"': 1, '\\': 2},
    # 1
    {automata.DEFAULT: 4, '"': 3, '\\': 2},
    # 2
    {automata.DEFAULT: 4},
    # 3
    {automata.DEFAULT: 4, '"': 5, '\\': 2},
    # 4
    {automata.DEFAULT: 4, '"': 1, '\\': 2},
    # 5
    {automata.DEFAULT: 4, '"': 5, '\\': 2},
    ]
double3DFA = automata.NonGreedyDFA(states, accepts)

accepts = [False, False, False, False, False, True]
states = [
    # 0
    {automata.DEFAULT: 0, "'": 1, '\\': 2},
    # 1
    {automata.DEFAULT: 4, "'": 3, '\\': 2},
    # 2
    {automata.DEFAULT: 4},
    # 3
    {automata.DEFAULT: 4, "'": 5, '\\': 2},
    # 4
    {automata.DEFAULT: 4, "'": 1, '\\': 2},
    # 5
    {automata.DEFAULT: 4, "'": 5, '\\': 2},
    ]
single3DFA = automata.NonGreedyDFA(states, accepts)

accepts = [False, True, False, False]
states = [
    # 0
    {automata.DEFAULT: 0, "'": 1, '\\': 2},
    # 1
    {},
    # 2
    {automata.DEFAULT: 3},
    # 3
    {automata.DEFAULT: 3, "'": 1, '\\': 2},
    ]
singleDFA = automata.DFA(states, accepts)

accepts = [False, True, False, False]
states = [
    # 0
    {automata.DEFAULT: 0, '"': 1, '\\': 2},
    # 1
    {},
    # 2
    {automata.DEFAULT: 3},
    # 3
    {automata.DEFAULT: 3, '"': 1, '\\': 2},
    ]
doubleDFA = automata.DFA(states, accepts)

#_______________________________________________________________________
# End of automatically generated DFA's

endDFAs = {"'" : singleDFA,
           '"' : doubleDFA,
           'r' : None,
           'R' : None,
           'u' : None,
           'U' : None,
           'b' : None,
           'B' : None}

for uniPrefix in ("", "u", "U", "b", "B"):
    for rawPrefix in ("", "r", "R"):
        prefix = uniPrefix + rawPrefix
        endDFAs[prefix + "'''"] = single3DFA
        endDFAs[prefix + '"""'] = double3DFA

whiteSpaceStatesAccepts = [True]
whiteSpaceStates = [{'\t': 0, ' ': 0, '\x0c': 0}]
whiteSpaceDFA = automata.DFA(whiteSpaceStates, whiteSpaceStatesAccepts)

# ______________________________________________________________________
# COPIED:

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

# PYPY MODIFICATION: removed TokenError class as it's not needed here

# PYPY MODIFICATION: removed StopTokenizing class as it's not needed here

# PYPY MODIFICATION: removed printtoken() as it's not needed here

# PYPY MODIFICATION: removed tokenize() as it's not needed here

# PYPY MODIFICATION: removed tokenize_loop() as it's not needed here

# PYPY MODIFICATION: removed generate_tokens() as it was copied / modified
# in pythonlexer.py

# PYPY MODIFICATION: removed main() as it's not needed here

# ______________________________________________________________________
# End of pytokenize.py
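# ______________________________________________________________________
# Illustrative sketch (not part of the original module).  The tables above
# are plain per-state dicts mapping an input character to the next state
# index, paired with an `accepts` list that marks accepting states; the
# lexer that consumes them (see pythonlexer.py) finds the longest accepted
# prefix of the input and, for string tokens, looks up the matched quote
# prefix in `endDFAs` to pick the DFA that scans to the closing quote.
# The helper below is a hypothetical, self-contained demonstration of that
# longest-match walk over such tables; it ignores automata.DEFAULT
# fallback transitions and does not use or mirror the real automata.DFA
# API.

def _example_longest_match(example_states, example_accepts, text, pos=0):
    """Return the end index of the longest accepted prefix of text[pos:],
    or -1 if no prefix is accepted.  Purely illustrative."""
    state = 0
    end = pos
    last_accept_end = pos if example_accepts[0] else -1
    while end < len(text):
        ch = text[end]
        if ch not in example_states[state]:
            break                      # no transition: stop scanning
        state = example_states[state][ch]
        end += 1
        if example_accepts[state]:
            last_accept_end = end      # remember the last accepting point
    return last_accept_end

if __name__ == '__main__':
    # Tiny demo table recognizing a run of decimal digits (hypothetical,
    # much smaller than the generated tables above).
    demo_states = [dict.fromkeys('0123456789', 1),
                   dict.fromkeys('0123456789', 1)]
    demo_accepts = [False, True]
    assert _example_longest_match(demo_states, demo_accepts, "123abc") == 3
    assert _example_longest_match(demo_states, demo_accepts, "abc") == -1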