#! /usr/bin/env python
"""Module genPytokenize

Generates finite state automata for recognizing Python tokens.  These are hand
coded versions of the regular expressions originally appearing in Ping's
tokenize module in the Python standard library.

When run from the command line, this should pretty print the DFA machinery.

$Id: genPytokenize.py,v 1.1 2003/10/02 17:37:17 jriehl Exp $
"""

from pyparser.pylexer import *
from pyparser.automata import NonGreedyDFA, DFA, DEFAULT

# Reading aid for the builders below.  The combinators come from the star
# import of pyparser.pylexer; the comments in this file assume they mean what
# their names suggest (chain = sequence, group = alternation, maybe =
# optional, any = zero or more, atleastonce = one or more) -- TODO confirm
# against pylexer.  Note that `any` is therefore the pylexer combinator, not
# the builtin.  Every combinator call appends to the shared `states` list, so
# the order in which the fragments are built is kept stable on purpose.


def makePyPseudoDFA():
    """Build the DFA for Python "pseudo tokens".

    Returns a tuple ``(dfa, dfaStates)`` where ``dfaStates`` is the first
    value returned by ``nfaToDfa`` and ``dfa`` is ``DFA(dfaStates,
    dfaAccepts)``.
    """
    import string
    states = []

    def makeEOL():
        # "\n", or "\r" with an optional "\n".
        return group(states,
                     newArcPair(states, "\n"),
                     chain(states,
                           newArcPair(states, "\r"),
                           maybe(states, newArcPair(states, "\n"))))
    # ____________________________________________________________
    def makeLineCont():
        # Backslash followed by an end of line.
        return chain(states,
                     newArcPair(states, "\\"),
                     makeEOL())
    # ____________________________________________________________
    # Ignore stuff
    def makeWhitespace():
        return any(states, groupStr(states, " \f\t"))
    # ____________________________________________________________
    def makeComment():
        return chain(states,
                     newArcPair(states, "#"),
                     any(states, notGroupStr(states, "\r\n")))
    # ____________________________________________________________
    # Not used by the pseudo-token automaton below (left disabled):
    #ignore = chain(states,
    #               makeWhitespace(),
    #               any(states, chain(states,
    #                                 makeLineCont(),
    #                                 makeWhitespace())),
    #               maybe(states, makeComment()))
    # ____________________________________________________________
    # Names
    # string.ascii_letters rather than string.letters: the latter is
    # locale-dependent in Python 2, which would make the generated automaton
    # depend on the locale of whoever runs this script.  Both have the same
    # value in the default locale.
    name = chain(states,
                 groupStr(states, string.ascii_letters + "_"),
                 any(states, groupStr(states,
                                      string.ascii_letters +
                                      string.digits + "_")))
    # ____________________________________________________________
    # Digits
    def makeDigits():
        return groupStr(states, "0123456789")
    # ____________________________________________________________
    # Integer numbers
    hexNumber = chain(states,
                      newArcPair(states, "0"),
                      groupStr(states, "xX"),
                      atleastonce(states,
                                  groupStr(states, "0123456789abcdefABCDEF")),
                      maybe(states, groupStr(states, "lL")))
    octNumber = chain(states,
                      newArcPair(states, "0"),
                      maybe(states,
                            chain(states,
                                  groupStr(states, "oO"),
                                  groupStr(states, "01234567"))),
                      any(states, groupStr(states, "01234567")),
                      maybe(states, groupStr(states, "lL")))
    binNumber = chain(states,
                      newArcPair(states, "0"),
                      groupStr(states, "bB"),
                      atleastonce(states, groupStr(states, "01")),
                      maybe(states, groupStr(states, "lL")))
    decNumber = chain(states,
                      groupStr(states, "123456789"),
                      any(states, makeDigits()),
                      maybe(states, groupStr(states, "lL")))
    intNumber = group(states, hexNumber, octNumber, binNumber, decNumber)
    # ____________________________________________________________
    # Exponents
    def makeExp():
        return chain(states,
                     groupStr(states, "eE"),
                     maybe(states, groupStr(states, "+-")),
                     atleastonce(states, makeDigits()))
    # ____________________________________________________________
    # Floating point numbers
    def makeFloat():
        pointFloat = chain(states,
                           group(states,
                                 chain(states,
                                       atleastonce(states, makeDigits()),
                                       newArcPair(states, "."),
                                       any(states, makeDigits())),
                                 chain(states,
                                       newArcPair(states, "."),
                                       atleastonce(states, makeDigits()))),
                           maybe(states, makeExp()))
        expFloat = chain(states,
                         atleastonce(states, makeDigits()),
                         makeExp())
        return group(states, pointFloat, expFloat)
    # ____________________________________________________________
    # Imaginary numbers
    imagNumber = group(states,
                       chain(states,
                             atleastonce(states, makeDigits()),
                             groupStr(states, "jJ")),
                       chain(states,
                             makeFloat(),
                             groupStr(states, "jJ")))
    # ____________________________________________________________
    # Any old number.
    number = group(states, imagNumber, makeFloat(), intNumber)
    # ____________________________________________________________
    # Funny
    operator = group(states,
                     chain(states,
                           chainStr(states, "**"),
                           maybe(states, newArcPair(states, "="))),
                     chain(states,
                           chainStr(states, ">>"),
                           maybe(states, newArcPair(states, "="))),
                     chain(states,
                           chainStr(states, "<<"),
                           maybe(states, newArcPair(states, "="))),
                     chainStr(states, "<>"),
                     chainStr(states, "!="),
                     chain(states,
                           chainStr(states, "//"),
                           maybe(states, newArcPair(states, "="))),
                     chain(states,
                           groupStr(states, "+-*/%&|^=<>"),
                           maybe(states, newArcPair(states, "="))),
                     newArcPair(states, "~"))
    bracket = groupStr(states, "[](){}")
    special = group(states,
                    makeEOL(),
                    groupStr(states, "@:;.,`"))
    funny = group(states, operator, bracket, special)
    # ____________________________________________________________
    def makeStrPrefix():
        return chain(states,
                     maybe(states, groupStr(states, "uUbB")),
                     maybe(states, groupStr(states, "rR")))
    # ____________________________________________________________
    def makeContStr(quote):
        # One-line string opened by `quote`: prefix, opening quote, body with
        # backslash escapes, then either the closing quote or a line
        # continuation.  `excluded` is the character set handed to
        # notGroupStr for the unescaped parts of the body.
        excluded = "\r\n" + quote + "\\"
        return chain(states,
                     makeStrPrefix(),
                     newArcPair(states, quote),
                     any(states,
                         notGroupStr(states, excluded)),
                     any(states,
                         chain(states,
                               newArcPair(states, "\\"),
                               newArcPair(states, DEFAULT),
                               any(states,
                                   notGroupStr(states, excluded)))),
                     group(states,
                           newArcPair(states, quote),
                           makeLineCont()))
    # Single-quoted branch is built first, then the double-quoted one.
    contStr = group(states, makeContStr("'"), makeContStr('"'))
    triple = chain(states,
                   makeStrPrefix(),
                   group(states,
                         chainStr(states, "'''"),
                         chainStr(states, '"""')))
    pseudoExtras = group(states,
                         makeLineCont(),
                         makeComment(),
                         triple)
    pseudoToken = chain(states,
                        makeWhitespace(),
                        group(states,
                              newArcPair(states, EMPTY),
                              pseudoExtras, number, funny, contStr, name))
    dfaStates, dfaAccepts = nfaToDfa(states, *pseudoToken)
    return DFA(dfaStates, dfaAccepts), dfaStates

# ______________________________________________________________________

def _makeEndDFA(quote):
    """Build the automaton for the tail of a one-quote string.

    `quote` is the closing quote character.  Returns ``(DFA, dfaStates)``.
    """
    states = []
    excluded = quote + "\\"
    nfa = chain(states,
                any(states, notGroupStr(states, excluded)),
                any(states,
                    chain(states,
                          newArcPair(states, "\\"),
                          newArcPair(states, DEFAULT),
                          any(states, notGroupStr(states, excluded)))),
                newArcPair(states, quote))
    dfaStates, dfaAccepts = nfaToDfa(states, *nfa)
    return DFA(dfaStates, dfaAccepts), dfaStates


def _makeEnd3DFA(quote):
    """Build the automaton for the tail of a triple-quoted string.

    `quote` is the quote character; the string ends at three of them in a
    row.  Returns ``(NonGreedyDFA, dfaStates)``.
    """
    states = []
    excluded = quote + "\\"
    nfa = chain(states,
                any(states, notGroupStr(states, excluded)),
                any(states,
                    chain(states,
                          group(states,
                                chain(states,
                                      newArcPair(states, "\\"),
                                      newArcPair(states, DEFAULT)),
                                chain(states,
                                      newArcPair(states, quote),
                                      notChainStr(states, quote * 2))),
                          any(states, notGroupStr(states, excluded)))),
                chainStr(states, quote * 3))
    dfaStates, dfaAccepts = nfaToDfa(states, *nfa)
    return NonGreedyDFA(dfaStates, dfaAccepts), dfaStates


def makePyEndDFAMap():
    """Build the map from string openers to their "end of string" automata.

    Returns a dict whose values are ``(dfa, dfaStates)`` tuples for the keys
    ``'`` and ``"`` and for every triple-quote opener (with each combination
    of the u/U/b/B and r/R prefixes), and ``None`` for the bare prefix
    letters.
    """
    single = _makeEndDFA("'")
    double = _makeEndDFA('"')
    single3 = _makeEnd3DFA("'")
    double3 = _makeEnd3DFA('"')
    endMap = {"'": single,
              '"': double,
              "r": None,
              "R": None,
              "u": None,
              "U": None,
              "b": None,
              "B": None}
    for uniPrefix in ("", "u", "U", "b", "B"):
        for rawPrefix in ("", "r", "R"):
            prefix = uniPrefix + rawPrefix
            endMap[prefix + "'''"] = single3
            endMap[prefix + '"""'] = double3
    return endMap

# ______________________________________________________________________

def output(name, dfa_class, dfa, states):
    """Render one automaton as Python source text.

    The returned string assigns ``accepts`` (from ``repr(dfa.accepts)``) and
    ``states`` (one dict literal per entry of `states`, each preceded by a
    comment holding its index), and ends with
    ``<name> = automata.<dfa_class>(states, accepts)``.

    NOTE(review): the indentation literals below are reproduced exactly as
    they appear in the source under review; that copy had its whitespace
    collapsed, so check them against the pristine file if the exact layout
    of the generated text matters.
    """
    import textwrap
    lines = []
    for i, line in enumerate(textwrap.wrap(repr(dfa.accepts), width=50)):
        if i == 0:
            lines.append("accepts = ")
        else:
            lines.append(" ")
        lines.append(line)
        lines.append("\n")
    lines.append("states = [\n")
    for numstate, state in enumerate(states):
        lines.append(" # ")
        lines.append(str(numstate))
        lines.append('\n')
        # '::' keeps every key/value pair free of whitespace so that wrapping
        # does not split at the separator; it becomes ': ' after wrapping.
        pairs = []
        for k, v in sorted(state.items()):
            if k == DEFAULT:
                k = "automata.DEFAULT"
            else:
                k = repr(k)
            pairs.append(k + '::' + repr(v))
        rendered = ', '.join(pairs) + '},'
        if len(state) <= 4:
            text = [rendered]
        else:
            text = textwrap.wrap(rendered, width=36)
        for i, line in enumerate(text):
            if i == 0:
                lines.append(' {')
            else:
                lines.append(' ')
            lines.append(line.replace('::', ': '))
            lines.append('\n')
    lines.append(" ]\n")
    lines.append("%s = automata.%s(states, accepts)\n" % (name, dfa_class))
    return ''.join(lines)


def main():
    """Print the generated source for the pseudo DFA and the four end DFAs."""
    # print(<one expression>) behaves the same under the Python 2 print
    # statement.
    pseudoDFA, states_pseudoDFA = makePyPseudoDFA()
    print(output("pseudoDFA", "DFA", pseudoDFA, states_pseudoDFA))
    endDFAMap = makePyEndDFAMap()
    for name, dfa_class, key in (("double3DFA", "NonGreedyDFA", '"""'),
                                 ("single3DFA", "NonGreedyDFA", "'''"),
                                 ("singleDFA", "DFA", "'"),
                                 ("doubleDFA", "DFA", "\"")):
        dfa, states = endDFAMap[key]
        print(output(name, dfa_class, dfa, states))

# ______________________________________________________________________

if __name__ == "__main__":
    main()