paul@437 | 1 | #! /usr/bin/env python |
paul@437 | 2 | """Module genPytokenize |
paul@437 | 3 | |
paul@437 | 4 | Generates finite state automata for recognizing Python tokens. These are hand |
paul@437 | 5 | coded versions of the regular expressions originally appearing in Ping's |
paul@437 | 6 | tokenize module in the Python standard library. |
paul@437 | 7 | |
paul@437 | 8 | When run from the command line, this should pretty print the DFA machinery. |
paul@437 | 9 | |
paul@437 | 10 | $Id: genPytokenize.py,v 1.1 2003/10/02 17:37:17 jriehl Exp $ |
paul@437 | 11 | """ |
paul@437 | 12 | |
paul@437 | 13 | from pyparser.pylexer import * |
paul@437 | 14 | from pyparser.automata import NonGreedyDFA, DFA, DEFAULT |
paul@437 | 15 | |
def makePyPseudoDFA():
    """Build the DFA recognising a single Python "pseudo token".

    The automaton is assembled as an NFA from the pyparser.pylexer
    combinators and then converted with nfaToDfa().  It covers optional
    leading whitespace followed by one of: nothing (EMPTY), a line
    continuation, a comment, a triple-quote opener, a number, an
    operator/bracket/special character, a one-line string, or a name.

    NOTE(review): the combinators are defined in pyparser.pylexer, which is
    not visible here.  From their names and from the regular expressions in
    the standard tokenize module they appear to mean: chain = sequence,
    group = alternatives, any = zero or more, maybe = optional,
    atleastonce = one or more -- confirm against pylexer.

    Every combinator call appends to the shared ``states`` list, so the
    order of the calls below is significant and must not be rearranged.

    Returns a pair ``(dfa, dfaStates)`` where ``dfa`` is
    ``DFA(dfaStates, dfaAccepts)`` and ``dfaStates`` is the state table
    returned by nfaToDfa().
    """
    import string
    # Use the ASCII alphabet explicitly: string.letters is locale-dependent
    # in Python 2 (and missing in Python 3), which would make the generated
    # tables depend on the environment the generator happens to run in.
    letters = string.ascii_letters
    states = []
    def makeEOL():
        # "\n", or "\r" optionally followed by "\n".
        return group(states,
                     newArcPair(states, "\n"),
                     chain(states,
                           newArcPair(states, "\r"),
                           maybe(states, newArcPair(states, "\n"))))
    # ____________________________________________________________
    def makeLineCont():
        # A backslash immediately followed by an end of line.
        return chain(states,
                     newArcPair(states, "\\"),
                     makeEOL())
    # ____________________________________________________________
    # Ignore stuff
    def makeWhitespace():
        return any(states, groupStr(states, " \f\t"))
    # ____________________________________________________________
    def makeComment():
        # "#" followed by anything up to (not including) the end of line.
        return chain(states,
                     newArcPair(states, "#"),
                     any(states, notGroupStr(states, "\r\n")))
    # ____________________________________________________________
    # Unused alternative kept for reference:
    #ignore = chain(states,
    #               makeWhitespace(),
    #               any(states, chain(states,
    #                                 makeLineCont(),
    #                                 makeWhitespace())),
    #               maybe(states, makeComment()))
    # ____________________________________________________________
    # Names
    name = chain(states,
                 groupStr(states, letters + "_"),
                 any(states, groupStr(states,
                                      letters + string.digits + "_")))
    # ____________________________________________________________
    # Digits
    def makeDigits():
        return groupStr(states, "0123456789")
    # ____________________________________________________________
    # Integer numbers
    hexNumber = chain(states,
                      newArcPair(states, "0"),
                      groupStr(states, "xX"),
                      atleastonce(states,
                                  groupStr(states, "0123456789abcdefABCDEF")),
                      maybe(states, groupStr(states, "lL")))
    # Covers both the old "0777" and the "0o777" spellings, as well as a
    # bare "0" (decNumber below requires a non-zero leading digit).
    octNumber = chain(states,
                      newArcPair(states, "0"),
                      maybe(states,
                            chain(states,
                                  groupStr(states, "oO"),
                                  groupStr(states, "01234567"))),
                      any(states, groupStr(states, "01234567")),
                      maybe(states, groupStr(states, "lL")))
    binNumber = chain(states,
                      newArcPair(states, "0"),
                      groupStr(states, "bB"),
                      atleastonce(states, groupStr(states, "01")),
                      maybe(states, groupStr(states, "lL")))
    decNumber = chain(states,
                      groupStr(states, "123456789"),
                      any(states, makeDigits()),
                      maybe(states, groupStr(states, "lL")))
    intNumber = group(states, hexNumber, octNumber, binNumber, decNumber)
    # ____________________________________________________________
    # Exponents
    def makeExp():
        return chain(states,
                     groupStr(states, "eE"),
                     maybe(states, groupStr(states, "+-")),
                     atleastonce(states, makeDigits()))
    # ____________________________________________________________
    # Floating point numbers
    def makeFloat():
        # Either "digits . [digits]" or ". digits", with optional exponent...
        pointFloat = chain(states,
                           group(states,
                                 chain(states,
                                       atleastonce(states, makeDigits()),
                                       newArcPair(states, "."),
                                       any(states, makeDigits())),
                                 chain(states,
                                       newArcPair(states, "."),
                                       atleastonce(states, makeDigits()))),
                           maybe(states, makeExp()))
        # ...or digits with a mandatory exponent.
        expFloat = chain(states,
                         atleastonce(states, makeDigits()),
                         makeExp())
        return group(states, pointFloat, expFloat)
    # ____________________________________________________________
    # Imaginary numbers
    imagNumber = group(states,
                       chain(states,
                             atleastonce(states, makeDigits()),
                             groupStr(states, "jJ")),
                       chain(states,
                             makeFloat(),
                             groupStr(states, "jJ")))
    # ____________________________________________________________
    # Any old number.
    number = group(states, imagNumber, makeFloat(), intNumber)
    # ____________________________________________________________
    # Funny
    operator = group(states,
                     chain(states,
                           chainStr(states, "**"),
                           maybe(states, newArcPair(states, "="))),
                     chain(states,
                           chainStr(states, ">>"),
                           maybe(states, newArcPair(states, "="))),
                     chain(states,
                           chainStr(states, "<<"),
                           maybe(states, newArcPair(states, "="))),
                     chainStr(states, "<>"),
                     chainStr(states, "!="),
                     chain(states,
                           chainStr(states, "//"),
                           maybe(states, newArcPair(states, "="))),
                     chain(states,
                           groupStr(states, "+-*/%&|^=<>"),
                           maybe(states, newArcPair(states, "="))),
                     newArcPair(states, "~"))
    bracket = groupStr(states, "[](){}")
    special = group(states,
                    makeEOL(),
                    groupStr(states, "@:;.,`"))
    funny = group(states, operator, bracket, special)
    # ____________________________________________________________
    def makeStrPrefix():
        # Optional u/U/b/B followed by optional r/R.
        return chain(states,
                     maybe(states, groupStr(states, "uUbB")),
                     maybe(states, groupStr(states, "rR")))
    # ____________________________________________________________
    # One-line strings: either closed by the matching quote or left open
    # by a backslash-newline continuation.
    contStr = group(states,
                    chain(states,
                          makeStrPrefix(),
                          newArcPair(states, "'"),
                          any(states,
                              notGroupStr(states, "\r\n'\\")),
                          any(states,
                              chain(states,
                                    newArcPair(states, "\\"),
                                    newArcPair(states, DEFAULT),
                                    any(states,
                                        notGroupStr(states, "\r\n'\\")))),
                          group(states,
                                newArcPair(states, "'"),
                                makeLineCont())),
                    chain(states,
                          makeStrPrefix(),
                          newArcPair(states, '"'),
                          any(states,
                              notGroupStr(states, '\r\n"\\')),
                          any(states,
                              chain(states,
                                    newArcPair(states, "\\"),
                                    newArcPair(states, DEFAULT),
                                    any(states,
                                        notGroupStr(states, '\r\n"\\')))),
                          group(states,
                                newArcPair(states, '"'),
                                makeLineCont())))
    # Only the opening of a triple-quoted string is matched here; the
    # automata for the closing part are built by makePyEndDFAMap().
    triple = chain(states,
                   makeStrPrefix(),
                   group(states,
                         chainStr(states, "'''"),
                         chainStr(states, '"""')))
    pseudoExtras = group(states,
                         makeLineCont(),
                         makeComment(),
                         triple)
    pseudoToken = chain(states,
                        makeWhitespace(),
                        group(states,
                              newArcPair(states, EMPTY),
                              pseudoExtras, number, funny, contStr, name))
    dfaStates, dfaAccepts = nfaToDfa(states, *pseudoToken)
    return DFA(dfaStates, dfaAccepts), dfaStates
paul@437 | 195 | |
paul@437 | 196 | # ______________________________________________________________________ |
paul@437 | 197 | |
def _makeQuoteEndDFA(quote):
    """Build the automaton for the rest of a one-line string ending in *quote*.

    Returns ``(DFA(states, accepts), states)`` with ``states``/``accepts``
    as produced by nfaToDfa().
    """
    states = []
    plain = quote + "\\"   # characters that may not appear unescaped
    body = chain(states,
                 any(states, notGroupStr(states, plain)),
                 any(states,
                     chain(states,
                           newArcPair(states, "\\"),
                           newArcPair(states, DEFAULT),
                           any(states, notGroupStr(states, plain)))),
                 newArcPair(states, quote))
    dfaStates, accepts = nfaToDfa(states, *body)
    return DFA(dfaStates, accepts), dfaStates


def _makeTripleQuoteEndDFA(quote):
    """Build the automaton for the rest of a string ending in three *quote*s.

    Returns ``(NonGreedyDFA(states, accepts), states)`` with
    ``states``/``accepts`` as produced by nfaToDfa().
    """
    states = []
    plain = quote + "\\"   # characters that may not appear unescaped
    body = chain(states,
                 any(states, notGroupStr(states, plain)),
                 any(states,
                     chain(states,
                           group(states,
                                 chain(states,
                                       newArcPair(states, "\\"),
                                       newArcPair(states, DEFAULT)),
                                 chain(states,
                                       newArcPair(states, quote),
                                       notChainStr(states, quote * 2))),
                           any(states, notGroupStr(states, plain)))),
                 chainStr(states, quote * 3))
    dfaStates, accepts = nfaToDfa(states, *body)
    return NonGreedyDFA(dfaStates, accepts), dfaStates


def makePyEndDFAMap():
    """Build the table of automata that match the end of a string literal.

    Returns a dict keyed by the text that opened the string:

    * ``'`` and ``"`` map to ``(dfa, states)`` pairs for one-line strings;
    * every combination of an optional ``u``/``U``/``b``/``B`` prefix, an
      optional ``r``/``R`` prefix and a triple quote maps to the
      ``(dfa, states)`` pair of the matching triple-quoted automaton
      (the same pair object is shared by all prefixes);
    * the bare prefix letters ``r R u U b B`` map to ``None``.
    """
    # Built in the same order as before: single, double, then the triples.
    single = _makeQuoteEndDFA("'")
    double = _makeQuoteEndDFA('"')
    single3 = _makeTripleQuoteEndDFA("'")
    double3 = _makeTripleQuoteEndDFA('"')

    endMap = {"'": single,
              '"': double,
              "r": None,
              "R": None,
              "u": None,
              "U": None,
              "b": None,
              "B": None}
    for uniPrefix in ("", "u", "U", "b", "B"):
        for rawPrefix in ("", "r", "R"):
            prefix = uniPrefix + rawPrefix
            endMap[prefix + "'''"] = single3
            endMap[prefix + '"""'] = double3
    return endMap
paul@437 | 271 | |
paul@437 | 272 | # ______________________________________________________________________ |
paul@437 | 273 | |
def _packWords(words, width):
    """Greedily join *words* with single spaces into rows of at most *width*.

    A word is never split, so a word longer than *width* gets a row of its
    own.  Returns the list of rows.
    """
    rows = []
    current = ""
    for word in words:
        if not current:
            current = word
        elif len(current) + 1 + len(word) <= width:
            current = current + " " + word
        else:
            rows.append(current)
            current = word
    if current:
        rows.append(current)
    return rows


def output(name, dfa_class, dfa, states):
    """Render one automaton as Python source text.

    name      -- variable name the automaton is assigned to in the output
    dfa_class -- class name emitted after ``automata.`` in the constructor call
    dfa       -- object whose ``accepts`` attribute is printed via repr()
    states    -- sequence of dicts; each dict is printed as one table row
                 group, entries sorted by key, with the DEFAULT key written
                 as ``automata.DEFAULT``

    Returns the generated text: an ``accepts = ...`` assignment, a
    ``states = [...]`` list with a ``# <index>`` comment before each state,
    and a final ``<name> = automata.<dfa_class>(states, accepts)`` line.
    """
    import textwrap

    acceptsHead = "accepts = "
    # Continuation rows line up under the first element of the list.
    acceptsPad = " " * len("accepts = [")
    lines = []
    wrapped = textwrap.wrap(repr(dfa.accepts), width=50)
    for index, chunk in enumerate(wrapped):
        if index == 0:
            lines.append(acceptsHead)
        else:
            lines.append(acceptsPad)
        lines.append(chunk)
        lines.append("\n")

    lines.append("states = [\n")
    for numstate, state in enumerate(states):
        lines.append("    # ")
        lines.append(str(numstate))
        lines.append("\n")

        # One word per dict entry.  Each entry stays atomic so that a row
        # break can never land inside a key such as ' ' (whose repr holds a
        # space) and produce a broken string literal in the generated code.
        entries = []
        for key, target in sorted(state.items()):
            if key == DEFAULT:
                label = "automata.DEFAULT"
            else:
                label = repr(key)
            entries.append(label + ": " + repr(target))
        if entries:
            words = [entry + "," for entry in entries[:-1]]
            words.append(entries[-1] + "},")
        else:
            words = ["},"]

        # Small states stay on one row; larger ones are packed to width 36.
        if len(state) <= 4:
            rows = [" ".join(words)]
        else:
            rows = _packWords(words, 36)
        for index, row in enumerate(rows):
            if index == 0:
                lines.append("    {")
            else:
                lines.append("     ")
            lines.append(row)
            lines.append("\n")
    lines.append("    ]\n")
    lines.append("%s = automata.%s(states, accepts)\n" % (name, dfa_class))
    return "".join(lines)
paul@437 | 323 | |
def main():
    """Print the generated source for every automaton to standard output.

    The pseudo-token DFA is built and printed first, then the end-of-string
    automata in the order: double3DFA, single3DFA, singleDFA, doubleDFA.
    """
    pseudoDFA, states_pseudoDFA = makePyPseudoDFA()
    # print(...) with a single argument behaves the same as the Python 2
    # print statement and is also valid Python 3.
    print(output("pseudoDFA", "DFA", pseudoDFA, states_pseudoDFA))
    endDFAMap = makePyEndDFAMap()
    # (variable name in the output, automata class name, key in endDFAMap)
    targets = (("double3DFA", "NonGreedyDFA", '"""'),
               ("single3DFA", "NonGreedyDFA", "'''"),
               ("singleDFA", "DFA", "'"),
               ("doubleDFA", "DFA", "\""))
    for varName, dfaClass, key in targets:
        dfa, states = endDFAMap[key]
        print(output(varName, dfaClass, dfa, states))
paul@437 | 336 | |
paul@437 | 337 | # ______________________________________________________________________ |
paul@437 | 338 | |
# Script entry point: print the generated tables only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    main()