Lichen

Annotated pyparser/genpytokenize.py

596:6a11001c93ce
2017-02-19 Paul Boddie Reordered conditional clauses and added some comments. method-wrapper-for-context
paul@437 1
#! /usr/bin/env python
paul@437 2
"""Module genPytokenize
paul@437 3
paul@437 4
Generates finite state automata for recognizing Python tokens.  These are hand
paul@437 5
coded versions of the regular expressions originally appearing in Ping's
paul@437 6
tokenize module in the Python standard library.
paul@437 7
paul@437 8
When run from the command line, this should pretty print the DFA machinery.
paul@437 9
paul@437 10
$Id: genPytokenize.py,v 1.1 2003/10/02 17:37:17 jriehl Exp $
paul@437 11
"""
paul@437 12
paul@437 13
from pyparser.pylexer import *
paul@437 14
from pyparser.automata import NonGreedyDFA, DFA, DEFAULT
paul@437 15
paul@437 16
def makePyPseudoDFA ():
    """Build the automaton that recognises one Python "pseudo token".

    A pseudo token is assembled here as optional leading whitespace followed
    by exactly one of: nothing at all, a line continuation, a comment, the
    opening of a triple-quoted string, a number, an operator/bracket/special
    character (including end of line), a single-line string (or one that ends
    in a backslash-newline), or a name.

    The NFA is built with the combinators star-imported from
    pyparser.pylexer (group, chain, any, maybe, atleastonce, groupStr,
    notGroupStr, chainStr, newArcPair) and then converted with nfaToDfa.
    Note that ``any`` here must be the pylexer combinator, not the builtin:
    it is always called with two arguments.

    NOTE(review): every combinator receives the shared ``states`` list, so
    the order of the calls below presumably determines the numbering of the
    generated states -- keep the statement order stable when editing.

    Returns a pair ``(DFA(dfaStates, dfaAccepts), dfaStates)``.
    """
    import string
    # Use the fixed ASCII alphabet.  string.letters is locale-dependent in
    # Python 2 (its content and ordering change once locale.setlocale() has
    # been called) and does not exist in Python 3, whereas identifiers in the
    # language being tokenized are plain ASCII.
    letters = string.ascii_letters
    states = []
    def makeEOL():
        # "\n", or "\r" optionally followed by "\n".
        return group(states,
                     newArcPair(states, "\n"),
                     chain(states,
                           newArcPair(states, "\r"),
                           maybe(states, newArcPair(states, "\n"))))
    # ____________________________________________________________
    def makeLineCont ():
        # A backslash immediately followed by an end of line.
        return chain(states,
                     newArcPair(states, "\\"),
                     makeEOL())
    # ____________________________________________________________
    # Ignore stuff
    def makeWhitespace ():
        # Any run (possibly empty) of space, form feed and tab.
        return any(states, groupStr(states, " \f\t"))
    # ____________________________________________________________
    def makeComment ():
        # "#" followed by anything up to, but excluding, "\r" or "\n".
        return chain(states,
                     newArcPair(states, "#"),
                     any(states, notGroupStr(states, "\r\n")))
    # ____________________________________________________________
    #ignore = chain(states,
    #               makeWhitespace(),
    #               any(states, chain(states,
    #                                 makeLineCont(),
    #                                 makeWhitespace())),
    #               maybe(states, makeComment()))
    # ____________________________________________________________
    # Names
    name = chain(states,
                 groupStr(states, letters + "_"),
                 any(states, groupStr(states,
                                      letters + string.digits + "_")))
    # ____________________________________________________________
    # Digits
    def makeDigits ():
        return groupStr(states, "0123456789")
    # ____________________________________________________________
    # Integer numbers
    hexNumber = chain(states,
                      newArcPair(states, "0"),
                      groupStr(states, "xX"),
                      atleastonce(states,
                                  groupStr(states, "0123456789abcdefABCDEF")),
                      maybe(states, groupStr(states, "lL")))
    # Accepts both the "0o17" and the legacy "017" spellings, as well as a
    # bare "0".
    octNumber = chain(states,
                      newArcPair(states, "0"),
                      maybe(states,
                            chain(states,
                                  groupStr(states, "oO"),
                                  groupStr(states, "01234567"))),
                      any(states, groupStr(states, "01234567")),
                      maybe(states, groupStr(states, "lL")))
    binNumber = chain(states,
                      newArcPair(states, "0"),
                      groupStr(states, "bB"),
                      atleastonce(states, groupStr(states, "01")),
                      maybe(states, groupStr(states, "lL")))
    decNumber = chain(states,
                      groupStr(states, "123456789"),
                      any(states, makeDigits()),
                      maybe(states, groupStr(states, "lL")))
    intNumber = group(states, hexNumber, octNumber, binNumber, decNumber)
    # ____________________________________________________________
    # Exponents
    def makeExp ():
        return chain(states,
                     groupStr(states, "eE"),
                     maybe(states, groupStr(states, "+-")),
                     atleastonce(states, makeDigits()))
    # ____________________________________________________________
    # Floating point numbers
    def makeFloat ():
        # Either digits around a "." (with at least one digit on one side)
        # and an optional exponent, or digits with a mandatory exponent.
        pointFloat = chain(states,
                           group(states,
                                 chain(states,
                                       atleastonce(states, makeDigits()),
                                       newArcPair(states, "."),
                                       any(states, makeDigits())),
                                 chain(states,
                                       newArcPair(states, "."),
                                       atleastonce(states, makeDigits()))),
                           maybe(states, makeExp()))
        expFloat = chain(states,
                         atleastonce(states, makeDigits()),
                         makeExp())
        return group(states, pointFloat, expFloat)
    # ____________________________________________________________
    # Imaginary numbers
    imagNumber = group(states,
                       chain(states,
                             atleastonce(states, makeDigits()),
                             groupStr(states, "jJ")),
                       chain(states,
                             makeFloat(),
                             groupStr(states, "jJ")))
    # ____________________________________________________________
    # Any old number.
    number = group(states, imagNumber, makeFloat(), intNumber)
    # ____________________________________________________________
    # Funny
    operator = group(states,
                     chain(states,
                           chainStr(states, "**"),
                           maybe(states, newArcPair(states, "="))),
                     chain(states,
                           chainStr(states, ">>"),
                           maybe(states, newArcPair(states, "="))),
                     chain(states,
                           chainStr(states, "<<"),
                           maybe(states, newArcPair(states, "="))),
                     chainStr(states, "<>"),
                     chainStr(states, "!="),
                     chain(states,
                           chainStr(states, "//"),
                           maybe(states, newArcPair(states, "="))),
                     chain(states,
                           groupStr(states, "+-*/%&|^=<>"),
                           maybe(states, newArcPair(states, "="))),
                     newArcPair(states, "~"))
    bracket = groupStr(states, "[](){}")
    special = group(states,
                    makeEOL(),
                    groupStr(states, "@:;.,`"))
    funny = group(states, operator, bracket, special)
    # ____________________________________________________________
    def makeStrPrefix ():
        # Optional u/U/b/B followed by optional r/R.
        return chain(states,
                     maybe(states, groupStr(states, "uUbB")),
                     maybe(states, groupStr(states, "rR")))
    # ____________________________________________________________
    # Single-quoted and double-quoted strings on one line.  Each alternative
    # ends either at the closing quote or at a backslash-newline, i.e. a
    # string continued on the next line.
    contStr = group(states,
                    chain(states,
                          makeStrPrefix(),
                          newArcPair(states, "'"),
                          any(states,
                              notGroupStr(states, "\r\n'\\")),
                          any(states,
                              chain(states,
                                    newArcPair(states, "\\"),
                                    newArcPair(states, DEFAULT),
                                    any(states,
                                        notGroupStr(states, "\r\n'\\")))),
                          group(states,
                                newArcPair(states, "'"),
                                makeLineCont())),
                    chain(states,
                          makeStrPrefix(),
                          newArcPair(states, '"'),
                          any(states,
                              notGroupStr(states, '\r\n"\\')),
                          any(states,
                              chain(states,
                                    newArcPair(states, "\\"),
                                    newArcPair(states, DEFAULT),
                                    any(states,
                                        notGroupStr(states, '\r\n"\\')))),
                          group(states,
                                newArcPair(states, '"'),
                                makeLineCont())))
    # Only the opening of a triple-quoted string is matched here.
    triple = chain(states,
                   makeStrPrefix(),
                   group(states,
                         chainStr(states, "'''"),
                         chainStr(states, '"""')))
    pseudoExtras = group(states,
                         makeLineCont(),
                         makeComment(),
                         triple)
    pseudoToken = chain(states,
                        makeWhitespace(),
                        group(states,
                              newArcPair(states, EMPTY),
                              pseudoExtras, number, funny, contStr, name))
    dfaStates, dfaAccepts = nfaToDfa(states, *pseudoToken)
    return DFA(dfaStates, dfaAccepts), dfaStates
paul@437 195
paul@437 196
# ______________________________________________________________________
paul@437 197
paul@437 198
def makePyEndDFAMap ():
    """Build the automata that find the end of a string literal.

    Four automata are constructed, each from its own fresh NFA state list:
    one each for strings opened with ``'`` and ``"`` (wrapped in DFA) and one
    each for strings opened with ``'''`` and ``\"\"\"`` (wrapped in
    NonGreedyDFA).

    Returns a dictionary keyed by the text that opens the string.  The
    values are ``(dfa, states)`` pairs, where ``states`` is the state table
    returned by nfaToDfa.  The triple-quote automata are registered under
    every combination of an optional u/U/b/B prefix and an optional r/R
    prefix; the bare prefix letters themselves map to None.
    """
    def makeSingleEnd (quote):
        # Body of a one-line string up to and including the closing quote:
        # ordinary characters, then any number of backslash escapes each
        # followed by more ordinary characters, then the quote itself.
        states = []
        ordinary = quote + "\\"
        nfa = chain(states,
                    any(states, notGroupStr(states, ordinary)),
                    any(states,
                        chain(states,
                              newArcPair(states, "\\"),
                              newArcPair(states, DEFAULT),
                              any(states, notGroupStr(states, ordinary)))),
                    newArcPair(states, quote))
        dfaStates, accepts = nfaToDfa(states, *nfa)
        return DFA(dfaStates, accepts), dfaStates

    def makeTripleEnd (quote):
        # Body of a triple-quoted string up to and including the closing
        # delimiter.  Inside the body a backslash escape is allowed, and so
        # is a quote character that does not start the closing delimiter.
        states = []
        ordinary = quote + "\\"
        nfa = chain(states,
                    any(states, notGroupStr(states, ordinary)),
                    any(states,
                        chain(states,
                              group(states,
                                    chain(states,
                                          newArcPair(states, "\\"),
                                          newArcPair(states, DEFAULT)),
                                    chain(states,
                                          newArcPair(states, quote),
                                          notChainStr(states, quote * 2))),
                              any(states, notGroupStr(states, ordinary)))),
                    chainStr(states, quote * 3))
        dfaStates, accepts = nfaToDfa(states, *nfa)
        return NonGreedyDFA(dfaStates, accepts), dfaStates

    singleEnd = makeSingleEnd("'")
    doubleEnd = makeSingleEnd('"')
    single3End = makeTripleEnd("'")
    double3End = makeTripleEnd('"')

    # Named endDFAMap rather than "map" so the builtin is not shadowed.
    endDFAMap = {"'" : singleEnd,
                 '"' : doubleEnd,
                 "r" : None,
                 "R" : None,
                 "u" : None,
                 "U" : None,
                 "b" : None,
                 "B" : None}
    for uniPrefix in ("", "u", "U", "b", "B", ):
        for rawPrefix in ("", "r", "R"):
            prefix = uniPrefix + rawPrefix
            endDFAMap[prefix + "'''"] = single3End
            endDFAMap[prefix + '"""'] = double3End
    return endDFAMap
paul@437 271
paul@437 272
# ______________________________________________________________________
paul@437 273
paul@437 274
def output(name, dfa_class, dfa, states):
    """Return Python source text that recreates an automaton.

    name      -- variable name the generated code assigns the automaton to
    dfa_class -- name of the class in the ``automata`` module to instantiate
    dfa       -- object whose ``accepts`` attribute is written out via repr()
    states    -- sequence of dictionaries, one per state, mapping an arc
                 label to a target; a label equal to DEFAULT is written as
                 ``automata.DEFAULT``, any other label via repr()

    The result consists of an ``accepts = ...`` assignment (wrapped at 50
    columns), a ``states = [...]`` list with one numbered dictionary per
    state (dictionaries with more than four entries are wrapped at 36
    columns), and a final ``<name> = automata.<dfa_class>(states, accepts)``
    line.
    """
    import textwrap

    # Stand-in for a literal space inside an item while textwrap runs.
    # repr(' ') contains a real space; without the stand-in textwrap may
    # break the line in the middle of the quoted key and drop the space,
    # which would produce invalid source.  repr() never emits a raw NUL, so
    # the stand-in cannot collide with real content.
    space_guard = "\x00"

    lines = []
    for i, line in enumerate(textwrap.wrap(repr(dfa.accepts), width = 50)):
        if i == 0:
            lines.append("accepts = ")
        else:
            # Continuation lines are aligned under the first list element.
            lines.append("           ")
        lines.append(line)
        lines.append("\n")
    lines.append("states = [\n")
    for numstate, state in enumerate(states):
        lines.append("    # ")
        lines.append(str(numstate))
        lines.append('\n')
        items = []
        for k, v in sorted(state.items()):
            if k == DEFAULT:
                k = "automata.DEFAULT"
            else:
                k = repr(k)
            # '::' (no space) keeps key and value together during wrapping;
            # it is turned into ': ' afterwards.
            items.append((k + '::' + repr(v)).replace(' ', space_guard))
        entry = ', '.join(items) + '},'
        if len(state) <= 4:
            text = [entry]
        else:
            text = textwrap.wrap(entry, width=36)
        for i, line in enumerate(text):
            line = line.replace('::', ': ').replace(space_guard, ' ')
            if i == 0:
                lines.append('    {')
            else:
                lines.append('     ')
            lines.append(line)
            lines.append('\n')
    lines.append("    ]\n")
    lines.append("%s = automata.%s(states, accepts)\n" % (name, dfa_class))
    return ''.join(lines)
paul@437 323
paul@437 324
def main ():
    """Print the generated source for the pseudo-token automaton and for the
    four end-of-string automata, in that order, to standard output."""
    pseudoDFA, pseudoStates = makePyPseudoDFA()
    print(output("pseudoDFA", "DFA", pseudoDFA, pseudoStates))

    # (generated variable name, automata class name, key in the end map)
    endTables = (
        ("double3DFA", "NonGreedyDFA", '"""'),
        ("single3DFA", "NonGreedyDFA", "'''"),
        ("singleDFA", "DFA", "'"),
        ("doubleDFA", "DFA", "\""),
    )
    endDFAMap = makePyEndDFAMap()
    for varName, className, opener in endTables:
        dfa, states = endDFAMap[opener]
        print(output(varName, className, dfa, states))
paul@437 336
paul@437 337
# ______________________________________________________________________
paul@437 338
paul@437 339
# Emit the generated tables only when this file is executed as a script.
if __name__ == "__main__":
    main()