paul@437 | 1 | # Used by genpytokenize.py to generate the parser in pytokenize.py |
paul@437 | 2 | from pyparser.automata import DFA, DEFAULT |
paul@437 | 3 | |
paul@437 | 4 | class EMPTY: pass |
paul@437 | 5 | |
paul@437 | 6 | def newArcPair (states, transitionLabel): |
paul@437 | 7 | s1Index = len(states) |
paul@437 | 8 | s2Index = s1Index + 1 |
paul@437 | 9 | states.append([(transitionLabel, s2Index)]) |
paul@437 | 10 | states.append([]) |
paul@437 | 11 | return s1Index, s2Index |
paul@437 | 12 | |
paul@437 | 13 | # ______________________________________________________________________ |
paul@437 | 14 | |
paul@437 | 15 | def chain (states, *stateIndexPairs): |
paul@437 | 16 | if len(stateIndexPairs) > 1: |
paul@437 | 17 | start, lastFinish = stateIndexPairs[0] |
paul@437 | 18 | for nStart, nFinish in stateIndexPairs[1:]: |
paul@437 | 19 | states[lastFinish].append((EMPTY, nStart)) |
paul@437 | 20 | lastFinish = nFinish |
paul@437 | 21 | return start, nFinish |
paul@437 | 22 | else: |
paul@437 | 23 | return stateIndexPairs[0] |
paul@437 | 24 | |
paul@437 | 25 | |
paul@437 | 26 | # ______________________________________________________________________ |
paul@437 | 27 | |
paul@437 | 28 | def chainStr (states, str): |
paul@437 | 29 | return chain(states, *map(lambda x : newArcPair(states, x), str)) |
paul@437 | 30 | |
paul@437 | 31 | # ______________________________________________________________________ |
paul@437 | 32 | |
paul@437 | 33 | def notChainStr (states, str): |
paul@437 | 34 | """XXX I'm not sure this is how it should be done, but I'm going to |
paul@437 | 35 | try it anyway. Note that for this case, I require only single character |
paul@437 | 36 | arcs, since I would have to basically invert all accepting states and |
paul@437 | 37 | non-accepting states of any sub-NFA's. |
paul@437 | 38 | """ |
paul@437 | 39 | assert len(str) > 0 |
paul@437 | 40 | arcs = map(lambda x : newArcPair(states, x), str) |
paul@437 | 41 | finish = len(states) |
paul@437 | 42 | states.append([]) |
paul@437 | 43 | start, lastFinish = arcs[0] |
paul@437 | 44 | states[start].append((EMPTY, finish)) |
paul@437 | 45 | for crntStart, crntFinish in arcs[1:]: |
paul@437 | 46 | states[lastFinish].append((EMPTY, crntStart)) |
paul@437 | 47 | states[crntStart].append((EMPTY, finish)) |
paul@437 | 48 | return start, finish |
paul@437 | 49 | |
paul@437 | 50 | # ______________________________________________________________________ |
paul@437 | 51 | |
paul@437 | 52 | def group (states, *stateIndexPairs): |
paul@437 | 53 | if len(stateIndexPairs) > 1: |
paul@437 | 54 | start = len(states) |
paul@437 | 55 | finish = start + 1 |
paul@437 | 56 | startList = [] |
paul@437 | 57 | states.append(startList) |
paul@437 | 58 | states.append([]) |
paul@437 | 59 | for eStart, eFinish in stateIndexPairs: |
paul@437 | 60 | startList.append((EMPTY, eStart)) |
paul@437 | 61 | states[eFinish].append((EMPTY, finish)) |
paul@437 | 62 | return start, finish |
paul@437 | 63 | else: |
paul@437 | 64 | return stateIndexPairs[0] |
paul@437 | 65 | |
paul@437 | 66 | # ______________________________________________________________________ |
paul@437 | 67 | |
paul@437 | 68 | def groupStr (states, str): |
paul@437 | 69 | return group(states, *map(lambda x : newArcPair(states, x), str)) |
paul@437 | 70 | |
paul@437 | 71 | # ______________________________________________________________________ |
paul@437 | 72 | |
paul@437 | 73 | def notGroup (states, *stateIndexPairs): |
paul@437 | 74 | """Like group, but will add a DEFAULT transition to a new end state, |
paul@437 | 75 | causing anything in the group to not match by going to a dead state. |
paul@437 | 76 | XXX I think this is right... |
paul@437 | 77 | """ |
paul@437 | 78 | start, dead = group(states, *stateIndexPairs) |
paul@437 | 79 | finish = len(states) |
paul@437 | 80 | states.append([]) |
paul@437 | 81 | states[start].append((DEFAULT, finish)) |
paul@437 | 82 | return start, finish |
paul@437 | 83 | |
paul@437 | 84 | # ______________________________________________________________________ |
paul@437 | 85 | |
paul@437 | 86 | def notGroupStr (states, str): |
paul@437 | 87 | return notGroup(states, *map(lambda x : newArcPair(states, x), str)) |
paul@437 | 88 | # ______________________________________________________________________ |
paul@437 | 89 | |
paul@437 | 90 | def any (states, *stateIndexPairs): |
paul@437 | 91 | start, finish = group(states, *stateIndexPairs) |
paul@437 | 92 | states[finish].append((EMPTY, start)) |
paul@437 | 93 | return start, start |
paul@437 | 94 | |
paul@437 | 95 | # ______________________________________________________________________ |
paul@437 | 96 | |
paul@437 | 97 | def maybe (states, *stateIndexPairs): |
paul@437 | 98 | start, finish = group(states, *stateIndexPairs) |
paul@437 | 99 | states[start].append((EMPTY, finish)) |
paul@437 | 100 | return start, finish |
paul@437 | 101 | |
paul@437 | 102 | # ______________________________________________________________________ |
paul@437 | 103 | |
paul@437 | 104 | def atleastonce (states, *stateIndexPairs): |
paul@437 | 105 | start, finish = group(states, *stateIndexPairs) |
paul@437 | 106 | states[finish].append((EMPTY, start)) |
paul@437 | 107 | return start, finish |
paul@437 | 108 | |
paul@437 | 109 | # ______________________________________________________________________ |
paul@437 | 110 | |
paul@437 | 111 | def closure (states, start, result = 0L): |
paul@437 | 112 | if None == result: |
paul@437 | 113 | result = 0L |
paul@437 | 114 | if 0 == (result & (1L << start)): |
paul@437 | 115 | result |= (1L << start) |
paul@437 | 116 | for label, arrow in states[start]: |
paul@437 | 117 | if label == EMPTY: |
paul@437 | 118 | result |= closure(states, arrow, result) |
paul@437 | 119 | return result |
paul@437 | 120 | |
paul@437 | 121 | # ______________________________________________________________________ |
paul@437 | 122 | |
paul@437 | 123 | def nfaToDfa (states, start, finish): |
paul@437 | 124 | tempStates = [] |
paul@437 | 125 | startClosure = closure(states, start) |
paul@437 | 126 | crntTempState = [startClosure, [], 0 != (startClosure & (1L << finish))] |
paul@437 | 127 | tempStates.append(crntTempState) |
paul@437 | 128 | index = 0 |
paul@437 | 129 | while index < len(tempStates): |
paul@437 | 130 | crntTempState = tempStates[index] |
paul@437 | 131 | crntClosure, crntArcs, crntAccept = crntTempState |
paul@437 | 132 | for index2 in range(0, len(states)): |
paul@437 | 133 | if 0 != (crntClosure & (1L << index2)): |
paul@437 | 134 | for label, nfaArrow in states[index2]: |
paul@437 | 135 | if label == EMPTY: |
paul@437 | 136 | continue |
paul@437 | 137 | foundTempArc = False |
paul@437 | 138 | for tempArc in crntArcs: |
paul@437 | 139 | if tempArc[0] == label: |
paul@437 | 140 | foundTempArc = True |
paul@437 | 141 | break |
paul@437 | 142 | if not foundTempArc: |
paul@437 | 143 | tempArc = [label, -1, 0L] |
paul@437 | 144 | crntArcs.append(tempArc) |
paul@437 | 145 | tempArc[2] = closure(states, nfaArrow, tempArc[2]) |
paul@437 | 146 | for arcIndex in range(0, len(crntArcs)): |
paul@437 | 147 | label, arrow, targetStates = crntArcs[arcIndex] |
paul@437 | 148 | targetFound = False |
paul@437 | 149 | arrow = 0 |
paul@437 | 150 | for destTempState in tempStates: |
paul@437 | 151 | if destTempState[0] == targetStates: |
paul@437 | 152 | targetFound = True |
paul@437 | 153 | break |
paul@437 | 154 | arrow += 1 |
paul@437 | 155 | if not targetFound: |
paul@437 | 156 | assert arrow == len(tempStates) |
paul@437 | 157 | newState = [targetStates, [], 0 != (targetStates & |
paul@437 | 158 | (1L << finish))] |
paul@437 | 159 | tempStates.append(newState) |
paul@437 | 160 | crntArcs[arcIndex][1] = arrow |
paul@437 | 161 | index += 1 |
paul@437 | 162 | tempStates = simplifyTempDfa(tempStates) |
paul@437 | 163 | states = finalizeTempDfa(tempStates) |
paul@437 | 164 | return states |
paul@437 | 165 | |
paul@437 | 166 | # ______________________________________________________________________ |
paul@437 | 167 | |
paul@437 | 168 | def sameState (s1, s2): |
paul@437 | 169 | """sameState(s1, s2) |
paul@437 | 170 | Note: |
paul@437 | 171 | state := [ nfaclosure : Long, [ arc ], accept : Boolean ] |
paul@437 | 172 | arc := [ label, arrow : Int, nfaClosure : Long ] |
paul@437 | 173 | """ |
paul@437 | 174 | if (len(s1[1]) != len(s2[1])) or (s1[2] != s2[2]): |
paul@437 | 175 | return False |
paul@437 | 176 | for arcIndex in range(0, len(s1[1])): |
paul@437 | 177 | arc1 = s1[1][arcIndex] |
paul@437 | 178 | arc2 = s2[1][arcIndex] |
paul@437 | 179 | if arc1[:-1] != arc2[:-1]: |
paul@437 | 180 | return False |
paul@437 | 181 | return True |
paul@437 | 182 | |
paul@437 | 183 | # ______________________________________________________________________ |
paul@437 | 184 | |
paul@437 | 185 | def simplifyTempDfa (tempStates): |
paul@437 | 186 | """simplifyTempDfa (tempStates) |
paul@437 | 187 | """ |
paul@437 | 188 | changes = True |
paul@437 | 189 | deletedStates = [] |
paul@437 | 190 | while changes: |
paul@437 | 191 | changes = False |
paul@437 | 192 | for i in range(1, len(tempStates)): |
paul@437 | 193 | if i in deletedStates: |
paul@437 | 194 | continue |
paul@437 | 195 | for j in range(0, i): |
paul@437 | 196 | if j in deletedStates: |
paul@437 | 197 | continue |
paul@437 | 198 | if sameState(tempStates[i], tempStates[j]): |
paul@437 | 199 | deletedStates.append(i) |
paul@437 | 200 | for k in range(0, len(tempStates)): |
paul@437 | 201 | if k in deletedStates: |
paul@437 | 202 | continue |
paul@437 | 203 | for arc in tempStates[k][1]: |
paul@437 | 204 | if arc[1] == i: |
paul@437 | 205 | arc[1] = j |
paul@437 | 206 | changes = True |
paul@437 | 207 | break |
paul@437 | 208 | for stateIndex in deletedStates: |
paul@437 | 209 | tempStates[stateIndex] = None |
paul@437 | 210 | return tempStates |
paul@437 | 211 | # ______________________________________________________________________ |
paul@437 | 212 | |
paul@437 | 213 | def finalizeTempDfa (tempStates): |
paul@437 | 214 | """finalizeTempDfa (tempStates) |
paul@437 | 215 | |
paul@437 | 216 | Input domain: |
paul@437 | 217 | tempState := [ nfaClosure : Long, [ tempArc ], accept : Boolean ] |
paul@437 | 218 | tempArc := [ label, arrow, nfaClosure ] |
paul@437 | 219 | |
paul@437 | 220 | Output domain: |
paul@437 | 221 | state := [ arcMap, accept : Boolean ] |
paul@437 | 222 | """ |
paul@437 | 223 | states = [] |
paul@437 | 224 | accepts = [] |
paul@437 | 225 | stateMap = {} |
paul@437 | 226 | tempIndex = 0 |
paul@437 | 227 | for tempIndex in range(0, len(tempStates)): |
paul@437 | 228 | tempState = tempStates[tempIndex] |
paul@437 | 229 | if None != tempState: |
paul@437 | 230 | stateMap[tempIndex] = len(states) |
paul@437 | 231 | states.append({}) |
paul@437 | 232 | accepts.append(tempState[2]) |
paul@437 | 233 | for tempIndex in stateMap.keys(): |
paul@437 | 234 | stateBitset, tempArcs, accepting = tempStates[tempIndex] |
paul@437 | 235 | newIndex = stateMap[tempIndex] |
paul@437 | 236 | arcMap = states[newIndex] |
paul@437 | 237 | for tempArc in tempArcs: |
paul@437 | 238 | arcMap[tempArc[0]] = stateMap[tempArc[1]] |
paul@437 | 239 | return states, accepts |
paul@437 | 240 | |