from pyparser import automata
from pyparser.pygram import tokens
from pyparser.pytoken import python_opmap
from pyparser.error import TokenError, TokenIndentationError
from pyparser.pytokenize import tabsize, whiteSpaceDFA, \
    triple_quoted, endDFAs, single_quoted, pseudoDFA
from pyparser import consts

NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
NUMCHARS = '0123456789'
ALNUMCHARS = NAMECHARS + NUMCHARS
EXTENDED_ALNUMCHARS = ALNUMCHARS + '-.'
WHITESPACES = ' \t\n\r\v\f'
def match_encoding_declaration(comment):
    r"""returns the declared encoding or None

    This function is a replacement for:
    >>> py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)")
    >>> py_encoding.search(comment)
    """
    index = comment.find('coding')
    if index < 0:
        return None
    if index + 6 >= len(comment):
        # 'coding' sits at the very end of the comment, so no ':' or '='
        # can follow and there is no declaration to extract
        return None
    next_char = comment[index + 6]
    if next_char not in ':=':
        return None
    end_of_decl = comment[index + 7:]
    index = 0
    for char in end_of_decl:
        if char not in WHITESPACES:
            break
        index += 1
    else:
        return None
    encoding = ''
    for char in end_of_decl[index:]:
        if char in EXTENDED_ALNUMCHARS:
            encoding += char
        else:
            break
    if encoding != '':
        return encoding
    return None
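
# A quick sketch of the expected behaviour (the sample comment strings are
# made up for illustration, they do not come from this code base):
#
#     match_encoding_declaration("# -*- coding: utf-8 -*-")    -> 'utf-8'
#     match_encoding_declaration("# vim:fileencoding=latin-1") -> 'latin-1'
#     match_encoding_declaration("# no declaration here")      -> None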


DUMMY_DFA = automata.DFA([], [])

def generate_tokens(lines, flags):
    """
    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
    the original function is not RPython (it uses yield).
    It was also slightly modified to build the whole token list at once
    instead of yielding -- each entry is a 5-tuple of

    * the token type
    * the token string
    * the line number (the real one, counting continuation lines)
    * the position on the line of the start of the token
    * the whole line as a string

    Original docstring::

        The generate_tokens() generator requires one argument, readline, which
        must be a callable object which provides the same interface as the
        readline() method of built-in file objects. Each call to the function
        should return one line of input as a string.

        The generator produces 5-tuples with these members: the token type; the
        token string; a 2-tuple (srow, scol) of ints specifying the row and
        column where the token begins in the source; a 2-tuple (erow, ecol) of
        ints specifying the row and column where the token ends in the source;
        and the line on which the token was found. The line passed is the
        logical line; continuation lines are included.
    """
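    # Tokenizer state kept across physical lines: lnum counts physical
    # lines, parenlev tracks the nesting depth of (), [] and {}, continued
    # is set after a backslash continuation, contstr/contline accumulate an
    # unfinished multi-line string (strstart remembers where it began), and
    # indents is the stack of active indentation columns.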
    token_list = []
    lnum = parenlev = continued = 0
    namechars = NAMECHARS
    numchars = NUMCHARS
    contstr, needcont = '', 0
    contline = None
    indents = [0]
    last_comment = ''
    parenlevstart = (0, 0, "")

    # make the annotator happy
    endDFA = DUMMY_DFA
    # make the annotator happy
    line = ''
    pos = 0
    lines.append("")
    strstart = (0, 0, "")
    for line in lines:
        lnum = lnum + 1
        line = universal_newline(line)
        pos, max = 0, len(line)

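        # a non-empty contstr means an earlier line opened a string that is
        # still unterminated; keep accumulating lines until endDFA (set when
        # the string was opened) recognizes the closing quotes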
        if contstr:
            if not line:
                raise TokenError(
                    "EOF while scanning triple-quoted string literal",
                    strstart[2], strstart[0], strstart[1] + 1,
                    token_list, lnum - 1)
            endmatch = endDFA.recognize(line)
            if endmatch >= 0:
                pos = end = endmatch
                tok = (tokens["STRING"], contstr + line[:end], strstart[0],
                       strstart[1], line)
                token_list.append(tok)
                last_comment = ''
                contstr, needcont = '', 0
                contline = None
            elif (needcont and not line.endswith('\\\n') and
                  not line.endswith('\\\r\n')):
                tok = (tokens["ERRORTOKEN"], contstr + line, strstart[0],
                       strstart[1], line)
                token_list.append(tok)
                last_comment = ''
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
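            # a tab advances the column to the next multiple of tabsize,
            # e.g. with tabsize == 8 a line starting ' \t' is at column 8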
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':
                # skip comments or blank lines
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                token_list.append((tokens["INDENT"], line[:pos], lnum, 0, line))
                last_comment = ''
            while column < indents[-1]:
                indents = indents[:-1]
                token_list.append((tokens["DEDENT"], '', lnum, pos, line))
                last_comment = ''
            if column != indents[-1]:
                err = "unindent does not match any outer indentation level"
                raise TokenIndentationError(err, line, lnum, 0, token_list)

        else:                                  # continued statement
            if not line:
                if parenlev > 0:
                    lnum1, start1, line1 = parenlevstart
                    raise TokenError("parenthesis is never closed", line1,
                                     lnum1, start1 + 1, token_list, lnum)
                raise TokenError("EOF in multi-line statement", line,
                                 lnum, 0, token_list)
            continued = 0

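        # main scanning loop: pseudoDFA matches the end of the next
        # pseudo-token starting at line[pos] (including any leading
        # whitespace), and whiteSpaceDFA then finds where the token's
        # significant characters actually begin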
        while pos < max:
            pseudomatch = pseudoDFA.recognize(line, pos)
            if pseudomatch >= 0:                           # scan for tokens
                # JDR: Modified
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                end = pseudomatch

                if start == end:
                    raise TokenError("Unknown character", line,
                                     lnum, start + 1, token_list)

                pos = end
                token, initial = line[start:end], line[start]
                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    token_list.append((tokens["NUMBER"], token, lnum, start, line))
                    last_comment = ''
                elif initial in '\r\n':
                    if parenlev <= 0:
                        tok = (tokens["NEWLINE"], last_comment, lnum, start, line)
                        token_list.append(tok)
                    last_comment = ''
                elif initial == '#':
                    # skip comment
                    last_comment = token
                elif token in triple_quoted:
                    endDFA = endDFAs[token]
                    endmatch = endDFA.recognize(line, pos)
                    if endmatch >= 0:                      # all on one line
                        pos = endmatch
                        token = line[start:pos]
                        tok = (tokens["STRING"], token, lnum, start, line)
                        token_list.append(tok)
                        last_comment = ''
                    else:
                        strstart = (lnum, start, line)
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start, line)
                        endDFA = (endDFAs[initial] or endDFAs[token[1]] or
                                  endDFAs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        tok = (tokens["STRING"], token, lnum, start, line)
                        token_list.append(tok)
                        last_comment = ''
                elif initial in namechars:                 # ordinary name
                    token_list.append((tokens["NAME"], token, lnum, start, line))
                    last_comment = ''
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        if parenlev == 0:
                            parenlevstart = (lnum, start, line)
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                        if parenlev < 0:
                            raise TokenError("unmatched '%s'" % initial, line,
                                             lnum, start + 1, token_list)
                    if token in python_opmap:
                        punct = python_opmap[token]
                    else:
                        punct = tokens["OP"]
                    token_list.append((punct, token, lnum, start, line))
                    last_comment = ''
            else:
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                if start < max and line[start] in single_quoted:
                    raise TokenError("EOL while scanning string literal",
                                     line, lnum, start + 1, token_list)
                tok = (tokens["ERRORTOKEN"], line[pos], lnum, pos, line)
                token_list.append(tok)
                last_comment = ''
                pos = pos + 1
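
    # end of input: unless PyCF_DONT_IMPLY_DEDENT is set, imply a trailing
    # NEWLINE and pop any indentation levels still open; the final
    # ENDMARKER is always appended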
    lnum -= 1
    if not (flags & consts.PyCF_DONT_IMPLY_DEDENT):
        if token_list and token_list[-1][0] != tokens["NEWLINE"]:
            tok = (tokens["NEWLINE"], '', lnum, 0, '\n')
            token_list.append(tok)
        for indent in indents[1:]:             # pop remaining indent levels
            token_list.append((tokens["DEDENT"], '', lnum, pos, line))
        tok = (tokens["NEWLINE"], '', lnum, 0, '\n')
        token_list.append(tok)

    token_list.append((tokens["ENDMARKER"], '', lnum, pos, line))
    return token_list
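
# A minimal usage sketch (hypothetical input; flags=0 means no compiler
# flags such as consts.PyCF_DONT_IMPLY_DEDENT are in effect):
#
#     lines = ["if x:\n", "    y = 1\n"]
#     for tok_type, value, lnum, col, line in generate_tokens(lines, 0):
#         print(tok_type, repr(value), lnum, col)
#
# Note that generate_tokens appends a sentinel "" to the lines list it is
# given, so pass a copy if the original list must stay unchanged.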


def universal_newline(line):
    # show annotator that indexes below are non-negative
    line_len_m2 = len(line) - 2
    if line_len_m2 >= 0 and line[-2] == '\r' and line[-1] == '\n':
        return line[:line_len_m2] + '\n'
    line_len_m1 = len(line) - 1
    if line_len_m1 >= 0 and line[-1] == '\r':
        return line[:line_len_m1] + '\n'
    return line
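
# universal_newline normalizes Mac and Windows line endings to '\n', e.g.:
#
#     universal_newline("pass\r\n") -> "pass\n"
#     universal_newline("pass\r")   -> "pass\n"
#     universal_newline("pass\n")   -> "pass\n"   (unchanged)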