1 from pyparser import automata 2 from pyparser.pygram import tokens 3 from pyparser.pytoken import python_opmap 4 from pyparser.error import TokenError, TokenIndentationError 5 from pyparser.pytokenize import tabsize, whiteSpaceDFA, \ 6 triple_quoted, endDFAs, single_quoted, pseudoDFA 7 from pyparser import consts 8 9 NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_' 10 NUMCHARS = '0123456789' 11 ALNUMCHARS = NAMECHARS + NUMCHARS 12 EXTENDED_ALNUMCHARS = ALNUMCHARS + '-.' 13 WHITESPACES = ' \t\n\r\v\f' 14 15 def match_encoding_declaration(comment): 16 """returns the declared encoding or None 17 18 This function is a replacement for : 19 >>> py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)") 20 >>> py_encoding.search(comment) 21 """ 22 index = comment.find('coding') 23 if index < 0: 24 return None 25 next_char = comment[index + 6] 26 if next_char not in ':=': 27 return None 28 end_of_decl = comment[index + 7:] 29 index = 0 30 for char in end_of_decl: 31 if char not in WHITESPACES: 32 break 33 index += 1 34 else: 35 return None 36 encoding = '' 37 for char in end_of_decl[index:]: 38 if char in EXTENDED_ALNUMCHARS: 39 encoding += char 40 else: 41 break 42 if encoding != '': 43 return encoding 44 return None 45 46 47 DUMMY_DFA = automata.DFA([], []) 48 49 def generate_tokens(lines, flags): 50 """ 51 This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since 52 the original function is not RPYTHON (uses yield) 53 It was also slightly modified to generate Token instances instead 54 of the original 5-tuples -- it's now a 4-tuple of 55 56 * the Token instance 57 * the whole line as a string 58 * the line number (the real one, counting continuation lines) 59 * the position on the line of the end of the token. 60 61 Original docstring :: 62 63 The generate_tokens() generator requires one argment, readline, which 64 must be a callable object which provides the same interface as the 65 readline() method of built-in file objects. Each call to the function 66 should return one line of input as a string. 67 68 The generator produces 5-tuples with these members: the token type; the 69 token string; a 2-tuple (srow, scol) of ints specifying the row and 70 column where the token begins in the source; a 2-tuple (erow, ecol) of 71 ints specifying the row and column where the token ends in the source; 72 and the line on which the token was found. The line passed is the 73 logical line; continuation lines are included. 74 """ 75 token_list = [] 76 lnum = parenlev = continued = 0 77 namechars = NAMECHARS 78 numchars = NUMCHARS 79 contstr, needcont = '', 0 80 contline = None 81 indents = [0] 82 last_comment = '' 83 parenlevstart = (0, 0, "") 84 85 # make the annotator happy 86 endDFA = DUMMY_DFA 87 # make the annotator happy 88 line = '' 89 pos = 0 90 lines.append("") 91 strstart = (0, 0, "") 92 for line in lines: 93 lnum = lnum + 1 94 line = universal_newline(line) 95 pos, max = 0, len(line) 96 97 if contstr: 98 if not line: 99 raise TokenError( 100 "EOF while scanning triple-quoted string literal", 101 strstart[2], strstart[0], strstart[1]+1, 102 token_list, lnum-1) 103 endmatch = endDFA.recognize(line) 104 if endmatch >= 0: 105 pos = end = endmatch 106 tok = (tokens["STRING"], contstr + line[:end], strstart[0], 107 strstart[1], line) 108 token_list.append(tok) 109 last_comment = '' 110 contstr, needcont = '', 0 111 contline = None 112 elif (needcont and not line.endswith('\\\n') and 113 not line.endswith('\\\r\n')): 114 tok = (tokens["ERRORTOKEN"], contstr + line, strstart[0], 115 strstart[1], line) 116 token_list.append(tok) 117 last_comment = '' 118 contstr = '' 119 contline = None 120 continue 121 else: 122 contstr = contstr + line 123 contline = contline + line 124 continue 125 126 elif parenlev == 0 and not continued: # new statement 127 if not line: break 128 column = 0 129 while pos < max: # measure leading whitespace 130 if line[pos] == ' ': column = column + 1 131 elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize 132 elif line[pos] == '\f': column = 0 133 else: break 134 pos = pos + 1 135 if pos == max: break 136 137 if line[pos] in '#\r\n': 138 # skip comments or blank lines 139 continue 140 141 if column > indents[-1]: # count indents or dedents 142 indents.append(column) 143 token_list.append((tokens["INDENT"], line[:pos], lnum, 0, line)) 144 last_comment = '' 145 while column < indents[-1]: 146 indents = indents[:-1] 147 token_list.append((tokens["DEDENT"], '', lnum, pos, line)) 148 last_comment = '' 149 if column != indents[-1]: 150 err = "unindent does not match any outer indentation level" 151 raise TokenIndentationError(err, line, lnum, 0, token_list) 152 153 else: # continued statement 154 if not line: 155 if parenlev > 0: 156 lnum1, start1, line1 = parenlevstart 157 raise TokenError("parenthesis is never closed", line1, 158 lnum1, start1 + 1, token_list, lnum) 159 raise TokenError("EOF in multi-line statement", line, 160 lnum, 0, token_list) 161 continued = 0 162 163 while pos < max: 164 pseudomatch = pseudoDFA.recognize(line, pos) 165 if pseudomatch >= 0: # scan for tokens 166 # JDR: Modified 167 start = whiteSpaceDFA.recognize(line, pos) 168 if start < 0: 169 start = pos 170 end = pseudomatch 171 172 if start == end: 173 raise TokenError("Unknown character", line, 174 lnum, start + 1, token_list) 175 176 pos = end 177 token, initial = line[start:end], line[start] 178 if initial in numchars or \ 179 (initial == '.' and token != '.'): # ordinary number 180 token_list.append((tokens["NUMBER"], token, lnum, start, line)) 181 last_comment = '' 182 elif initial in '\r\n': 183 if parenlev <= 0: 184 tok = (tokens["NEWLINE"], last_comment, lnum, start, line) 185 token_list.append(tok) 186 last_comment = '' 187 elif initial == '#': 188 # skip comment 189 last_comment = token 190 elif token in triple_quoted: 191 endDFA = endDFAs[token] 192 endmatch = endDFA.recognize(line, pos) 193 if endmatch >= 0: # all on one line 194 pos = endmatch 195 token = line[start:pos] 196 tok = (tokens["STRING"], token, lnum, start, line) 197 token_list.append(tok) 198 last_comment = '' 199 else: 200 strstart = (lnum, start, line) 201 contstr = line[start:] 202 contline = line 203 break 204 elif initial in single_quoted or \ 205 token[:2] in single_quoted or \ 206 token[:3] in single_quoted: 207 if token[-1] == '\n': # continued string 208 strstart = (lnum, start, line) 209 endDFA = (endDFAs[initial] or endDFAs[token[1]] or 210 endDFAs[token[2]]) 211 contstr, needcont = line[start:], 1 212 contline = line 213 break 214 else: # ordinary string 215 tok = (tokens["STRING"], token, lnum, start, line) 216 token_list.append(tok) 217 last_comment = '' 218 elif initial in namechars: # ordinary name 219 token_list.append((tokens["NAME"], token, lnum, start, line)) 220 last_comment = '' 221 elif initial == '\\': # continued stmt 222 continued = 1 223 else: 224 if initial in '([{': 225 if parenlev == 0: 226 parenlevstart = (lnum, start, line) 227 parenlev = parenlev + 1 228 elif initial in ')]}': 229 parenlev = parenlev - 1 230 if parenlev < 0: 231 raise TokenError("unmatched '%s'" % initial, line, 232 lnum, start + 1, token_list) 233 if token in python_opmap: 234 punct = python_opmap[token] 235 else: 236 punct = tokens["OP"] 237 token_list.append((punct, token, lnum, start, line)) 238 last_comment = '' 239 else: 240 start = whiteSpaceDFA.recognize(line, pos) 241 if start < 0: 242 start = pos 243 if start<max and line[start] in single_quoted: 244 raise TokenError("EOL while scanning string literal", 245 line, lnum, start+1, token_list) 246 tok = (tokens["ERRORTOKEN"], line[pos], lnum, pos, line) 247 token_list.append(tok) 248 last_comment = '' 249 pos = pos + 1 250 251 lnum -= 1 252 if not (flags & consts.PyCF_DONT_IMPLY_DEDENT): 253 if token_list and token_list[-1][0] != tokens["NEWLINE"]: 254 tok = (tokens["NEWLINE"], '', lnum, 0, '\n') 255 token_list.append(tok) 256 for indent in indents[1:]: # pop remaining indent levels 257 token_list.append((tokens["DEDENT"], '', lnum, pos, line)) 258 tok = (tokens["NEWLINE"], '', lnum, 0, '\n') 259 token_list.append(tok) 260 261 token_list.append((tokens["ENDMARKER"], '', lnum, pos, line)) 262 return token_list 263 264 265 def universal_newline(line): 266 # show annotator that indexes below are non-negative 267 line_len_m2 = len(line) - 2 268 if line_len_m2 >= 0 and line[-2] == '\r' and line[-1] == '\n': 269 return line[:line_len_m2] + '\n' 270 line_len_m1 = len(line) - 1 271 if line_len_m1 >= 0 and line[-1] == '\r': 272 return line[:line_len_m1] + '\n' 273 return line