Lichen

pyparser/pytokenizer.py

1037:2ef21d25c6e5
5 months ago Paul Boddie Merged changes from the default branch. trailing-data
     1 from pyparser import automata     2 from pyparser.pygram import tokens     3 from pyparser.pytoken import python_opmap     4 from pyparser.error import TokenError, TokenIndentationError     5 from pyparser.pytokenize import tabsize, whiteSpaceDFA, \     6     triple_quoted, endDFAs, single_quoted, pseudoDFA     7 from pyparser import consts     8      9 NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'    10 NUMCHARS = '0123456789'    11 ALNUMCHARS = NAMECHARS + NUMCHARS    12 EXTENDED_ALNUMCHARS = ALNUMCHARS + '-.'    13 WHITESPACES = ' \t\n\r\v\f'    14     15 def match_encoding_declaration(comment):    16     """returns the declared encoding or None    17     18     This function is a replacement for :    19     >>> py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)")    20     >>> py_encoding.search(comment)    21     """    22     index = comment.find('coding')    23     if index < 0:    24         return None    25     next_char = comment[index + 6]    26     if next_char not in ':=':    27         return None    28     end_of_decl = comment[index + 7:]    29     index = 0    30     for char in end_of_decl:    31         if char not in WHITESPACES:    32             break    33         index += 1    34     else:    35         return None    36     encoding = ''    37     for char in end_of_decl[index:]:    38         if char in EXTENDED_ALNUMCHARS:    39             encoding += char    40         else:    41             break    42     if encoding != '':    43         return encoding    44     return None    45     46     47 DUMMY_DFA = automata.DFA([], [])    48     49 def generate_tokens(lines, flags):    50     """    51     This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since    52     the original function is not RPYTHON (uses yield)    53     It was also slightly modified to generate Token instances instead    54     of the original 5-tuples -- it's now a 4-tuple of    55     56     * the Token instance    57     * the whole line as a string    58     * the line number (the real one, counting continuation lines)    59     * the position on the line of the end of the token.    60     61     Original docstring ::    62     63         The generate_tokens() generator requires one argment, readline, which    64         must be a callable object which provides the same interface as the    65         readline() method of built-in file objects. Each call to the function    66         should return one line of input as a string.    67     68         The generator produces 5-tuples with these members: the token type; the    69         token string; a 2-tuple (srow, scol) of ints specifying the row and    70         column where the token begins in the source; a 2-tuple (erow, ecol) of    71         ints specifying the row and column where the token ends in the source;    72         and the line on which the token was found. The line passed is the    73         logical line; continuation lines are included.    74     """    75     token_list = []    76     lnum = parenlev = continued = 0    77     namechars = NAMECHARS    78     numchars = NUMCHARS    79     contstr, needcont = '', 0    80     contline = None    81     indents = [0]    82     last_comment = ''    83     parenlevstart = (0, 0, "")    84     85     # make the annotator happy    86     endDFA = DUMMY_DFA    87     # make the annotator happy    88     line = ''    89     pos = 0    90     lines.append("")    91     strstart = (0, 0, "")    92     for line in lines:    93         lnum = lnum + 1    94         line = universal_newline(line)    95         pos, max = 0, len(line)    96     97         if contstr:    98             if not line:    99                 raise TokenError(   100                     "EOF while scanning triple-quoted string literal",   101                     strstart[2], strstart[0], strstart[1]+1,   102                     token_list, lnum-1)   103             endmatch = endDFA.recognize(line)   104             if endmatch >= 0:   105                 pos = end = endmatch   106                 tok = (tokens["STRING"], contstr + line[:end], strstart[0],   107                        strstart[1], line)   108                 token_list.append(tok)   109                 last_comment = ''   110                 contstr, needcont = '', 0   111                 contline = None   112             elif (needcont and not line.endswith('\\\n') and   113                                not line.endswith('\\\r\n')):   114                 tok = (tokens["ERRORTOKEN"], contstr + line, strstart[0],   115                        strstart[1], line)   116                 token_list.append(tok)   117                 last_comment = ''   118                 contstr = ''   119                 contline = None   120                 continue   121             else:   122                 contstr = contstr + line   123                 contline = contline + line   124                 continue   125    126         elif parenlev == 0 and not continued:  # new statement   127             if not line: break   128             column = 0   129             while pos < max:                   # measure leading whitespace   130                 if line[pos] == ' ': column = column + 1   131                 elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize   132                 elif line[pos] == '\f': column = 0   133                 else: break   134                 pos = pos + 1   135             if pos == max: break   136    137             if line[pos] in '#\r\n':   138                 # skip comments or blank lines   139                 continue   140    141             if column > indents[-1]:           # count indents or dedents   142                 indents.append(column)   143                 token_list.append((tokens["INDENT"], line[:pos], lnum, 0, line))   144                 last_comment = ''   145             while column < indents[-1]:   146                 indents = indents[:-1]   147                 token_list.append((tokens["DEDENT"], '', lnum, pos, line))   148                 last_comment = ''   149             if column != indents[-1]:   150                 err = "unindent does not match any outer indentation level"   151                 raise TokenIndentationError(err, line, lnum, 0, token_list)   152    153         else:                                  # continued statement   154             if not line:   155                 if parenlev > 0:   156                     lnum1, start1, line1 = parenlevstart   157                     raise TokenError("parenthesis is never closed", line1,   158                                      lnum1, start1 + 1, token_list, lnum)   159                 raise TokenError("EOF in multi-line statement", line,   160                                  lnum, 0, token_list)   161             continued = 0   162    163         while pos < max:   164             pseudomatch = pseudoDFA.recognize(line, pos)   165             if pseudomatch >= 0:                            # scan for tokens   166                 # JDR: Modified   167                 start = whiteSpaceDFA.recognize(line, pos)   168                 if start < 0:   169                     start = pos   170                 end = pseudomatch   171    172                 if start == end:   173                     raise TokenError("Unknown character", line,   174                                      lnum, start + 1, token_list)   175    176                 pos = end   177                 token, initial = line[start:end], line[start]   178                 if initial in numchars or \   179                    (initial == '.' and token != '.'):      # ordinary number   180                     token_list.append((tokens["NUMBER"], token, lnum, start, line))   181                     last_comment = ''   182                 elif initial in '\r\n':   183                     if parenlev <= 0:   184                         tok = (tokens["NEWLINE"], last_comment, lnum, start, line)   185                         token_list.append(tok)   186                     last_comment = ''   187                 elif initial == '#':   188                     # skip comment   189                     last_comment = token   190                 elif token in triple_quoted:   191                     endDFA = endDFAs[token]   192                     endmatch = endDFA.recognize(line, pos)   193                     if endmatch >= 0:                     # all on one line   194                         pos = endmatch   195                         token = line[start:pos]   196                         tok = (tokens["STRING"], token, lnum, start, line)   197                         token_list.append(tok)   198                         last_comment = ''   199                     else:   200                         strstart = (lnum, start, line)   201                         contstr = line[start:]   202                         contline = line   203                         break   204                 elif initial in single_quoted or \   205                     token[:2] in single_quoted or \   206                     token[:3] in single_quoted:   207                     if token[-1] == '\n':                  # continued string   208                         strstart = (lnum, start, line)   209                         endDFA = (endDFAs[initial] or endDFAs[token[1]] or   210                                    endDFAs[token[2]])   211                         contstr, needcont = line[start:], 1   212                         contline = line   213                         break   214                     else:                                  # ordinary string   215                         tok = (tokens["STRING"], token, lnum, start, line)   216                         token_list.append(tok)   217                         last_comment = ''   218                 elif initial in namechars:                 # ordinary name   219                     token_list.append((tokens["NAME"], token, lnum, start, line))   220                     last_comment = ''   221                 elif initial == '\\':                      # continued stmt   222                     continued = 1   223                 else:   224                     if initial in '([{':   225                         if parenlev == 0:   226                             parenlevstart = (lnum, start, line)   227                         parenlev = parenlev + 1   228                     elif initial in ')]}':   229                         parenlev = parenlev - 1   230                         if parenlev < 0:   231                             raise TokenError("unmatched '%s'" % initial, line,   232                                              lnum, start + 1, token_list)   233                     if token in python_opmap:   234                         punct = python_opmap[token]   235                     else:   236                         punct = tokens["OP"]   237                     token_list.append((punct, token, lnum, start, line))   238                     last_comment = ''   239             else:   240                 start = whiteSpaceDFA.recognize(line, pos)   241                 if start < 0:   242                     start = pos   243                 if start<max and line[start] in single_quoted:   244                     raise TokenError("EOL while scanning string literal",   245                              line, lnum, start+1, token_list)   246                 tok = (tokens["ERRORTOKEN"], line[pos], lnum, pos, line)   247                 token_list.append(tok)   248                 last_comment = ''   249                 pos = pos + 1   250    251     lnum -= 1   252     if not (flags & consts.PyCF_DONT_IMPLY_DEDENT):   253         if token_list and token_list[-1][0] != tokens["NEWLINE"]:   254             tok = (tokens["NEWLINE"], '', lnum, 0, '\n')   255             token_list.append(tok)   256         for indent in indents[1:]:                # pop remaining indent levels   257             token_list.append((tokens["DEDENT"], '', lnum, pos, line))   258     tok = (tokens["NEWLINE"], '', lnum, 0, '\n')   259     token_list.append(tok)   260    261     token_list.append((tokens["ENDMARKER"], '', lnum, pos, line))   262     return token_list   263    264    265 def universal_newline(line):   266     # show annotator that indexes below are non-negative   267     line_len_m2 = len(line) - 2   268     if line_len_m2 >= 0 and line[-2] == '\r' and line[-1] == '\n':   269         return line[:line_len_m2] + '\n'   270     line_len_m1 = len(line) - 1   271     if line_len_m1 >= 0 and line[-1] == '\r':   272         return line[:line_len_m1] + '\n'   273     return line