Lichen

lib/sre_parse.py

304:e4bad4af5d43
2016-12-03 Paul Boddie Changed the dict implementation to not use native functions or types directly for its representation.
     1 #     2 # Secret Labs' Regular Expression Engine     3 #     4 # convert re-style regular expression to sre pattern     5 #     6 # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.     7 #     8 # See the sre.py file for information on usage and redistribution.     9 #    10     11 """Internal support module for sre"""    12     13 # XXX: show string offset and offending character for all errors    14     15 import sys    16     17 from sre_constants import *    18     19 SPECIAL_CHARS = ".\\[{()*+?^$|"    20 REPEAT_CHARS = "*+?{"    21     22 DIGITS = set("0123456789")    23     24 OCTDIGITS = set("01234567")    25 HEXDIGITS = set("0123456789abcdefABCDEF")    26     27 WHITESPACE = set(" \t\n\r\v\f")    28     29 ESCAPES = {    30     r"\a": (LITERAL, ord("\a")),    31     r"\b": (LITERAL, ord("\b")),    32     r"\f": (LITERAL, ord("\f")),    33     r"\n": (LITERAL, ord("\n")),    34     r"\r": (LITERAL, ord("\r")),    35     r"\t": (LITERAL, ord("\t")),    36     r"\v": (LITERAL, ord("\v")),    37     r"\\": (LITERAL, ord("\\"))    38 }    39     40 CATEGORIES = {    41     r"\A": (AT, AT_BEGINNING_STRING), # start of string    42     r"\b": (AT, AT_BOUNDARY),    43     r"\B": (AT, AT_NON_BOUNDARY),    44     r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),    45     r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),    46     r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),    47     r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),    48     r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),    49     r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),    50     r"\Z": (AT, AT_END_STRING), # end of string    51 }    52     53 FLAGS = {    54     # standard flags    55     "i": SRE_FLAG_IGNORECASE,    56     "L": SRE_FLAG_LOCALE,    57     "m": SRE_FLAG_MULTILINE,    58     "s": SRE_FLAG_DOTALL,    59     "x": SRE_FLAG_VERBOSE,    60     # extensions    61     "t": SRE_FLAG_TEMPLATE,    62     "u": SRE_FLAG_UNICODE,    63 }    64     65 class Pattern:    66     # master pattern object.  keeps track of global attributes    67     def __init__(self):    68         self.flags = 0    69         self.open = []    70         self.groups = 1    71         self.groupdict = {}    72         self.str = None    73     def opengroup(self, name=None):    74         gid = self.groups    75         self.groups = gid + 1    76         if name is not None:    77             ogid = self.groupdict.get(name, None)    78             if ogid is not None:    79                 raise error, ("redefinition of group name %s as group %d; "    80                               "was group %d" % (repr(name), gid,  ogid))    81             self.groupdict[name] = gid    82         self.open.append(gid)    83         return gid    84     def closegroup(self, gid):    85         self.open.remove(gid)    86     def checkgroup(self, gid):    87         return gid < self.groups and gid not in self.open    88     89 class SubPattern:    90     # a subpattern, in intermediate form    91     def __init__(self, pattern, data=None):    92         self.pattern = pattern    93         if data is None:    94             data = []    95         self.data = data    96         self.width = None    97     def dump(self, level=0):    98         nl = 1    99         seqtypes = type(()), type([])   100         for op, av in self.data:   101             print level*"  " + op,; nl = 0   102             if op == "in":   103                 # member sublanguage   104                 print; nl = 1   105                 for op, a in av:   106                     print (level+1)*"  " + op, a   107             elif op == "branch":   108                 print; nl = 1   109                 i = 0   110                 for a in av[1]:   111                     if i > 0:   112                         print level*"  " + "or"   113                     a.dump(level+1); nl = 1   114                     i = i + 1   115             elif type(av) in seqtypes:   116                 for a in av:   117                     if isinstance(a, SubPattern):   118                         if not nl: print   119                         a.dump(level+1); nl = 1   120                     else:   121                         print a, ; nl = 0   122             else:   123                 print av, ; nl = 0   124             if not nl: print   125     def __repr__(self):   126         return repr(self.data)   127     def __len__(self):   128         return len(self.data)   129     def __delitem__(self, index):   130         del self.data[index]   131     def __getitem__(self, index):   132         if isinstance(index, slice):   133             return SubPattern(self.pattern, self.data[index])   134         return self.data[index]   135     def __setitem__(self, index, code):   136         self.data[index] = code   137     def insert(self, index, code):   138         self.data.insert(index, code)   139     def append(self, code):   140         self.data.append(code)   141     def getwidth(self):   142         # determine the width (min, max) for this subpattern   143         if self.width:   144             return self.width   145         lo = hi = 0L   146         UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)   147         REPEATCODES = (MIN_REPEAT, MAX_REPEAT)   148         for op, av in self.data:   149             if op is BRANCH:   150                 i = sys.maxint   151                 j = 0   152                 for av in av[1]:   153                     l, h = av.getwidth()   154                     i = min(i, l)   155                     j = max(j, h)   156                 lo = lo + i   157                 hi = hi + j   158             elif op is CALL:   159                 i, j = av.getwidth()   160                 lo = lo + i   161                 hi = hi + j   162             elif op is SUBPATTERN:   163                 i, j = av[1].getwidth()   164                 lo = lo + i   165                 hi = hi + j   166             elif op in REPEATCODES:   167                 i, j = av[2].getwidth()   168                 lo = lo + long(i) * av[0]   169                 hi = hi + long(j) * av[1]   170             elif op in UNITCODES:   171                 lo = lo + 1   172                 hi = hi + 1   173             elif op == SUCCESS:   174                 break   175         self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))   176         return self.width   177    178 class Tokenizer:   179     def __init__(self, string):   180         self.string = string   181         self.index = 0   182         self.__next()   183     def __next(self):   184         if self.index >= len(self.string):   185             self.next = None   186             return   187         char = self.string[self.index]   188         if char[0] == "\\":   189             try:   190                 c = self.string[self.index + 1]   191             except IndexError:   192                 raise error, "bogus escape (end of line)"   193             char = char + c   194         self.index = self.index + len(char)   195         self.next = char   196     def match(self, char, skip=1):   197         if char == self.next:   198             if skip:   199                 self.__next()   200             return 1   201         return 0   202     def get(self):   203         this = self.next   204         self.__next()   205         return this   206     def tell(self):   207         return self.index, self.next   208     def seek(self, index):   209         self.index, self.next = index   210    211 def isident(char):   212     return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"   213    214 def isdigit(char):   215     return "0" <= char <= "9"   216    217 def isname(name):   218     # check that group name is a valid string   219     if not isident(name[0]):   220         return False   221     for char in name[1:]:   222         if not isident(char) and not isdigit(char):   223             return False   224     return True   225    226 def _class_escape(source, escape):   227     # handle escape code inside character class   228     code = ESCAPES.get(escape)   229     if code:   230         return code   231     code = CATEGORIES.get(escape)   232     if code:   233         return code   234     try:   235         c = escape[1:2]   236         if c == "x":   237             # hexadecimal escape (exactly two digits)   238             while source.next in HEXDIGITS and len(escape) < 4:   239                 escape = escape + source.get()   240             escape = escape[2:]   241             if len(escape) != 2:   242                 raise error, "bogus escape: %s" % repr("\\" + escape)   243             return LITERAL, int(escape, 16) & 0xff   244         elif c in OCTDIGITS:   245             # octal escape (up to three digits)   246             while source.next in OCTDIGITS and len(escape) < 4:   247                 escape = escape + source.get()   248             escape = escape[1:]   249             return LITERAL, int(escape, 8) & 0xff   250         elif c in DIGITS:   251             raise error, "bogus escape: %s" % repr(escape)   252         if len(escape) == 2:   253             return LITERAL, ord(escape[1])   254     except ValueError:   255         pass   256     raise error, "bogus escape: %s" % repr(escape)   257    258 def _escape(source, escape, state):   259     # handle escape code in expression   260     code = CATEGORIES.get(escape)   261     if code:   262         return code   263     code = ESCAPES.get(escape)   264     if code:   265         return code   266     try:   267         c = escape[1:2]   268         if c == "x":   269             # hexadecimal escape   270             while source.next in HEXDIGITS and len(escape) < 4:   271                 escape = escape + source.get()   272             if len(escape) != 4:   273                 raise ValueError   274             return LITERAL, int(escape[2:], 16) & 0xff   275         elif c == "0":   276             # octal escape   277             while source.next in OCTDIGITS and len(escape) < 4:   278                 escape = escape + source.get()   279             return LITERAL, int(escape[1:], 8) & 0xff   280         elif c in DIGITS:   281             # octal escape *or* decimal group reference (sigh)   282             if source.next in DIGITS:   283                 escape = escape + source.get()   284                 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and   285                     source.next in OCTDIGITS):   286                     # got three octal digits; this is an octal escape   287                     escape = escape + source.get()   288                     return LITERAL, int(escape[1:], 8) & 0xff   289             # not an octal escape, so this is a group reference   290             group = int(escape[1:])   291             if group < state.groups:   292                 if not state.checkgroup(group):   293                     raise error, "cannot refer to open group"   294                 return GROUPREF, group   295             raise ValueError   296         if len(escape) == 2:   297             return LITERAL, ord(escape[1])   298     except ValueError:   299         pass   300     raise error, "bogus escape: %s" % repr(escape)   301    302 def _parse_sub(source, state, nested=1):   303     # parse an alternation: a|b|c   304    305     items = []   306     itemsappend = items.append   307     sourcematch = source.match   308     while 1:   309         itemsappend(_parse(source, state))   310         if sourcematch("|"):   311             continue   312         if not nested:   313             break   314         if not source.next or sourcematch(")", 0):   315             break   316         else:   317             raise error, "pattern not properly closed"   318    319     if len(items) == 1:   320         return items[0]   321    322     subpattern = SubPattern(state)   323     subpatternappend = subpattern.append   324    325     # check if all items share a common prefix   326     while 1:   327         prefix = None   328         for item in items:   329             if not item:   330                 break   331             if prefix is None:   332                 prefix = item[0]   333             elif item[0] != prefix:   334                 break   335         else:   336             # all subitems start with a common "prefix".   337             # move it out of the branch   338             for item in items:   339                 del item[0]   340             subpatternappend(prefix)   341             continue # check next one   342         break   343    344     # check if the branch can be replaced by a character set   345     for item in items:   346         if len(item) != 1 or item[0][0] != LITERAL:   347             break   348     else:   349         # we can store this as a character set instead of a   350         # branch (the compiler may optimize this even more)   351         set = []   352         setappend = set.append   353         for item in items:   354             setappend(item[0])   355         subpatternappend((IN, set))   356         return subpattern   357    358     subpattern.append((BRANCH, (None, items)))   359     return subpattern   360    361 def _parse_sub_cond(source, state, condgroup):   362     item_yes = _parse(source, state)   363     if source.match("|"):   364         item_no = _parse(source, state)   365         if source.match("|"):   366             raise error, "conditional backref with more than two branches"   367     else:   368         item_no = None   369     if source.next and not source.match(")", 0):   370         raise error, "pattern not properly closed"   371     subpattern = SubPattern(state)   372     subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))   373     return subpattern   374    375 _PATTERNENDERS = set("|)")   376 _ASSERTCHARS = set("=!<")   377 _LOOKBEHINDASSERTCHARS = set("=!")   378 _REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])   379    380 def _parse(source, state):   381     # parse a simple pattern   382     subpattern = SubPattern(state)   383    384     # precompute constants into local variables   385     subpatternappend = subpattern.append   386     sourceget = source.get   387     sourcematch = source.match   388     _len = len   389     PATTERNENDERS = _PATTERNENDERS   390     ASSERTCHARS = _ASSERTCHARS   391     LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS   392     REPEATCODES = _REPEATCODES   393    394     while 1:   395    396         if source.next in PATTERNENDERS:   397             break # end of subpattern   398         this = sourceget()   399         if this is None:   400             break # end of pattern   401    402         if state.flags & SRE_FLAG_VERBOSE:   403             # skip whitespace and comments   404             if this in WHITESPACE:   405                 continue   406             if this == "#":   407                 while 1:   408                     this = sourceget()   409                     if this in (None, "\n"):   410                         break   411                 continue   412    413         if this and this[0] not in SPECIAL_CHARS:   414             subpatternappend((LITERAL, ord(this)))   415    416         elif this == "[":   417             # character set   418             set = []   419             setappend = set.append   420 ##          if sourcematch(":"):   421 ##              pass # handle character classes   422             if sourcematch("^"):   423                 setappend((NEGATE, None))   424             # check remaining characters   425             start = set[:]   426             while 1:   427                 this = sourceget()   428                 if this == "]" and set != start:   429                     break   430                 elif this and this[0] == "\\":   431                     code1 = _class_escape(source, this)   432                 elif this:   433                     code1 = LITERAL, ord(this)   434                 else:   435                     raise error, "unexpected end of regular expression"   436                 if sourcematch("-"):   437                     # potential range   438                     this = sourceget()   439                     if this == "]":   440                         if code1[0] is IN:   441                             code1 = code1[1][0]   442                         setappend(code1)   443                         setappend((LITERAL, ord("-")))   444                         break   445                     elif this:   446                         if this[0] == "\\":   447                             code2 = _class_escape(source, this)   448                         else:   449                             code2 = LITERAL, ord(this)   450                         if code1[0] != LITERAL or code2[0] != LITERAL:   451                             raise error, "bad character range"   452                         lo = code1[1]   453                         hi = code2[1]   454                         if hi < lo:   455                             raise error, "bad character range"   456                         setappend((RANGE, (lo, hi)))   457                     else:   458                         raise error, "unexpected end of regular expression"   459                 else:   460                     if code1[0] is IN:   461                         code1 = code1[1][0]   462                     setappend(code1)   463    464             # XXX: <fl> should move set optimization to compiler!   465             if _len(set)==1 and set[0][0] is LITERAL:   466                 subpatternappend(set[0]) # optimization   467             elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:   468                 subpatternappend((NOT_LITERAL, set[1][1])) # optimization   469             else:   470                 # XXX: <fl> should add charmap optimization here   471                 subpatternappend((IN, set))   472    473         elif this and this[0] in REPEAT_CHARS:   474             # repeat previous item   475             if this == "?":   476                 min, max = 0, 1   477             elif this == "*":   478                 min, max = 0, MAXREPEAT   479    480             elif this == "+":   481                 min, max = 1, MAXREPEAT   482             elif this == "{":   483                 if source.next == "}":   484                     subpatternappend((LITERAL, ord(this)))   485                     continue   486                 here = source.tell()   487                 min, max = 0, MAXREPEAT   488                 lo = hi = ""   489                 while source.next in DIGITS:   490                     lo = lo + source.get()   491                 if sourcematch(","):   492                     while source.next in DIGITS:   493                         hi = hi + sourceget()   494                 else:   495                     hi = lo   496                 if not sourcematch("}"):   497                     subpatternappend((LITERAL, ord(this)))   498                     source.seek(here)   499                     continue   500                 if lo:   501                     min = int(lo)   502                 if hi:   503                     max = int(hi)   504                 if max < min:   505                     raise error, "bad repeat interval"   506             else:   507                 raise error, "not supported"   508             # figure out which item to repeat   509             if subpattern:   510                 item = subpattern[-1:]   511             else:   512                 item = None   513             if not item or (_len(item) == 1 and item[0][0] == AT):   514                 raise error, "nothing to repeat"   515             if item[0][0] in REPEATCODES:   516                 raise error, "multiple repeat"   517             if sourcematch("?"):   518                 subpattern[-1] = (MIN_REPEAT, (min, max, item))   519             else:   520                 subpattern[-1] = (MAX_REPEAT, (min, max, item))   521    522         elif this == ".":   523             subpatternappend((ANY, None))   524    525         elif this == "(":   526             group = 1   527             name = None   528             condgroup = None   529             if sourcematch("?"):   530                 group = 0   531                 # options   532                 if sourcematch("P"):   533                     # python extensions   534                     if sourcematch("<"):   535                         # named group: skip forward to end of name   536                         name = ""   537                         while 1:   538                             char = sourceget()   539                             if char is None:   540                                 raise error, "unterminated name"   541                             if char == ">":   542                                 break   543                             name = name + char   544                         group = 1   545                         if not isname(name):   546                             raise error, "bad character in group name"   547                     elif sourcematch("="):   548                         # named backreference   549                         name = ""   550                         while 1:   551                             char = sourceget()   552                             if char is None:   553                                 raise error, "unterminated name"   554                             if char == ")":   555                                 break   556                             name = name + char   557                         if not isname(name):   558                             raise error, "bad character in group name"   559                         gid = state.groupdict.get(name)   560                         if gid is None:   561                             raise error, "unknown group name"   562                         subpatternappend((GROUPREF, gid))   563                         continue   564                     else:   565                         char = sourceget()   566                         if char is None:   567                             raise error, "unexpected end of pattern"   568                         raise error, "unknown specifier: ?P%s" % char   569                 elif sourcematch(":"):   570                     # non-capturing group   571                     group = 2   572                 elif sourcematch("#"):   573                     # comment   574                     while 1:   575                         if source.next is None or source.next == ")":   576                             break   577                         sourceget()   578                     if not sourcematch(")"):   579                         raise error, "unbalanced parenthesis"   580                     continue   581                 elif source.next in ASSERTCHARS:   582                     # lookahead assertions   583                     char = sourceget()   584                     dir = 1   585                     if char == "<":   586                         if source.next not in LOOKBEHINDASSERTCHARS:   587                             raise error, "syntax error"   588                         dir = -1 # lookbehind   589                         char = sourceget()   590                     p = _parse_sub(source, state)   591                     if not sourcematch(")"):   592                         raise error, "unbalanced parenthesis"   593                     if char == "=":   594                         subpatternappend((ASSERT, (dir, p)))   595                     else:   596                         subpatternappend((ASSERT_NOT, (dir, p)))   597                     continue   598                 elif sourcematch("("):   599                     # conditional backreference group   600                     condname = ""   601                     while 1:   602                         char = sourceget()   603                         if char is None:   604                             raise error, "unterminated name"   605                         if char == ")":   606                             break   607                         condname = condname + char   608                     group = 2   609                     if isname(condname):   610                         condgroup = state.groupdict.get(condname)   611                         if condgroup is None:   612                             raise error, "unknown group name"   613                     else:   614                         try:   615                             condgroup = int(condname)   616                         except ValueError:   617                             raise error, "bad character in group name"   618                 else:   619                     # flags   620                     if not source.next in FLAGS:   621                         raise error, "unexpected end of pattern"   622                     while source.next in FLAGS:   623                         state.flags = state.flags | FLAGS[sourceget()]   624             if group:   625                 # parse group contents   626                 if group == 2:   627                     # anonymous group   628                     group = None   629                 else:   630                     group = state.opengroup(name)   631                 if condgroup:   632                     p = _parse_sub_cond(source, state, condgroup)   633                 else:   634                     p = _parse_sub(source, state)   635                 if not sourcematch(")"):   636                     raise error, "unbalanced parenthesis"   637                 if group is not None:   638                     state.closegroup(group)   639                 subpatternappend((SUBPATTERN, (group, p)))   640             else:   641                 while 1:   642                     char = sourceget()   643                     if char is None:   644                         raise error, "unexpected end of pattern"   645                     if char == ")":   646                         break   647                     raise error, "unknown extension"   648    649         elif this == "^":   650             subpatternappend((AT, AT_BEGINNING))   651    652         elif this == "$":   653             subpattern.append((AT, AT_END))   654    655         elif this and this[0] == "\\":   656             code = _escape(source, this, state)   657             subpatternappend(code)   658    659         else:   660             raise error, "parser error"   661    662     return subpattern   663    664 def parse(str, flags=0, pattern=None):   665     # parse 're' pattern into list of (opcode, argument) tuples   666    667     source = Tokenizer(str)   668    669     if pattern is None:   670         pattern = Pattern()   671     pattern.flags = flags   672     pattern.str = str   673    674     p = _parse_sub(source, pattern, 0)   675    676     tail = source.get()   677     if tail == ")":   678         raise error, "unbalanced parenthesis"   679     elif tail:   680         raise error, "bogus characters at end of regular expression"   681    682     if flags & SRE_FLAG_DEBUG:   683         p.dump()   684    685     if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:   686         # the VERBOSE flag was switched on inside the pattern.  to be   687         # on the safe side, we'll parse the whole thing again...   688         return parse(str, p.pattern.flags)   689    690     return p   691    692 def parse_template(source, pattern):   693     # parse 're' replacement string into list of literals and   694     # group references   695     s = Tokenizer(source)   696     sget = s.get   697     p = []   698     a = p.append   699     def literal(literal, p=p, pappend=a):   700         if p and p[-1][0] is LITERAL:   701             p[-1] = LITERAL, p[-1][1] + literal   702         else:   703             pappend((LITERAL, literal))   704     sep = source[:0]   705     if type(sep) is type(""):   706         makechar = chr   707     else:   708         makechar = unichr   709     while 1:   710         this = sget()   711         if this is None:   712             break # end of replacement string   713         if this and this[0] == "\\":   714             # group   715             c = this[1:2]   716             if c == "g":   717                 name = ""   718                 if s.match("<"):   719                     while 1:   720                         char = sget()   721                         if char is None:   722                             raise error, "unterminated group name"   723                         if char == ">":   724                             break   725                         name = name + char   726                 if not name:   727                     raise error, "bad group name"   728                 try:   729                     index = int(name)   730                     if index < 0:   731                         raise error, "negative group number"   732                 except ValueError:   733                     if not isname(name):   734                         raise error, "bad character in group name"   735                     try:   736                         index = pattern.groupindex[name]   737                     except KeyError:   738                         raise IndexError, "unknown group name"   739                 a((MARK, index))   740             elif c == "0":   741                 if s.next in OCTDIGITS:   742                     this = this + sget()   743                     if s.next in OCTDIGITS:   744                         this = this + sget()   745                 literal(makechar(int(this[1:], 8) & 0xff))   746             elif c in DIGITS:   747                 isoctal = False   748                 if s.next in DIGITS:   749                     this = this + sget()   750                     if (c in OCTDIGITS and this[2] in OCTDIGITS and   751                         s.next in OCTDIGITS):   752                         this = this + sget()   753                         isoctal = True   754                         literal(makechar(int(this[1:], 8) & 0xff))   755                 if not isoctal:   756                     a((MARK, int(this[1:])))   757             else:   758                 try:   759                     this = makechar(ESCAPES[this][1])   760                 except KeyError:   761                     pass   762                 literal(this)   763         else:   764             literal(this)   765     # convert template to groups and literals lists   766     i = 0   767     groups = []   768     groupsappend = groups.append   769     literals = [None] * len(p)   770     for c, s in p:   771         if c is MARK:   772             groupsappend((i, s))   773             # literal[i] is already None   774         else:   775             literals[i] = s   776         i = i + 1   777     return groups, literals   778    779 def expand_template(template, match):   780     g = match.group   781     sep = match.string[:0]   782     groups, literals = template   783     literals = literals[:]   784     try:   785         for index, group in groups:   786             literals[index] = s = g(group)   787             if s is None:   788                 raise error, "unmatched group"   789     except IndexError:   790         raise error, "invalid group reference"   791     return sep.join(literals)