paul@6 | 1 | # |
paul@6 | 2 | # Secret Labs' Regular Expression Engine |
paul@6 | 3 | # |
paul@6 | 4 | # convert re-style regular expression to sre pattern |
paul@6 | 5 | # |
paul@6 | 6 | # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. |
paul@6 | 7 | # |
paul@6 | 8 | # See the sre.py file for information on usage and redistribution. |
paul@6 | 9 | # |
paul@6 | 10 | |
paul@6 | 11 | """Internal support module for sre""" |
paul@6 | 12 | |
paul@6 | 13 | # XXX: show string offset and offending character for all errors |
paul@6 | 14 | |
paul@6 | 15 | import sys |
paul@6 | 16 | |
paul@6 | 17 | from sre_constants import * |
paul@6 | 18 | |
paul@6 | 19 | SPECIAL_CHARS = ".\\[{()*+?^$|" |
paul@6 | 20 | REPEAT_CHARS = "*+?{" |
paul@6 | 21 | |
paul@6 | 22 | DIGITS = set("0123456789") |
paul@6 | 23 | |
paul@6 | 24 | OCTDIGITS = set("01234567") |
paul@6 | 25 | HEXDIGITS = set("0123456789abcdefABCDEF") |
paul@6 | 26 | |
paul@6 | 27 | WHITESPACE = set(" \t\n\r\v\f") |
paul@6 | 28 | |
paul@6 | 29 | ESCAPES = { |
paul@6 | 30 | r"\a": (LITERAL, ord("\a")), |
paul@6 | 31 | r"\b": (LITERAL, ord("\b")), |
paul@6 | 32 | r"\f": (LITERAL, ord("\f")), |
paul@6 | 33 | r"\n": (LITERAL, ord("\n")), |
paul@6 | 34 | r"\r": (LITERAL, ord("\r")), |
paul@6 | 35 | r"\t": (LITERAL, ord("\t")), |
paul@6 | 36 | r"\v": (LITERAL, ord("\v")), |
paul@6 | 37 | r"\\": (LITERAL, ord("\\")) |
paul@6 | 38 | } |
paul@6 | 39 | |
paul@6 | 40 | CATEGORIES = { |
paul@6 | 41 | r"\A": (AT, AT_BEGINNING_STRING), # start of string |
paul@6 | 42 | r"\b": (AT, AT_BOUNDARY), |
paul@6 | 43 | r"\B": (AT, AT_NON_BOUNDARY), |
paul@6 | 44 | r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), |
paul@6 | 45 | r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), |
paul@6 | 46 | r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), |
paul@6 | 47 | r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), |
paul@6 | 48 | r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), |
paul@6 | 49 | r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), |
paul@6 | 50 | r"\Z": (AT, AT_END_STRING), # end of string |
paul@6 | 51 | } |
paul@6 | 52 | |
paul@6 | 53 | FLAGS = { |
paul@6 | 54 | # standard flags |
paul@6 | 55 | "i": SRE_FLAG_IGNORECASE, |
paul@6 | 56 | "L": SRE_FLAG_LOCALE, |
paul@6 | 57 | "m": SRE_FLAG_MULTILINE, |
paul@6 | 58 | "s": SRE_FLAG_DOTALL, |
paul@6 | 59 | "x": SRE_FLAG_VERBOSE, |
paul@6 | 60 | # extensions |
paul@6 | 61 | "t": SRE_FLAG_TEMPLATE, |
paul@6 | 62 | "u": SRE_FLAG_UNICODE, |
paul@6 | 63 | } |
paul@6 | 64 | |
paul@6 | 65 | class Pattern: |
paul@6 | 66 | # master pattern object. keeps track of global attributes |
paul@6 | 67 | def __init__(self): |
paul@6 | 68 | self.flags = 0 |
paul@6 | 69 | self.open = [] |
paul@6 | 70 | self.groups = 1 |
paul@6 | 71 | self.groupdict = {} |
paul@6 | 72 | self.str = None |
paul@6 | 73 | def opengroup(self, name=None): |
paul@6 | 74 | gid = self.groups |
paul@6 | 75 | self.groups = gid + 1 |
paul@6 | 76 | if name is not None: |
paul@6 | 77 | ogid = self.groupdict.get(name, None) |
paul@6 | 78 | if ogid is not None: |
paul@6 | 79 | raise error, ("redefinition of group name %s as group %d; " |
paul@6 | 80 | "was group %d" % (repr(name), gid, ogid)) |
paul@6 | 81 | self.groupdict[name] = gid |
paul@6 | 82 | self.open.append(gid) |
paul@6 | 83 | return gid |
paul@6 | 84 | def closegroup(self, gid): |
paul@6 | 85 | self.open.remove(gid) |
paul@6 | 86 | def checkgroup(self, gid): |
paul@6 | 87 | return gid < self.groups and gid not in self.open |
paul@6 | 88 | |
paul@6 | 89 | class SubPattern: |
paul@6 | 90 | # a subpattern, in intermediate form |
paul@6 | 91 | def __init__(self, pattern, data=None): |
paul@6 | 92 | self.pattern = pattern |
paul@6 | 93 | if data is None: |
paul@6 | 94 | data = [] |
paul@6 | 95 | self.data = data |
paul@6 | 96 | self.width = None |
paul@6 | 97 | def dump(self, level=0): |
paul@6 | 98 | nl = 1 |
paul@6 | 99 | seqtypes = type(()), type([]) |
paul@6 | 100 | for op, av in self.data: |
paul@6 | 101 | print level*" " + op,; nl = 0 |
paul@6 | 102 | if op == "in": |
paul@6 | 103 | # member sublanguage |
paul@6 | 104 | print; nl = 1 |
paul@6 | 105 | for op, a in av: |
paul@6 | 106 | print (level+1)*" " + op, a |
paul@6 | 107 | elif op == "branch": |
paul@6 | 108 | print; nl = 1 |
paul@6 | 109 | i = 0 |
paul@6 | 110 | for a in av[1]: |
paul@6 | 111 | if i > 0: |
paul@6 | 112 | print level*" " + "or" |
paul@6 | 113 | a.dump(level+1); nl = 1 |
paul@6 | 114 | i = i + 1 |
paul@6 | 115 | elif type(av) in seqtypes: |
paul@6 | 116 | for a in av: |
paul@6 | 117 | if isinstance(a, SubPattern): |
paul@6 | 118 | if not nl: print |
paul@6 | 119 | a.dump(level+1); nl = 1 |
paul@6 | 120 | else: |
paul@6 | 121 | print a, ; nl = 0 |
paul@6 | 122 | else: |
paul@6 | 123 | print av, ; nl = 0 |
paul@6 | 124 | if not nl: print |
paul@6 | 125 | def __repr__(self): |
paul@6 | 126 | return repr(self.data) |
paul@6 | 127 | def __len__(self): |
paul@6 | 128 | return len(self.data) |
paul@6 | 129 | def __delitem__(self, index): |
paul@6 | 130 | del self.data[index] |
paul@6 | 131 | def __getitem__(self, index): |
paul@6 | 132 | if isinstance(index, slice): |
paul@6 | 133 | return SubPattern(self.pattern, self.data[index]) |
paul@6 | 134 | return self.data[index] |
paul@6 | 135 | def __setitem__(self, index, code): |
paul@6 | 136 | self.data[index] = code |
paul@6 | 137 | def insert(self, index, code): |
paul@6 | 138 | self.data.insert(index, code) |
paul@6 | 139 | def append(self, code): |
paul@6 | 140 | self.data.append(code) |
paul@6 | 141 | def getwidth(self): |
paul@6 | 142 | # determine the width (min, max) for this subpattern |
paul@6 | 143 | if self.width: |
paul@6 | 144 | return self.width |
paul@6 | 145 | lo = hi = 0L |
paul@6 | 146 | UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) |
paul@6 | 147 | REPEATCODES = (MIN_REPEAT, MAX_REPEAT) |
paul@6 | 148 | for op, av in self.data: |
paul@6 | 149 | if op is BRANCH: |
paul@6 | 150 | i = sys.maxint |
paul@6 | 151 | j = 0 |
paul@6 | 152 | for av in av[1]: |
paul@6 | 153 | l, h = av.getwidth() |
paul@6 | 154 | i = min(i, l) |
paul@6 | 155 | j = max(j, h) |
paul@6 | 156 | lo = lo + i |
paul@6 | 157 | hi = hi + j |
paul@6 | 158 | elif op is CALL: |
paul@6 | 159 | i, j = av.getwidth() |
paul@6 | 160 | lo = lo + i |
paul@6 | 161 | hi = hi + j |
paul@6 | 162 | elif op is SUBPATTERN: |
paul@6 | 163 | i, j = av[1].getwidth() |
paul@6 | 164 | lo = lo + i |
paul@6 | 165 | hi = hi + j |
paul@6 | 166 | elif op in REPEATCODES: |
paul@6 | 167 | i, j = av[2].getwidth() |
paul@6 | 168 | lo = lo + long(i) * av[0] |
paul@6 | 169 | hi = hi + long(j) * av[1] |
paul@6 | 170 | elif op in UNITCODES: |
paul@6 | 171 | lo = lo + 1 |
paul@6 | 172 | hi = hi + 1 |
paul@6 | 173 | elif op == SUCCESS: |
paul@6 | 174 | break |
paul@6 | 175 | self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) |
paul@6 | 176 | return self.width |
paul@6 | 177 | |
paul@6 | 178 | class Tokenizer: |
paul@6 | 179 | def __init__(self, string): |
paul@6 | 180 | self.string = string |
paul@6 | 181 | self.index = 0 |
paul@6 | 182 | self.__next() |
paul@6 | 183 | def __next(self): |
paul@6 | 184 | if self.index >= len(self.string): |
paul@6 | 185 | self.next = None |
paul@6 | 186 | return |
paul@6 | 187 | char = self.string[self.index] |
paul@6 | 188 | if char[0] == "\\": |
paul@6 | 189 | try: |
paul@6 | 190 | c = self.string[self.index + 1] |
paul@6 | 191 | except IndexError: |
paul@6 | 192 | raise error, "bogus escape (end of line)" |
paul@6 | 193 | char = char + c |
paul@6 | 194 | self.index = self.index + len(char) |
paul@6 | 195 | self.next = char |
paul@6 | 196 | def match(self, char, skip=1): |
paul@6 | 197 | if char == self.next: |
paul@6 | 198 | if skip: |
paul@6 | 199 | self.__next() |
paul@6 | 200 | return 1 |
paul@6 | 201 | return 0 |
paul@6 | 202 | def get(self): |
paul@6 | 203 | this = self.next |
paul@6 | 204 | self.__next() |
paul@6 | 205 | return this |
paul@6 | 206 | def tell(self): |
paul@6 | 207 | return self.index, self.next |
paul@6 | 208 | def seek(self, index): |
paul@6 | 209 | self.index, self.next = index |
paul@6 | 210 | |
paul@6 | 211 | def isident(char): |
paul@6 | 212 | return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" |
paul@6 | 213 | |
paul@6 | 214 | def isdigit(char): |
paul@6 | 215 | return "0" <= char <= "9" |
paul@6 | 216 | |
paul@6 | 217 | def isname(name): |
paul@6 | 218 | # check that group name is a valid string |
paul@6 | 219 | if not isident(name[0]): |
paul@6 | 220 | return False |
paul@6 | 221 | for char in name[1:]: |
paul@6 | 222 | if not isident(char) and not isdigit(char): |
paul@6 | 223 | return False |
paul@6 | 224 | return True |
paul@6 | 225 | |
paul@6 | 226 | def _class_escape(source, escape): |
paul@6 | 227 | # handle escape code inside character class |
paul@6 | 228 | code = ESCAPES.get(escape) |
paul@6 | 229 | if code: |
paul@6 | 230 | return code |
paul@6 | 231 | code = CATEGORIES.get(escape) |
paul@6 | 232 | if code: |
paul@6 | 233 | return code |
paul@6 | 234 | try: |
paul@6 | 235 | c = escape[1:2] |
paul@6 | 236 | if c == "x": |
paul@6 | 237 | # hexadecimal escape (exactly two digits) |
paul@6 | 238 | while source.next in HEXDIGITS and len(escape) < 4: |
paul@6 | 239 | escape = escape + source.get() |
paul@6 | 240 | escape = escape[2:] |
paul@6 | 241 | if len(escape) != 2: |
paul@6 | 242 | raise error, "bogus escape: %s" % repr("\\" + escape) |
paul@6 | 243 | return LITERAL, int(escape, 16) & 0xff |
paul@6 | 244 | elif c in OCTDIGITS: |
paul@6 | 245 | # octal escape (up to three digits) |
paul@6 | 246 | while source.next in OCTDIGITS and len(escape) < 4: |
paul@6 | 247 | escape = escape + source.get() |
paul@6 | 248 | escape = escape[1:] |
paul@6 | 249 | return LITERAL, int(escape, 8) & 0xff |
paul@6 | 250 | elif c in DIGITS: |
paul@6 | 251 | raise error, "bogus escape: %s" % repr(escape) |
paul@6 | 252 | if len(escape) == 2: |
paul@6 | 253 | return LITERAL, ord(escape[1]) |
paul@6 | 254 | except ValueError: |
paul@6 | 255 | pass |
paul@6 | 256 | raise error, "bogus escape: %s" % repr(escape) |
paul@6 | 257 | |
paul@6 | 258 | def _escape(source, escape, state): |
paul@6 | 259 | # handle escape code in expression |
paul@6 | 260 | code = CATEGORIES.get(escape) |
paul@6 | 261 | if code: |
paul@6 | 262 | return code |
paul@6 | 263 | code = ESCAPES.get(escape) |
paul@6 | 264 | if code: |
paul@6 | 265 | return code |
paul@6 | 266 | try: |
paul@6 | 267 | c = escape[1:2] |
paul@6 | 268 | if c == "x": |
paul@6 | 269 | # hexadecimal escape |
paul@6 | 270 | while source.next in HEXDIGITS and len(escape) < 4: |
paul@6 | 271 | escape = escape + source.get() |
paul@6 | 272 | if len(escape) != 4: |
paul@6 | 273 | raise ValueError |
paul@6 | 274 | return LITERAL, int(escape[2:], 16) & 0xff |
paul@6 | 275 | elif c == "0": |
paul@6 | 276 | # octal escape |
paul@6 | 277 | while source.next in OCTDIGITS and len(escape) < 4: |
paul@6 | 278 | escape = escape + source.get() |
paul@6 | 279 | return LITERAL, int(escape[1:], 8) & 0xff |
paul@6 | 280 | elif c in DIGITS: |
paul@6 | 281 | # octal escape *or* decimal group reference (sigh) |
paul@6 | 282 | if source.next in DIGITS: |
paul@6 | 283 | escape = escape + source.get() |
paul@6 | 284 | if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and |
paul@6 | 285 | source.next in OCTDIGITS): |
paul@6 | 286 | # got three octal digits; this is an octal escape |
paul@6 | 287 | escape = escape + source.get() |
paul@6 | 288 | return LITERAL, int(escape[1:], 8) & 0xff |
paul@6 | 289 | # not an octal escape, so this is a group reference |
paul@6 | 290 | group = int(escape[1:]) |
paul@6 | 291 | if group < state.groups: |
paul@6 | 292 | if not state.checkgroup(group): |
paul@6 | 293 | raise error, "cannot refer to open group" |
paul@6 | 294 | return GROUPREF, group |
paul@6 | 295 | raise ValueError |
paul@6 | 296 | if len(escape) == 2: |
paul@6 | 297 | return LITERAL, ord(escape[1]) |
paul@6 | 298 | except ValueError: |
paul@6 | 299 | pass |
paul@6 | 300 | raise error, "bogus escape: %s" % repr(escape) |
paul@6 | 301 | |
paul@6 | 302 | def _parse_sub(source, state, nested=1): |
paul@6 | 303 | # parse an alternation: a|b|c |
paul@6 | 304 | |
paul@6 | 305 | items = [] |
paul@6 | 306 | itemsappend = items.append |
paul@6 | 307 | sourcematch = source.match |
paul@6 | 308 | while 1: |
paul@6 | 309 | itemsappend(_parse(source, state)) |
paul@6 | 310 | if sourcematch("|"): |
paul@6 | 311 | continue |
paul@6 | 312 | if not nested: |
paul@6 | 313 | break |
paul@6 | 314 | if not source.next or sourcematch(")", 0): |
paul@6 | 315 | break |
paul@6 | 316 | else: |
paul@6 | 317 | raise error, "pattern not properly closed" |
paul@6 | 318 | |
paul@6 | 319 | if len(items) == 1: |
paul@6 | 320 | return items[0] |
paul@6 | 321 | |
paul@6 | 322 | subpattern = SubPattern(state) |
paul@6 | 323 | subpatternappend = subpattern.append |
paul@6 | 324 | |
paul@6 | 325 | # check if all items share a common prefix |
paul@6 | 326 | while 1: |
paul@6 | 327 | prefix = None |
paul@6 | 328 | for item in items: |
paul@6 | 329 | if not item: |
paul@6 | 330 | break |
paul@6 | 331 | if prefix is None: |
paul@6 | 332 | prefix = item[0] |
paul@6 | 333 | elif item[0] != prefix: |
paul@6 | 334 | break |
paul@6 | 335 | else: |
paul@6 | 336 | # all subitems start with a common "prefix". |
paul@6 | 337 | # move it out of the branch |
paul@6 | 338 | for item in items: |
paul@6 | 339 | del item[0] |
paul@6 | 340 | subpatternappend(prefix) |
paul@6 | 341 | continue # check next one |
paul@6 | 342 | break |
paul@6 | 343 | |
paul@6 | 344 | # check if the branch can be replaced by a character set |
paul@6 | 345 | for item in items: |
paul@6 | 346 | if len(item) != 1 or item[0][0] != LITERAL: |
paul@6 | 347 | break |
paul@6 | 348 | else: |
paul@6 | 349 | # we can store this as a character set instead of a |
paul@6 | 350 | # branch (the compiler may optimize this even more) |
paul@6 | 351 | set = [] |
paul@6 | 352 | setappend = set.append |
paul@6 | 353 | for item in items: |
paul@6 | 354 | setappend(item[0]) |
paul@6 | 355 | subpatternappend((IN, set)) |
paul@6 | 356 | return subpattern |
paul@6 | 357 | |
paul@6 | 358 | subpattern.append((BRANCH, (None, items))) |
paul@6 | 359 | return subpattern |
paul@6 | 360 | |
paul@6 | 361 | def _parse_sub_cond(source, state, condgroup): |
paul@6 | 362 | item_yes = _parse(source, state) |
paul@6 | 363 | if source.match("|"): |
paul@6 | 364 | item_no = _parse(source, state) |
paul@6 | 365 | if source.match("|"): |
paul@6 | 366 | raise error, "conditional backref with more than two branches" |
paul@6 | 367 | else: |
paul@6 | 368 | item_no = None |
paul@6 | 369 | if source.next and not source.match(")", 0): |
paul@6 | 370 | raise error, "pattern not properly closed" |
paul@6 | 371 | subpattern = SubPattern(state) |
paul@6 | 372 | subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) |
paul@6 | 373 | return subpattern |
paul@6 | 374 | |
paul@6 | 375 | _PATTERNENDERS = set("|)") |
paul@6 | 376 | _ASSERTCHARS = set("=!<") |
paul@6 | 377 | _LOOKBEHINDASSERTCHARS = set("=!") |
paul@6 | 378 | _REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) |
paul@6 | 379 | |
paul@6 | 380 | def _parse(source, state): |
paul@6 | 381 | # parse a simple pattern |
paul@6 | 382 | subpattern = SubPattern(state) |
paul@6 | 383 | |
paul@6 | 384 | # precompute constants into local variables |
paul@6 | 385 | subpatternappend = subpattern.append |
paul@6 | 386 | sourceget = source.get |
paul@6 | 387 | sourcematch = source.match |
paul@6 | 388 | _len = len |
paul@6 | 389 | PATTERNENDERS = _PATTERNENDERS |
paul@6 | 390 | ASSERTCHARS = _ASSERTCHARS |
paul@6 | 391 | LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS |
paul@6 | 392 | REPEATCODES = _REPEATCODES |
paul@6 | 393 | |
paul@6 | 394 | while 1: |
paul@6 | 395 | |
paul@6 | 396 | if source.next in PATTERNENDERS: |
paul@6 | 397 | break # end of subpattern |
paul@6 | 398 | this = sourceget() |
paul@6 | 399 | if this is None: |
paul@6 | 400 | break # end of pattern |
paul@6 | 401 | |
paul@6 | 402 | if state.flags & SRE_FLAG_VERBOSE: |
paul@6 | 403 | # skip whitespace and comments |
paul@6 | 404 | if this in WHITESPACE: |
paul@6 | 405 | continue |
paul@6 | 406 | if this == "#": |
paul@6 | 407 | while 1: |
paul@6 | 408 | this = sourceget() |
paul@6 | 409 | if this in (None, "\n"): |
paul@6 | 410 | break |
paul@6 | 411 | continue |
paul@6 | 412 | |
paul@6 | 413 | if this and this[0] not in SPECIAL_CHARS: |
paul@6 | 414 | subpatternappend((LITERAL, ord(this))) |
paul@6 | 415 | |
paul@6 | 416 | elif this == "[": |
paul@6 | 417 | # character set |
paul@6 | 418 | set = [] |
paul@6 | 419 | setappend = set.append |
paul@6 | 420 | ## if sourcematch(":"): |
paul@6 | 421 | ## pass # handle character classes |
paul@6 | 422 | if sourcematch("^"): |
paul@6 | 423 | setappend((NEGATE, None)) |
paul@6 | 424 | # check remaining characters |
paul@6 | 425 | start = set[:] |
paul@6 | 426 | while 1: |
paul@6 | 427 | this = sourceget() |
paul@6 | 428 | if this == "]" and set != start: |
paul@6 | 429 | break |
paul@6 | 430 | elif this and this[0] == "\\": |
paul@6 | 431 | code1 = _class_escape(source, this) |
paul@6 | 432 | elif this: |
paul@6 | 433 | code1 = LITERAL, ord(this) |
paul@6 | 434 | else: |
paul@6 | 435 | raise error, "unexpected end of regular expression" |
paul@6 | 436 | if sourcematch("-"): |
paul@6 | 437 | # potential range |
paul@6 | 438 | this = sourceget() |
paul@6 | 439 | if this == "]": |
paul@6 | 440 | if code1[0] is IN: |
paul@6 | 441 | code1 = code1[1][0] |
paul@6 | 442 | setappend(code1) |
paul@6 | 443 | setappend((LITERAL, ord("-"))) |
paul@6 | 444 | break |
paul@6 | 445 | elif this: |
paul@6 | 446 | if this[0] == "\\": |
paul@6 | 447 | code2 = _class_escape(source, this) |
paul@6 | 448 | else: |
paul@6 | 449 | code2 = LITERAL, ord(this) |
paul@6 | 450 | if code1[0] != LITERAL or code2[0] != LITERAL: |
paul@6 | 451 | raise error, "bad character range" |
paul@6 | 452 | lo = code1[1] |
paul@6 | 453 | hi = code2[1] |
paul@6 | 454 | if hi < lo: |
paul@6 | 455 | raise error, "bad character range" |
paul@6 | 456 | setappend((RANGE, (lo, hi))) |
paul@6 | 457 | else: |
paul@6 | 458 | raise error, "unexpected end of regular expression" |
paul@6 | 459 | else: |
paul@6 | 460 | if code1[0] is IN: |
paul@6 | 461 | code1 = code1[1][0] |
paul@6 | 462 | setappend(code1) |
paul@6 | 463 | |
paul@6 | 464 | # XXX: <fl> should move set optimization to compiler! |
paul@6 | 465 | if _len(set)==1 and set[0][0] is LITERAL: |
paul@6 | 466 | subpatternappend(set[0]) # optimization |
paul@6 | 467 | elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: |
paul@6 | 468 | subpatternappend((NOT_LITERAL, set[1][1])) # optimization |
paul@6 | 469 | else: |
paul@6 | 470 | # XXX: <fl> should add charmap optimization here |
paul@6 | 471 | subpatternappend((IN, set)) |
paul@6 | 472 | |
paul@6 | 473 | elif this and this[0] in REPEAT_CHARS: |
paul@6 | 474 | # repeat previous item |
paul@6 | 475 | if this == "?": |
paul@6 | 476 | min, max = 0, 1 |
paul@6 | 477 | elif this == "*": |
paul@6 | 478 | min, max = 0, MAXREPEAT |
paul@6 | 479 | |
paul@6 | 480 | elif this == "+": |
paul@6 | 481 | min, max = 1, MAXREPEAT |
paul@6 | 482 | elif this == "{": |
paul@6 | 483 | if source.next == "}": |
paul@6 | 484 | subpatternappend((LITERAL, ord(this))) |
paul@6 | 485 | continue |
paul@6 | 486 | here = source.tell() |
paul@6 | 487 | min, max = 0, MAXREPEAT |
paul@6 | 488 | lo = hi = "" |
paul@6 | 489 | while source.next in DIGITS: |
paul@6 | 490 | lo = lo + source.get() |
paul@6 | 491 | if sourcematch(","): |
paul@6 | 492 | while source.next in DIGITS: |
paul@6 | 493 | hi = hi + sourceget() |
paul@6 | 494 | else: |
paul@6 | 495 | hi = lo |
paul@6 | 496 | if not sourcematch("}"): |
paul@6 | 497 | subpatternappend((LITERAL, ord(this))) |
paul@6 | 498 | source.seek(here) |
paul@6 | 499 | continue |
paul@6 | 500 | if lo: |
paul@6 | 501 | min = int(lo) |
paul@6 | 502 | if hi: |
paul@6 | 503 | max = int(hi) |
paul@6 | 504 | if max < min: |
paul@6 | 505 | raise error, "bad repeat interval" |
paul@6 | 506 | else: |
paul@6 | 507 | raise error, "not supported" |
paul@6 | 508 | # figure out which item to repeat |
paul@6 | 509 | if subpattern: |
paul@6 | 510 | item = subpattern[-1:] |
paul@6 | 511 | else: |
paul@6 | 512 | item = None |
paul@6 | 513 | if not item or (_len(item) == 1 and item[0][0] == AT): |
paul@6 | 514 | raise error, "nothing to repeat" |
paul@6 | 515 | if item[0][0] in REPEATCODES: |
paul@6 | 516 | raise error, "multiple repeat" |
paul@6 | 517 | if sourcematch("?"): |
paul@6 | 518 | subpattern[-1] = (MIN_REPEAT, (min, max, item)) |
paul@6 | 519 | else: |
paul@6 | 520 | subpattern[-1] = (MAX_REPEAT, (min, max, item)) |
paul@6 | 521 | |
paul@6 | 522 | elif this == ".": |
paul@6 | 523 | subpatternappend((ANY, None)) |
paul@6 | 524 | |
paul@6 | 525 | elif this == "(": |
paul@6 | 526 | group = 1 |
paul@6 | 527 | name = None |
paul@6 | 528 | condgroup = None |
paul@6 | 529 | if sourcematch("?"): |
paul@6 | 530 | group = 0 |
paul@6 | 531 | # options |
paul@6 | 532 | if sourcematch("P"): |
paul@6 | 533 | # python extensions |
paul@6 | 534 | if sourcematch("<"): |
paul@6 | 535 | # named group: skip forward to end of name |
paul@6 | 536 | name = "" |
paul@6 | 537 | while 1: |
paul@6 | 538 | char = sourceget() |
paul@6 | 539 | if char is None: |
paul@6 | 540 | raise error, "unterminated name" |
paul@6 | 541 | if char == ">": |
paul@6 | 542 | break |
paul@6 | 543 | name = name + char |
paul@6 | 544 | group = 1 |
paul@6 | 545 | if not isname(name): |
paul@6 | 546 | raise error, "bad character in group name" |
paul@6 | 547 | elif sourcematch("="): |
paul@6 | 548 | # named backreference |
paul@6 | 549 | name = "" |
paul@6 | 550 | while 1: |
paul@6 | 551 | char = sourceget() |
paul@6 | 552 | if char is None: |
paul@6 | 553 | raise error, "unterminated name" |
paul@6 | 554 | if char == ")": |
paul@6 | 555 | break |
paul@6 | 556 | name = name + char |
paul@6 | 557 | if not isname(name): |
paul@6 | 558 | raise error, "bad character in group name" |
paul@6 | 559 | gid = state.groupdict.get(name) |
paul@6 | 560 | if gid is None: |
paul@6 | 561 | raise error, "unknown group name" |
paul@6 | 562 | subpatternappend((GROUPREF, gid)) |
paul@6 | 563 | continue |
paul@6 | 564 | else: |
paul@6 | 565 | char = sourceget() |
paul@6 | 566 | if char is None: |
paul@6 | 567 | raise error, "unexpected end of pattern" |
paul@6 | 568 | raise error, "unknown specifier: ?P%s" % char |
paul@6 | 569 | elif sourcematch(":"): |
paul@6 | 570 | # non-capturing group |
paul@6 | 571 | group = 2 |
paul@6 | 572 | elif sourcematch("#"): |
paul@6 | 573 | # comment |
paul@6 | 574 | while 1: |
paul@6 | 575 | if source.next is None or source.next == ")": |
paul@6 | 576 | break |
paul@6 | 577 | sourceget() |
paul@6 | 578 | if not sourcematch(")"): |
paul@6 | 579 | raise error, "unbalanced parenthesis" |
paul@6 | 580 | continue |
paul@6 | 581 | elif source.next in ASSERTCHARS: |
paul@6 | 582 | # lookahead assertions |
paul@6 | 583 | char = sourceget() |
paul@6 | 584 | dir = 1 |
paul@6 | 585 | if char == "<": |
paul@6 | 586 | if source.next not in LOOKBEHINDASSERTCHARS: |
paul@6 | 587 | raise error, "syntax error" |
paul@6 | 588 | dir = -1 # lookbehind |
paul@6 | 589 | char = sourceget() |
paul@6 | 590 | p = _parse_sub(source, state) |
paul@6 | 591 | if not sourcematch(")"): |
paul@6 | 592 | raise error, "unbalanced parenthesis" |
paul@6 | 593 | if char == "=": |
paul@6 | 594 | subpatternappend((ASSERT, (dir, p))) |
paul@6 | 595 | else: |
paul@6 | 596 | subpatternappend((ASSERT_NOT, (dir, p))) |
paul@6 | 597 | continue |
paul@6 | 598 | elif sourcematch("("): |
paul@6 | 599 | # conditional backreference group |
paul@6 | 600 | condname = "" |
paul@6 | 601 | while 1: |
paul@6 | 602 | char = sourceget() |
paul@6 | 603 | if char is None: |
paul@6 | 604 | raise error, "unterminated name" |
paul@6 | 605 | if char == ")": |
paul@6 | 606 | break |
paul@6 | 607 | condname = condname + char |
paul@6 | 608 | group = 2 |
paul@6 | 609 | if isname(condname): |
paul@6 | 610 | condgroup = state.groupdict.get(condname) |
paul@6 | 611 | if condgroup is None: |
paul@6 | 612 | raise error, "unknown group name" |
paul@6 | 613 | else: |
paul@6 | 614 | try: |
paul@6 | 615 | condgroup = int(condname) |
paul@6 | 616 | except ValueError: |
paul@6 | 617 | raise error, "bad character in group name" |
paul@6 | 618 | else: |
paul@6 | 619 | # flags |
paul@6 | 620 | if not source.next in FLAGS: |
paul@6 | 621 | raise error, "unexpected end of pattern" |
paul@6 | 622 | while source.next in FLAGS: |
paul@6 | 623 | state.flags = state.flags | FLAGS[sourceget()] |
paul@6 | 624 | if group: |
paul@6 | 625 | # parse group contents |
paul@6 | 626 | if group == 2: |
paul@6 | 627 | # anonymous group |
paul@6 | 628 | group = None |
paul@6 | 629 | else: |
paul@6 | 630 | group = state.opengroup(name) |
paul@6 | 631 | if condgroup: |
paul@6 | 632 | p = _parse_sub_cond(source, state, condgroup) |
paul@6 | 633 | else: |
paul@6 | 634 | p = _parse_sub(source, state) |
paul@6 | 635 | if not sourcematch(")"): |
paul@6 | 636 | raise error, "unbalanced parenthesis" |
paul@6 | 637 | if group is not None: |
paul@6 | 638 | state.closegroup(group) |
paul@6 | 639 | subpatternappend((SUBPATTERN, (group, p))) |
paul@6 | 640 | else: |
paul@6 | 641 | while 1: |
paul@6 | 642 | char = sourceget() |
paul@6 | 643 | if char is None: |
paul@6 | 644 | raise error, "unexpected end of pattern" |
paul@6 | 645 | if char == ")": |
paul@6 | 646 | break |
paul@6 | 647 | raise error, "unknown extension" |
paul@6 | 648 | |
paul@6 | 649 | elif this == "^": |
paul@6 | 650 | subpatternappend((AT, AT_BEGINNING)) |
paul@6 | 651 | |
paul@6 | 652 | elif this == "$": |
paul@6 | 653 | subpattern.append((AT, AT_END)) |
paul@6 | 654 | |
paul@6 | 655 | elif this and this[0] == "\\": |
paul@6 | 656 | code = _escape(source, this, state) |
paul@6 | 657 | subpatternappend(code) |
paul@6 | 658 | |
paul@6 | 659 | else: |
paul@6 | 660 | raise error, "parser error" |
paul@6 | 661 | |
paul@6 | 662 | return subpattern |
paul@6 | 663 | |
paul@6 | 664 | def parse(str, flags=0, pattern=None): |
paul@6 | 665 | # parse 're' pattern into list of (opcode, argument) tuples |
paul@6 | 666 | |
paul@6 | 667 | source = Tokenizer(str) |
paul@6 | 668 | |
paul@6 | 669 | if pattern is None: |
paul@6 | 670 | pattern = Pattern() |
paul@6 | 671 | pattern.flags = flags |
paul@6 | 672 | pattern.str = str |
paul@6 | 673 | |
paul@6 | 674 | p = _parse_sub(source, pattern, 0) |
paul@6 | 675 | |
paul@6 | 676 | tail = source.get() |
paul@6 | 677 | if tail == ")": |
paul@6 | 678 | raise error, "unbalanced parenthesis" |
paul@6 | 679 | elif tail: |
paul@6 | 680 | raise error, "bogus characters at end of regular expression" |
paul@6 | 681 | |
paul@6 | 682 | if flags & SRE_FLAG_DEBUG: |
paul@6 | 683 | p.dump() |
paul@6 | 684 | |
paul@6 | 685 | if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: |
paul@6 | 686 | # the VERBOSE flag was switched on inside the pattern. to be |
paul@6 | 687 | # on the safe side, we'll parse the whole thing again... |
paul@6 | 688 | return parse(str, p.pattern.flags) |
paul@6 | 689 | |
paul@6 | 690 | return p |
paul@6 | 691 | |
paul@6 | 692 | def parse_template(source, pattern): |
paul@6 | 693 | # parse 're' replacement string into list of literals and |
paul@6 | 694 | # group references |
paul@6 | 695 | s = Tokenizer(source) |
paul@6 | 696 | sget = s.get |
paul@6 | 697 | p = [] |
paul@6 | 698 | a = p.append |
paul@6 | 699 | def literal(literal, p=p, pappend=a): |
paul@6 | 700 | if p and p[-1][0] is LITERAL: |
paul@6 | 701 | p[-1] = LITERAL, p[-1][1] + literal |
paul@6 | 702 | else: |
paul@6 | 703 | pappend((LITERAL, literal)) |
paul@6 | 704 | sep = source[:0] |
paul@6 | 705 | if type(sep) is type(""): |
paul@6 | 706 | makechar = chr |
paul@6 | 707 | else: |
paul@6 | 708 | makechar = unichr |
paul@6 | 709 | while 1: |
paul@6 | 710 | this = sget() |
paul@6 | 711 | if this is None: |
paul@6 | 712 | break # end of replacement string |
paul@6 | 713 | if this and this[0] == "\\": |
paul@6 | 714 | # group |
paul@6 | 715 | c = this[1:2] |
paul@6 | 716 | if c == "g": |
paul@6 | 717 | name = "" |
paul@6 | 718 | if s.match("<"): |
paul@6 | 719 | while 1: |
paul@6 | 720 | char = sget() |
paul@6 | 721 | if char is None: |
paul@6 | 722 | raise error, "unterminated group name" |
paul@6 | 723 | if char == ">": |
paul@6 | 724 | break |
paul@6 | 725 | name = name + char |
paul@6 | 726 | if not name: |
paul@6 | 727 | raise error, "bad group name" |
paul@6 | 728 | try: |
paul@6 | 729 | index = int(name) |
paul@6 | 730 | if index < 0: |
paul@6 | 731 | raise error, "negative group number" |
paul@6 | 732 | except ValueError: |
paul@6 | 733 | if not isname(name): |
paul@6 | 734 | raise error, "bad character in group name" |
paul@6 | 735 | try: |
paul@6 | 736 | index = pattern.groupindex[name] |
paul@6 | 737 | except KeyError: |
paul@6 | 738 | raise IndexError, "unknown group name" |
paul@6 | 739 | a((MARK, index)) |
paul@6 | 740 | elif c == "0": |
paul@6 | 741 | if s.next in OCTDIGITS: |
paul@6 | 742 | this = this + sget() |
paul@6 | 743 | if s.next in OCTDIGITS: |
paul@6 | 744 | this = this + sget() |
paul@6 | 745 | literal(makechar(int(this[1:], 8) & 0xff)) |
paul@6 | 746 | elif c in DIGITS: |
paul@6 | 747 | isoctal = False |
paul@6 | 748 | if s.next in DIGITS: |
paul@6 | 749 | this = this + sget() |
paul@6 | 750 | if (c in OCTDIGITS and this[2] in OCTDIGITS and |
paul@6 | 751 | s.next in OCTDIGITS): |
paul@6 | 752 | this = this + sget() |
paul@6 | 753 | isoctal = True |
paul@6 | 754 | literal(makechar(int(this[1:], 8) & 0xff)) |
paul@6 | 755 | if not isoctal: |
paul@6 | 756 | a((MARK, int(this[1:]))) |
paul@6 | 757 | else: |
paul@6 | 758 | try: |
paul@6 | 759 | this = makechar(ESCAPES[this][1]) |
paul@6 | 760 | except KeyError: |
paul@6 | 761 | pass |
paul@6 | 762 | literal(this) |
paul@6 | 763 | else: |
paul@6 | 764 | literal(this) |
paul@6 | 765 | # convert template to groups and literals lists |
paul@6 | 766 | i = 0 |
paul@6 | 767 | groups = [] |
paul@6 | 768 | groupsappend = groups.append |
paul@6 | 769 | literals = [None] * len(p) |
paul@6 | 770 | for c, s in p: |
paul@6 | 771 | if c is MARK: |
paul@6 | 772 | groupsappend((i, s)) |
paul@6 | 773 | # literal[i] is already None |
paul@6 | 774 | else: |
paul@6 | 775 | literals[i] = s |
paul@6 | 776 | i = i + 1 |
paul@6 | 777 | return groups, literals |
paul@6 | 778 | |
paul@6 | 779 | def expand_template(template, match): |
paul@6 | 780 | g = match.group |
paul@6 | 781 | sep = match.string[:0] |
paul@6 | 782 | groups, literals = template |
paul@6 | 783 | literals = literals[:] |
paul@6 | 784 | try: |
paul@6 | 785 | for index, group in groups: |
paul@6 | 786 | literals[index] = s = g(group) |
paul@6 | 787 | if s is None: |
paul@6 | 788 | raise error, "unmatched group" |
paul@6 | 789 | except IndexError: |
paul@6 | 790 | raise error, "invalid group reference" |
paul@6 | 791 | return sep.join(literals) |