paul@6 | 1 | # |
paul@6 | 2 | # Secret Labs' Regular Expression Engine |
paul@6 | 3 | # |
paul@6 | 4 | # re-compatible interface for the sre matching engine |
paul@6 | 5 | # |
paul@6 | 6 | # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. |
paul@6 | 7 | # |
paul@6 | 8 | # This version of the SRE library can be redistributed under CNRI's |
paul@6 | 9 | # Python 1.6 license. For any other use, please contact Secret Labs |
paul@6 | 10 | # AB (info@pythonware.com). |
paul@6 | 11 | # |
paul@6 | 12 | # Portions of this engine have been developed in cooperation with |
paul@6 | 13 | # CNRI. Hewlett-Packard provided funding for 1.6 integration and |
paul@6 | 14 | # other compatibility work. |
paul@6 | 15 | # |
paul@6 | 16 | |
paul@6 | 17 | r"""Support for regular expressions (RE). |
paul@6 | 18 | |
paul@6 | 19 | This module provides regular expression matching operations similar to |
paul@6 | 20 | those found in Perl. It supports both 8-bit and Unicode strings; both |
paul@6 | 21 | the pattern and the strings being processed can contain null bytes and |
paul@6 | 22 | characters outside the US ASCII range. |
paul@6 | 23 | |
paul@6 | 24 | Regular expressions can contain both special and ordinary characters. |
paul@6 | 25 | Most ordinary characters, like "A", "a", or "0", are the simplest |
paul@6 | 26 | regular expressions; they simply match themselves. You can |
paul@6 | 27 | concatenate ordinary characters, so last matches the string 'last'. |
paul@6 | 28 | |
paul@6 | 29 | The special characters are: |
paul@6 | 30 | "." Matches any character except a newline. |
paul@6 | 31 | "^" Matches the start of the string. |
paul@6 | 32 | "$" Matches the end of the string or just before the newline at |
paul@6 | 33 | the end of the string. |
paul@6 | 34 | "*" Matches 0 or more (greedy) repetitions of the preceding RE. |
paul@6 | 35 | Greedy means that it will match as many repetitions as possible. |
paul@6 | 36 | "+" Matches 1 or more (greedy) repetitions of the preceding RE. |
paul@6 | 37 | "?" Matches 0 or 1 (greedy) of the preceding RE. |
paul@6 | 38 | *?,+?,?? Non-greedy versions of the previous three special characters. |
paul@6 | 39 | {m,n} Matches from m to n repetitions of the preceding RE. |
paul@6 | 40 | {m,n}? Non-greedy version of the above. |
paul@6 | 41 | "\\" Either escapes special characters or signals a special sequence. |
paul@6 | 42 | [] Indicates a set of characters. |
paul@6 | 43 | A "^" as the first character indicates a complementing set. |
paul@6 | 44 | "|" A|B, creates an RE that will match either A or B. |
paul@6 | 45 | (...) Matches the RE inside the parentheses. |
paul@6 | 46 | The contents can be retrieved or matched later in the string. |
paul@6 | 47 | (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below). |
paul@6 | 48 | (?:...) Non-grouping version of regular parentheses. |
paul@6 | 49 | (?P<name>...) The substring matched by the group is accessible by name. |
paul@6 | 50 | (?P=name) Matches the text matched earlier by the group named name. |
paul@6 | 51 | (?#...) A comment; ignored. |
paul@6 | 52 | (?=...) Matches if ... matches next, but doesn't consume the string. |
paul@6 | 53 | (?!...) Matches if ... doesn't match next. |
paul@6 | 54 | (?<=...) Matches if preceded by ... (must be fixed length). |
paul@6 | 55 | (?<!...) Matches if not preceded by ... (must be fixed length). |
paul@6 | 56 | (?(id/name)yes|no) Matches yes pattern if the group with id/name matched, |
paul@6 | 57 | the (optional) no pattern otherwise. |
paul@6 | 58 | |
paul@6 | 59 | The special sequences consist of "\\" and a character from the list |
paul@6 | 60 | below. If the ordinary character is not on the list, then the |
paul@6 | 61 | resulting RE will match the second character. |
paul@6 | 62 | \number Matches the contents of the group of the same number. |
paul@6 | 63 | \A Matches only at the start of the string. |
paul@6 | 64 | \Z Matches only at the end of the string. |
paul@6 | 65 | \b Matches the empty string, but only at the start or end of a word. |
paul@6 | 66 | \B Matches the empty string, but not at the start or end of a word. |
paul@6 | 67 | \d Matches any decimal digit; equivalent to the set [0-9]. |
paul@6 | 68 | \D Matches any non-digit character; equivalent to the set [^0-9]. |
paul@6 | 69 | \s Matches any whitespace character; equivalent to [ \t\n\r\f\v]. |
paul@6 | 70 | \S Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v]. |
paul@6 | 71 | \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]. |
paul@6 | 72 | With LOCALE, it will match the set [0-9_] plus characters defined |
paul@6 | 73 | as letters for the current locale. |
paul@6 | 74 | \W Matches the complement of \w. |
paul@6 | 75 | \\ Matches a literal backslash. |
paul@6 | 76 | |
paul@6 | 77 | This module exports the following functions: |
paul@6 | 78 | match Match a regular expression pattern to the beginning of a string. |
paul@6 | 79 | search Search a string for the presence of a pattern. |
paul@6 | 80 | sub Substitute occurrences of a pattern found in a string. |
paul@6 | 81 | subn Same as sub, but also return the number of substitutions made. |
paul@6 | 82 | split Split a string by the occurrences of a pattern. |
paul@6 | 83 | findall Find all occurrences of a pattern in a string. |
paul@6 | 84 | finditer Return an iterator yielding a match object for each match. |
paul@6 | 85 | compile Compile a pattern into a RegexObject. |
paul@6 | 86 | purge Clear the regular expression cache. |
paul@6 | 87 | escape Backslash all non-alphanumerics in a string. |
paul@6 | 88 | |
paul@6 | 89 | Some of the functions in this module takes flags as optional parameters: |
paul@6 | 90 | I IGNORECASE Perform case-insensitive matching. |
paul@6 | 91 | L LOCALE Make \w, \W, \b, \B, dependent on the current locale. |
paul@6 | 92 | M MULTILINE "^" matches the beginning of lines (after a newline) |
paul@6 | 93 | as well as the string. |
paul@6 | 94 | "$" matches the end of lines (before a newline) as well |
paul@6 | 95 | as the end of the string. |
paul@6 | 96 | S DOTALL "." matches any character at all, including the newline. |
paul@6 | 97 | X VERBOSE Ignore whitespace and comments for nicer looking RE's. |
paul@6 | 98 | U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale. |
paul@6 | 99 | |
paul@6 | 100 | This module also defines an exception 'error'. |
paul@6 | 101 | |
paul@6 | 102 | """ |
paul@6 | 103 | |
paul@6 | 104 | from sre_constants import BRANCH, SUBPATTERN |
paul@6 | 105 | import sys |
paul@6 | 106 | import sre_compile |
paul@6 | 107 | import sre_parse |
paul@6 | 108 | |
paul@6 | 109 | # public symbols |
paul@6 | 110 | __all__ = [ "match", "search", "sub", "subn", "split", "findall", |
paul@6 | 111 | "compile", "purge", "template", "escape", "I", "L", "M", "S", "X", |
paul@6 | 112 | "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", |
paul@6 | 113 | "UNICODE", "error" ] |
paul@6 | 114 | |
paul@6 | 115 | __version__ = "2.2.1" |
paul@6 | 116 | |
paul@6 | 117 | # flags |
paul@6 | 118 | I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case |
paul@6 | 119 | L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale |
paul@6 | 120 | U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale |
paul@6 | 121 | M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline |
paul@6 | 122 | S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline |
paul@6 | 123 | X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments |
paul@6 | 124 | |
paul@6 | 125 | # sre extensions (experimental, don't rely on these) |
paul@6 | 126 | T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking |
paul@6 | 127 | DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation |
paul@6 | 128 | |
paul@6 | 129 | # sre exception |
paul@6 | 130 | error = sre_compile.error |
paul@6 | 131 | |
paul@6 | 132 | # -------------------------------------------------------------------- |
paul@6 | 133 | # public interface |
paul@6 | 134 | |
paul@6 | 135 | def match(pattern, string, flags=0): |
paul@6 | 136 | """Try to apply the pattern at the start of the string, returning |
paul@6 | 137 | a match object, or None if no match was found.""" |
paul@6 | 138 | return _compile(pattern, flags).match(string) |
paul@6 | 139 | |
paul@6 | 140 | def search(pattern, string, flags=0): |
paul@6 | 141 | """Scan through string looking for a match to the pattern, returning |
paul@6 | 142 | a match object, or None if no match was found.""" |
paul@6 | 143 | return _compile(pattern, flags).search(string) |
paul@6 | 144 | |
paul@6 | 145 | def sub(pattern, repl, string, count=0, flags=0): |
paul@6 | 146 | """Return the string obtained by replacing the leftmost |
paul@6 | 147 | non-overlapping occurrences of the pattern in string by the |
paul@6 | 148 | replacement repl. repl can be either a string or a callable; |
paul@6 | 149 | if a string, backslash escapes in it are processed. If it is |
paul@6 | 150 | a callable, it's passed the match object and must return |
paul@6 | 151 | a replacement string to be used.""" |
paul@6 | 152 | return _compile(pattern, flags).sub(repl, string, count) |
paul@6 | 153 | |
paul@6 | 154 | def subn(pattern, repl, string, count=0, flags=0): |
paul@6 | 155 | """Return a 2-tuple containing (new_string, number). |
paul@6 | 156 | new_string is the string obtained by replacing the leftmost |
paul@6 | 157 | non-overlapping occurrences of the pattern in the source |
paul@6 | 158 | string by the replacement repl. number is the number of |
paul@6 | 159 | substitutions that were made. repl can be either a string or a |
paul@6 | 160 | callable; if a string, backslash escapes in it are processed. |
paul@6 | 161 | If it is a callable, it's passed the match object and must |
paul@6 | 162 | return a replacement string to be used.""" |
paul@6 | 163 | return _compile(pattern, flags).subn(repl, string, count) |
paul@6 | 164 | |
paul@6 | 165 | def split(pattern, string, maxsplit=0, flags=0): |
paul@6 | 166 | """Split the source string by the occurrences of the pattern, |
paul@6 | 167 | returning a list containing the resulting substrings.""" |
paul@6 | 168 | return _compile(pattern, flags).split(string, maxsplit) |
paul@6 | 169 | |
paul@6 | 170 | def findall(pattern, string, flags=0): |
paul@6 | 171 | """Return a list of all non-overlapping matches in the string. |
paul@6 | 172 | |
paul@6 | 173 | If one or more groups are present in the pattern, return a |
paul@6 | 174 | list of groups; this will be a list of tuples if the pattern |
paul@6 | 175 | has more than one group. |
paul@6 | 176 | |
paul@6 | 177 | Empty matches are included in the result.""" |
paul@6 | 178 | return _compile(pattern, flags).findall(string) |
paul@6 | 179 | |
paul@6 | 180 | if sys.hexversion >= 0x02020000: |
paul@6 | 181 | __all__.append("finditer") |
paul@6 | 182 | def finditer(pattern, string, flags=0): |
paul@6 | 183 | """Return an iterator over all non-overlapping matches in the |
paul@6 | 184 | string. For each match, the iterator returns a match object. |
paul@6 | 185 | |
paul@6 | 186 | Empty matches are included in the result.""" |
paul@6 | 187 | return _compile(pattern, flags).finditer(string) |
paul@6 | 188 | |
paul@6 | 189 | def compile(pattern, flags=0): |
paul@6 | 190 | "Compile a regular expression pattern, returning a pattern object." |
paul@6 | 191 | return _compile(pattern, flags) |
paul@6 | 192 | |
paul@6 | 193 | def purge(): |
paul@6 | 194 | "Clear the regular expression cache" |
paul@6 | 195 | _cache.clear() |
paul@6 | 196 | _cache_repl.clear() |
paul@6 | 197 | |
paul@6 | 198 | def template(pattern, flags=0): |
paul@6 | 199 | "Compile a template pattern, returning a pattern object" |
paul@6 | 200 | return _compile(pattern, flags|T) |
paul@6 | 201 | |
paul@6 | 202 | _alphanum = frozenset( |
paul@6 | 203 | "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") |
paul@6 | 204 | |
paul@6 | 205 | def escape(pattern): |
paul@6 | 206 | "Escape all non-alphanumeric characters in pattern." |
paul@6 | 207 | s = list(pattern) |
paul@6 | 208 | alphanum = _alphanum |
paul@6 | 209 | for i, c in enumerate(pattern): |
paul@6 | 210 | if c not in alphanum: |
paul@6 | 211 | if c == "\000": |
paul@6 | 212 | s[i] = "\\000" |
paul@6 | 213 | else: |
paul@6 | 214 | s[i] = "\\" + c |
paul@6 | 215 | return pattern[:0].join(s) |
paul@6 | 216 | |
paul@6 | 217 | # -------------------------------------------------------------------- |
paul@6 | 218 | # internals |
paul@6 | 219 | |
paul@6 | 220 | _cache = {} |
paul@6 | 221 | _cache_repl = {} |
paul@6 | 222 | |
paul@6 | 223 | _pattern_type = type(sre_compile.compile("", 0)) |
paul@6 | 224 | |
paul@6 | 225 | _MAXCACHE = 100 |
paul@6 | 226 | |
paul@6 | 227 | def _compile(*key): |
paul@6 | 228 | # internal: compile pattern |
paul@6 | 229 | cachekey = (type(key[0]),) + key |
paul@6 | 230 | p = _cache.get(cachekey) |
paul@6 | 231 | if p is not None: |
paul@6 | 232 | return p |
paul@6 | 233 | pattern, flags = key |
paul@6 | 234 | if isinstance(pattern, _pattern_type): |
paul@6 | 235 | if flags: |
paul@6 | 236 | raise ValueError('Cannot process flags argument with a compiled pattern') |
paul@6 | 237 | return pattern |
paul@6 | 238 | if not sre_compile.isstring(pattern): |
paul@6 | 239 | raise TypeError, "first argument must be string or compiled pattern" |
paul@6 | 240 | try: |
paul@6 | 241 | p = sre_compile.compile(pattern, flags) |
paul@6 | 242 | except error, v: |
paul@6 | 243 | raise error, v # invalid expression |
paul@6 | 244 | if len(_cache) >= _MAXCACHE: |
paul@6 | 245 | _cache.clear() |
paul@6 | 246 | _cache[cachekey] = p |
paul@6 | 247 | return p |
paul@6 | 248 | |
paul@6 | 249 | def _compile_repl(*key): |
paul@6 | 250 | # internal: compile replacement pattern |
paul@6 | 251 | p = _cache_repl.get(key) |
paul@6 | 252 | if p is not None: |
paul@6 | 253 | return p |
paul@6 | 254 | repl, pattern = key |
paul@6 | 255 | try: |
paul@6 | 256 | p = sre_parse.parse_template(repl, pattern) |
paul@6 | 257 | except error, v: |
paul@6 | 258 | raise error, v # invalid expression |
paul@6 | 259 | if len(_cache_repl) >= _MAXCACHE: |
paul@6 | 260 | _cache_repl.clear() |
paul@6 | 261 | _cache_repl[key] = p |
paul@6 | 262 | return p |
paul@6 | 263 | |
paul@6 | 264 | def _expand(pattern, match, template): |
paul@6 | 265 | # internal: match.expand implementation hook |
paul@6 | 266 | template = sre_parse.parse_template(template, pattern) |
paul@6 | 267 | return sre_parse.expand_template(template, match) |
paul@6 | 268 | |
paul@6 | 269 | def _subx(pattern, template): |
paul@6 | 270 | # internal: pattern.sub/subn implementation helper |
paul@6 | 271 | template = _compile_repl(template, pattern) |
paul@6 | 272 | if not template[0] and len(template[1]) == 1: |
paul@6 | 273 | # literal replacement |
paul@6 | 274 | return template[1][0] |
paul@6 | 275 | def filter(match, template=template): |
paul@6 | 276 | return sre_parse.expand_template(template, match) |
paul@6 | 277 | return filter |
paul@6 | 278 | |
paul@6 | 279 | # -------------------------------------------------------------------- |
paul@6 | 280 | # experimental stuff (see python-dev discussions for details) |
paul@6 | 281 | |
paul@6 | 282 | class Scanner: |
paul@6 | 283 | def __init__(self, lexicon, flags=0): |
paul@6 | 284 | self.lexicon = lexicon |
paul@6 | 285 | # combine phrases into a compound pattern |
paul@6 | 286 | p = [] |
paul@6 | 287 | s = sre_parse.Pattern() |
paul@6 | 288 | s.flags = flags |
paul@6 | 289 | for phrase, action in lexicon: |
paul@6 | 290 | p.append(sre_parse.SubPattern(s, [ |
paul@6 | 291 | (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))), |
paul@6 | 292 | ])) |
paul@6 | 293 | s.groups = len(p)+1 |
paul@6 | 294 | p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) |
paul@6 | 295 | self.scanner = sre_compile.compile(p) |
paul@6 | 296 | def scan(self, string): |
paul@6 | 297 | result = [] |
paul@6 | 298 | append = result.append |
paul@6 | 299 | match = self.scanner.scanner(string).match |
paul@6 | 300 | i = 0 |
paul@6 | 301 | while 1: |
paul@6 | 302 | m = match() |
paul@6 | 303 | if not m: |
paul@6 | 304 | break |
paul@6 | 305 | j = m.end() |
paul@6 | 306 | if i == j: |
paul@6 | 307 | break |
paul@6 | 308 | action = self.lexicon[m.lastindex-1][1] |
paul@6 | 309 | if hasattr(action, '__call__'): |
paul@6 | 310 | self.match = m |
paul@6 | 311 | action = action(self, m.group()) |
paul@6 | 312 | if action is not None: |
paul@6 | 313 | append(action) |
paul@6 | 314 | i = j |
paul@6 | 315 | return result, string[i:] |