# HG changeset patch # User Paul Boddie # Date 1531692263 -7200 # Node ID 225f92510d629d692de720612d0dac7940855b9d # Parent 4a05d10e795eeca5c3efcf081c012c1472ed0560 Combine patterns in order to search using a single regular expression. This requires all group names to be prefixed with pattern names, with a special null group being used within each constituent pattern to identify it. The match groups are filtered so that only the matching pattern's groups are retained. diff -r 4a05d10e795e -r 225f92510d62 moinformat/parsers/common.py --- a/moinformat/parsers/common.py Sun Jul 15 23:59:08 2018 +0200 +++ b/moinformat/parsers/common.py Mon Jul 16 00:04:23 2018 +0200 @@ -42,7 +42,10 @@ def group(name, s): - "Return a pattern group having 'name' and the pattern string 's'." + """ + Return a pattern for the group having the given 'name' and employing the + pattern string 's'. + """ return "(?P<%s>%s)" % (name, s) @@ -69,25 +72,44 @@ """ Define patterns for the regular expressions in the 'syntax' mapping. In each - pattern, replace \N with a pattern for matching whitespace excluding - newlines. + pattern, replace... + + \N with a pattern for matching whitespace excluding newlines + \Q with a pattern for matching quotation marks + + Group names are also qualified with a pattern name prefix. """ patterns = {} + for name, value in syntax.items(): value = value.replace(r"\N", ws_excl_nl) value = value.replace(r"\Q", quotes) - patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE) + + # Add the name to group names as a prefix. + + value = value.replace("(?P<", "(?P<%s_" % name) + value = value.replace("(?P=", "(?P=%s_" % name) + + # Record the updated expression and add an identifying null group. + + patterns[name] = "%s(?P<group_%s>)" % (value, name) + return patterns -def get_subset(d, keys): +def get_expression(d, keys): - "Return a subset of 'd' having the given 'keys'." + """ + Return a compiled expression combining patterns in 'd' having the given + 'keys'. 
+ """ - subset = {} + subset = [] + for key in keys: - subset[key] = d[key] - return subset + subset.append(d[key]) + + return re.compile("|".join(subset), re.UNICODE | re.MULTILINE) @@ -105,7 +127,7 @@ self.match = None self.queued = None - self.match_start = None + self.groups = {} # Pattern name details. @@ -123,56 +145,73 @@ self.queued = self.match - def read_until(self, patterns, remaining=True): + def read_until(self, expression, remaining=True): """ - Find the first match for the given 'patterns'. Return the text preceding - any match, the remaining text if no match was found, or None if no match - was found and 'remaining' is given as a false value. + Find the first match for the given 'expression'. Return the text + preceding any match, the remaining text if no match was found, or None + if no match was found and 'remaining' is given as a false value. """ if self.queued: self.match = self.queued self.queued = None else: - self.match_start = None self.matching = None # Find the first matching pattern. - for pattern_name, pattern in patterns.items(): - match = pattern.search(self.s, self.pos) - if match: - start, end = match.span() - if self.matching is None or start < self.start: - self.start = start - self.matching = pattern_name + match = expression.search(self.s, self.pos) + + if match: + for name, value in match.groupdict().items(): + + # Use a group with a non-null value to identify the + # matching pattern. + + if name.startswith("group_") and value is not None: + self.matching = name[len("group_"):] + self.start, self.end = match.span() self.match = match + break + + # Return the remaining text, if appropriate. if self.matching is None: + self.groups = {} if remaining: return self.s[self.pos:] else: return None else: + self.groups = self.filter_groups() return self.s[self.pos:self.start] - def match_group(self, group=1): + def filter_groups(self): + + "Filter groups from the current match for the matching pattern." 
+ + d = {} + for key, value in self.match.groupdict().items(): + if key.startswith("%s_" % self.matching): + d[key] = value + return d + + def match_group(self, group=None): """ Return the matched text, updating the position in the stream. If 'group' is specified, the indicated group in a match will be returned. - Typically, group 1 should contain all pertinent data, but groups defined - within group 1 can provide sections of the data. + Otherwise, the entire match is returned. """ self.update_pos() if self.match: - try: - return self.match.group(group) - except IndexError: - return "" + if group is None: + return self.s[self.start:self.end] + else: + return self.groups.get("%s_%s" % (self.matching, group)) else: return None @@ -184,9 +223,12 @@ if self.match: if groups is None: - return self.match.groups() + return self.groups else: - return self.match.groups(groups) + l = [] + for group in groups: + l.append(self.groups.get("%s_%s" % (self.matching, group))) + return l else: return [] @@ -233,11 +275,11 @@ else: return None - def get_patterns(self, pattern_names): + def get_expression(self, pattern_names): "Return a mapping of the given 'pattern_names' to patterns." - return get_subset(self.patterns, pattern_names) + return get_expression(self.patterns, pattern_names) def get_items(self, s, pos=0): @@ -260,12 +302,13 @@ or None if no match was found and 'remaining' is given as a false value. """ - return self.items.read_until(self.get_patterns(pattern_names)) + return self.items.read_until(self.get_expression(pattern_names)) - def match_group(self, group=1): + def match_group(self, group=None): """ - Return the group of the matching pattern with the given 'group' number. + Return the group of the matching pattern with the given 'group' + identifier. If 'group' is omitted or None, return the entire match. """ return self.items.match_group(group) @@ -407,7 +450,7 @@ # Obtain any feature. 
- feature = self.match_group() + feature = self.match_group(None) handler = self.handlers.get(self.matching_pattern()) # Handle each feature or add text to the region.