MoinLight (file moinformat/parsers/common.py at f19281465a63)

     #!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018, 2019, 2021, 2023 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

# Fragments substituted for the \N, \Q, \E and \P placeholders by get_patterns.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]'                  # ['"]
dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option
dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern (negative lookahead) for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting (looking ahead for) 's'."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern wrapping 's' in a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits.

    A limit given as None is left out of the pattern. A limit of zero is
    written out explicitly.
    """

    # Test against None explicitly: the "x and x or ''" idiom would discard a
    # limit of zero, turning "{m,0}" into the unbounded "{m,}".

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \E with a pattern for matching all characters including newlines
    \N with a pattern for matching whitespace excluding newlines
    \P with a pattern for matching all characters within a paragraph
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a new mapping from each pattern name to its expanded pattern string.
    """

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)
        value = value.replace(r"\P", dotparagraph)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.
        # TokenStream.read_until looks for this group to learn which pattern
        # matched.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order given by
    'keys'. A key missing from 'd' causes a KeyError.
    """

    combined = "|".join(d[key] for key in keys)
    return re.compile(combined, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Span of the current match, meaningful only when a match is present.

        self.start = None
        self.end = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until yields it
        again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        If a match has been queued, it is reused and 'expression' is not
        consulted. The stream position is not advanced by this method: the
        match must be retrieved using match_group or match_groups.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None
            self.match = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            return None

        # Return the preceding text, requiring the match to be retrieved.

        self.groups = self.filter_groups()
        return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern, returning
        a mapping of group names, with the pattern name prefix removed, to
        matched values.
        """

        prefix = "%s_" % self.matching

        d = {}
        for key, value in self.match.groupdict().items():
            if key.startswith(prefix):
                d[key[len(prefix):]] = value
        return d

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without a current match, the
        position moves to the end of the string and None is returned.
        """

        self.update_pos()

        if not self.match:
            return None

        if group is None:
            return self.s[self.start:self.end]
        return self.groups.get(group)

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. With 'groups' given, a list of the values of
        the named groups is returned. Otherwise, the mapping of all groups is
        returned. Without a current match, an empty list is returned.
        """

        self.update_pos()

        if not self.match:
            return []

        if groups is None:
            return self.groups
        return [self.groups.get(group) for group in groups]

    def update_pos(self):

        """
        Update the position in the stream, moving it to the end of the current
        match or, without a match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    "Common parsing methods."

    # NOTE: the methods below also read self.patterns and self.handlers, which
    # are not defined in this class; presumably subclasses provide them.

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def update_metadata(self, metadata):

        "Do nothing with the given 'metadata' in this base class."

        pass

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any new parser shares this parser's metadata and parsers, and has this
        parser's root (or this parser itself, if it has no root) as its root.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:
            return cls(self.metadata, self.parsers, self.root or self)
        return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns having the given
        'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        """
        Return all groups of the current match as provided by the token stream,
        or an empty list if there is no current match.
        """

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Only parse directives if the region is transparent.

        if region.transparent:
            self.parse_region_directives(region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the new region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type. Without a dedicated
        # parser, the region is marked as not transparent and is handed to the
        # "moin" parser.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # The header must appear immediately: any preceding text means that
        # there is no header at this position.

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")

            # The type is the first space-separated word of the arguments.

            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        try:
            while True:
                preceding = self.read_until(["directive"], False)

                # With an immediately-appearing directive, handle its details.

                if preceding == "":
                    handler = self.handlers.get(self.matching_pattern())
                    if handler:
                        handler(self, region)
                    else:
                        break

                # Otherwise, with no immediate directive (or none at all), stop.

                else:
                    break

        # Handle a premature end of region.

        except StopIteration:
            pass

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until a handler raises
        StopIteration (see end_region), or, when 'strict' is set, until text or
        an unhandled feature is encountered. The region is then normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    self.match_group()
                    break

                # Obtain any feature. Retrieving the match also advances the
                # stream beyond it.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node' using the region's add method."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node' using the region's append method."

        region.append(node)

    def append_inline(self, region, node):

        """
        Append to 'region' the given 'node' using the region's append_inline
        method.
        """

        region.append_inline(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        # The parsing loops in this class catch StopIteration in order to stop.

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the current
        region accepts the end marker's level, parsing of the region is ended by
        raising StopIteration. Otherwise, the marker is added to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4