MoinLight (file moinformat/parsers/common.py at 37e672f4923f)

     1 #!/usr/bin/env python     2      3 """     4 Moin wiki parsing functionality.     5      6 Copyright (C) 2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from collections import defaultdict    23 from moinformat.tree.moin import Block, Region, Text    24 import re    25     26 # Pattern management.    27     28 ws_excl_nl = r"[ \f\r\t\v]"    29 quotes = "['" '"]'                  # ['"]    30 dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option    31 dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs    32     33 def choice(l):    34     35     "Return a pattern matching a choice of patterns in 'l'."    36     37     return "(%s)" % "|".join(l)    38     39 def excl(s):    40     41     "Return a non-matching pattern for 's'."    42     43     return "(?!%s)" % s    44     45 def expect(s):    46     47     "Return a pattern expecting 's'."    48     49     return "(?=%s)" % s    50     51 def group(name, s):    52     53     "Return a pattern group having 'name' and the pattern string 's'."    54     55     return "(?P<%s>%s)" % (name, s)    56     57 def optional(s):    58     59     "Return an optional pattern."    60     61     return "(?:%s)?" % s    62     63 def recur(name):    64     65     "Return a test for a recurrence of group 'name'."    66     67     return "(?P=%s)" % name    68     69 def repeat(s, min=None, max=None):    70     71     "Return a pattern matching 's' for the given 'min' and 'max' limits."    72     73     return "%s{%s,%s}" % (s, min is not None and min or "",    74                              max is not None and max or "")    75     76 def get_pattern(s):    77     78     "Return a compiled regular expression for the given pattern 's'."    79     80     return re.compile(s, re.UNICODE | re.MULTILINE)    81     82 def get_patterns(syntax):    83     84     """    85     Define patterns for the regular expressions in the 'syntax' mapping. In each    86     pattern, replace \N with a pattern for matching whitespace excluding    87     newlines.    88     """    89     90     patterns = {}    91     for name, value in syntax.items():    92         value = value.replace(r"\N", ws_excl_nl)    93         value = value.replace(r"\Q", quotes)    94         value = value.replace(r"\E", dotall)    95         value = value.replace(r"\P", dotparagraph)    96         patterns[name] = get_pattern(value)    97     return patterns    98     99 def get_subset(d, keys):   100    101     "Return a subset of 'd' having the given 'keys'."   102    103     subset = {}   104     for key in keys:   105         subset[key] = d[key]   106     return subset   107    108    109    110 # Tokenising functions.   111    112 class TokenStream:   113    114     "A stream of tokens taken from a string."   115    116     def __init__(self, s, pos=0):   117         self.s = s   118         self.pos = pos   119    120         # Match details.   121    122         self.match = None   123         self.queued = None   124         self.match_start = None   125    126         # Pattern name details.   127    128         self.matching = None   129    130     def rewind(self, length):   131    132         "Rewind in the string by 'length'."   133    134         self.pos -= min(length, self.pos)   135    136     def queue_match(self):   137    138         "Rewind in the string to the start of the last match."   139    140         self.queued = self.match   141    142     def read_until(self, patterns, remaining=True):   143    144         """   145         Find the first match for the given 'patterns'. Return the text preceding   146         any match, the remaining text if no match was found, or None if no match   147         was found and 'remaining' is given as a false value.   148         """   149    150         if self.queued:   151             self.match = self.queued   152             self.queued = None   153         else:   154             self.match_start = None   155             self.matching = None   156    157             # Find the first matching pattern.   158    159             for pattern_name, pattern in patterns.items():   160                 match = pattern.search(self.s, self.pos)   161                 if match:   162                     start, end = match.span()   163                     if self.matching is None or start < self.start:   164                         self.start = start   165                         self.matching = pattern_name   166                         self.match = match   167    168         if self.matching is None:   169             if remaining:   170                 return self.s[self.pos:]   171             else:   172                 return None   173         else:   174             return self.s[self.pos:self.start]   175    176     def match_group(self, group=1):   177    178         """   179         Return the matched text, updating the position in the stream. If 'group'   180         is specified, the indicated group in a match will be returned.   181         Typically, group 1 should contain all pertinent data, but groups defined   182         within group 1 can provide sections of the data.   183         """   184    185         self.update_pos()   186    187         if self.match:   188             try:   189                 return self.match.group(group)   190             except IndexError:   191                 return ""   192         else:   193             return None   194    195     def match_groups(self, groups=None):   196    197         "Return the match 'groups', or all groups if unspecified."   198    199         self.update_pos()   200    201         if self.match:   202             if groups is None:   203                 return self.match.groups()   204             else:   205                 return self.match.groups(groups)   206         else:   207             return []   208    209     def update_pos(self):   210    211         "Update the position in the stream."   212    213         if self.match:   214             _start, self.pos = self.match.span()   215         else:   216             self.pos = len(self.s)   217    218    219    220 # Parser abstractions.   221    222 class ParserBase:   223    224     "Common parsing methods."   225    226     region_pattern_names = None   227    228     def __init__(self, metadata, parsers=None, root=None):   229    230         """   231         Initialise the parser with the given 'metadata' and optional 'parsers'.   232         An optional 'root' indicates the document-level parser.   233         """   234    235         self.metadata = metadata   236         self.parsers = parsers   237         self.root = root   238    239     def get_parser(self, format_type):   240    241         """   242         Return a parser for 'format_type' or None if no suitable parser is found.   243         """   244    245         cls = self.parsers and self.parsers.get(format_type)   246         if cls:   247             return cls(self.metadata, self.parsers, self.root or self)   248         else:   249             return None   250    251     def get_patterns(self, pattern_names):   252    253         "Return a mapping of the given 'pattern_names' to patterns."   254    255         return get_subset(self.patterns, pattern_names)   256    257     def get_items(self, s, pos=0):   258    259         "Return a sequence of token items for 's' and 'pos'."   260    261         return TokenStream(s, pos)   262    263     def set_region(self, items, region):   264    265         "Set the 'items' used to populate the given 'region'."   266    267         self.items = items   268         self.region = region   269    270     def read_until(self, pattern_names, remaining=True):   271    272         """   273         Read the next portion of input, matching using 'pattern_names'. Return   274         the text preceding any match, the remaining text if no match was found,   275         or None if no match was found and 'remaining' is given as a false value.   276         """   277    278         return self.items.read_until(self.get_patterns(pattern_names))   279    280     def match_group(self, group=1):   281    282         """   283         Return the group of the matching pattern with the given 'group' number.   284         """   285    286         return self.items.match_group(group)   287    288     def matching_pattern(self):   289    290         "Return the name of the matching pattern."   291    292         return self.items.matching   293    294     def match_groups(self):   295    296         "Return the number of groups in the match."   297    298         return self.items.match_groups()   299    300     # Parser methods invoked from other objects.   301    302     def parse(self, s):   303    304         """   305         Parse page text 's'. Pages consist of regions delimited by markers.   306         """   307    308         self.items = self.get_items(s)   309         self.region = self.parse_region()   310         return self.region   311    312     def parse_region_content(self, items, region):   313    314         "Parse the data provided by 'items' to populate a 'region'."   315    316         self.set_region(items, region)   317    318         # Only parse directives if the region is transparent.   319    320         if region.transparent:   321             self.parse_region_directives(region)   322    323         # Parse inline and opaque regions.   324    325         if not region.transparent:   326             pattern_names = ["regionend"]   327    328         # Define a block to hold text.   329    330         else:   331             self.new_block(region)   332             pattern_names = self.region_pattern_names   333    334         # Start parsing.   335    336         if pattern_names:   337             self.parse_region_details(region, pattern_names)   338    339         # Reset the type if the region was not inline.   340    341         if region.type == "inline":   342             first = region.nodes and region.nodes[0]   343             if first and isinstance(first, Text) and first.multiline():   344                 region.type = None   345    346     # Top-level parser handler methods.   347    348     def parse_region(self, level=0, indent=0, type=None):   349    350         """   351         Parse the data to populate a region with the given 'level' at the given   352         'indent' having the given initial 'type'.   353         """   354    355         region = Region([], level, indent, type)   356    357         # Parse section headers, then parse according to region type.   358    359         self.parse_region_header(region)   360         self.parse_region_type(region)   361    362         return region   363    364     def parse_region_type(self, region):   365    366         """   367         Use configured parsers to parse 'region' based on its type.   368         """   369    370         # Find an appropriate parser given the type.   371    372         parser = self.get_parser(region.type)   373         if not parser:   374             region.transparent = False   375         parser = parser or self.get_parser("moin")   376    377         parser.parse_region_content(self.items, region)   378    379     def parse_region_header(self, region):   380    381         """   382         Parse the region header, setting it on the 'region' object.   383         """   384    385         if self.read_until(["header"], False) == "": # None means no header   386             region.args = self.match_group("args")   387             region.type = region.args.split(" ", 1)[0]   388    389     def parse_region_directives(self, region):   390    391         """   392         Parse any directives immediately after the region header, adding them to   393         the 'region' object.   394         """   395    396         try:   397             while True:   398                 preceding = self.read_until(["directive"], False)   399    400                 # With an immediately-appearing directive, handle its details.   401    402                 if preceding == "":   403                     handler = self.handlers.get(self.matching_pattern())   404                     if handler:   405                         handler(self, region)   406                     else:   407                         break   408    409                 # Otherwise, with no immediate directive (or none at all), stop.   410    411                 else:   412                     break   413    414         # Handle a premature end of region.   415    416         except StopIteration:   417             pass   418    419     # Parsing utilities.   420    421     def parse_region_details(self, region, pattern_names, strict=False):   422    423         """   424         Search 'region' using the 'pattern_names'. If 'strict' is set to a true   425         value, forbid the accumulation of additional textual padding.   426         """   427    428         try:   429             while True:   430    431                 # Obtain text before any marker or the end of the input.   432    433                 preceding = self.read_until(pattern_names)   434                 if preceding:   435                     if not strict:   436                         region.append_inline(Text(preceding))   437                     else:   438                         break   439    440                 # End of input.   441    442                 if not self.matching_pattern():   443                     break   444    445                 # Obtain any feature.   446    447                 feature = self.match_group("feature") or self.match_group()   448                 handler = self.handlers.get(self.matching_pattern())   449    450                 # Handle each feature or add text to the region.   451    452                 if handler:   453                     handler(self, region)   454                 elif not strict:   455                     region.append_inline(Text(feature))   456                 else:   457                     break   458    459         except StopIteration:   460             pass   461    462         region.normalise()   463    464     def add_node(self, region, node):   465    466         "Add to 'region' the given 'node'."   467    468         region.add(node)   469    470     def append_node(self, region, node):   471    472         "Append to 'region' the given 'node'."   473    474         region.append(node)   475    476     def end_region(self, region):   477    478         "End the parsing of 'region', breaking out of the parsing loop."   479    480         raise StopIteration   481    482     def queue_match(self):   483    484         "Queue the current match."   485    486         self.items.queue_match()   487    488     def new_block(self, region):   489    490         "Start a new block in 'region'."   491    492         self.add_node(region, Block([]))   493    494     # Common handler methods.   495    496     def parse_region_end(self, node):   497    498         "Handle the end of a region occurring within 'node'."   499    500         level = self.match_group("level")   501         feature = self.match_group("feature")   502         self.region.extra = self.match_group("extra")   503    504         if self.region.have_end(level):   505             raise StopIteration   506         else:   507             node.append_inline(Text(feature))   508    509 # vim: tabstop=4 expandtab shiftwidth=4