MoinLight (file moinformat/parsers/common.py at d517824d2df5)

     1 #!/usr/bin/env python     2      3 """     4 Moin wiki parsing functionality.     5      6 Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from collections import defaultdict    23 from moinformat.tree import Block, Region, Text    24 import re    25     26 # Pattern management.    27     28 ws_excl_nl = r"[ \f\r\t\v]"    29     30 def get_patterns(syntax):    31     32     """    33     Define patterns for the regular expressions in the 'syntax' mapping. In each    34     pattern, replace \N with a pattern for matching whitespace excluding    35     newlines.    36     """    37     38     patterns = {}    39     for name, value in syntax.items():    40         value = value.replace(r"\N", ws_excl_nl)    41         patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)    42     return patterns    43     44 def get_subset(d, keys):    45     46     "Return a subset of 'd' having the given 'keys'."    47     48     subset = {}    49     for key in keys:    50         subset[key] = d[key]    51     return subset    52     53     54     55 # Tokenising functions.    56     57 class TokenStream:    58     59     "A stream of tokens taken from a string."    60     61     def __init__(self, s, pos=0):    62         self.s = s    63         self.pos = pos    64     65         # Match details.    66     67         self.match = None    68         self.queued = None    69         self.match_start = None    70     71         # Pattern name details.    72     73         self.matching = None    74     75     def rewind(self, length):    76     77         "Rewind in the string by 'length'."    78     79         self.pos -= min(length, self.pos)    80     81     def queue_match(self):    82     83         "Rewind in the string to the start of the last match."    84     85         self.queued = self.match    86     87     def read_until(self, patterns, remaining=True):    88     89         """    90         Find the first match for the given 'patterns'. Return the text preceding    91         any match, the remaining text if no match was found, or None if no match    92         was found and 'remaining' is given as a false value.    93         """    94     95         if self.queued:    96             self.match = self.queued    97             self.queued = None    98         else:    99             self.match_start = None   100             self.matching = None   101    102             # Find the first matching pattern.   103    104             for pattern_name, pattern in patterns.items():   105                 match = pattern.search(self.s, self.pos)   106                 if match:   107                     start, end = match.span()   108                     if self.matching is None or start < self.start:   109                         self.start = start   110                         self.matching = pattern_name   111                         self.match = match   112    113         if self.matching is None:   114             if remaining:   115                 return self.s[self.pos:]   116             else:   117                 return None   118         else:   119             return self.s[self.pos:self.start]   120    121     def match_group(self, group=1):   122    123         """   124         Return the matched text, updating the position in the stream. If 'group'   125         is specified, the indicated group in a match will be returned.   126         Typically, group 1 should contain all pertinent data, but groups defined   127         within group 1 can provide sections of the data.   128         """   129    130         self.update_pos()   131    132         if self.match:   133             try:   134                 return self.match.group(group)   135             except IndexError:   136                 return ""   137         else:   138             return None   139    140     def match_groups(self, groups=None):   141    142         "Return the match 'groups', or all groups if unspecified."   143    144         self.update_pos()   145    146         if self.match:   147             if groups is None:   148                 return self.match.groups()   149             else:   150                 return self.match.groups(groups)   151         else:   152             return []   153    154     def update_pos(self):   155    156         "Update the position in the stream."   157    158         if self.match:   159             _start, self.pos = self.match.span()   160         else:   161             self.pos = len(self.s)   162    163    164    165 # Parser abstractions.   166    167 class ParserBase:   168    169     "Common parsing methods."   170    171     region_pattern_names = None   172    173     def __init__(self, formats=None):   174    175         """   176         Initialise the parser with any given 'formats' mapping from region type   177         names to parser objects.   178         """   179    180         self.formats = formats   181    182     def get_parser(self, format_type):   183    184         """   185         Return a parser for 'format_type' or None if no suitable parser is found.   186         """   187    188         if not self.formats:   189             return None   190    191         cls = self.formats.get(format_type)   192         if cls:   193             return cls(self.formats)   194         else:   195             return None   196    197     def get_patterns(self, pattern_names):   198    199         "Return a mapping of the given 'pattern_names' to patterns."   200    201         return get_subset(self.patterns, pattern_names)   202    203     def get_items(self, s, pos=0):   204    205         "Return a sequence of token items for 's' and 'pos'."   206    207         return TokenStream(s, pos)   208    209     def set_region(self, items, region):   210    211         "Set the 'items' used to populate the given 'region'."   212    213         self.items = items   214         self.region = region   215    216     def read_until(self, pattern_names, remaining=True):   217    218         """   219         Read the next portion of input, matching using 'pattern_names'. Return   220         the text preceding any match, the remaining text if no match was found,   221         or None if no match was found and 'remaining' is given as a false value.   222         """   223    224         return self.items.read_until(self.get_patterns(pattern_names))   225    226     def match_group(self, group=1):   227    228         """   229         Return the group of the matching pattern with the given 'group' number.   230         """   231    232         return self.items.match_group(group)   233    234     def matching_pattern(self):   235    236         "Return the name of the matching pattern."   237    238         return self.items.matching   239    240     def match_groups(self):   241    242         "Return the number of groups in the match."   243    244         return self.items.match_groups()   245    246     # Parser methods invoked from other objects.   247    248     def parse(self, s):   249    250         """   251         Parse page text 's'. Pages consist of regions delimited by markers.   252         """   253    254         self.items = self.get_items(s)   255         self.region = self.parse_region()   256         return self.region   257    258     def parse_region_content(self, items, region):   259    260         "Parse the data provided by 'items' to populate a 'region'."   261    262         self.set_region(items, region)   263    264         # Define a block to hold text and start parsing.   265    266         self.new_block(region)   267    268         if self.region_pattern_names:   269             self.parse_region_details(region, self.region_pattern_names)   270    271     # Top-level parser handler methods.   272    273     def parse_region(self, level=0, indent=0, type=None):   274    275         """   276         Parse the data to populate a region with the given 'level' at the given   277         'indent' having the given initial 'type'.   278         """   279    280         region = Region([], level, indent, type)   281    282         # Parse section headers, then parse according to region type.   283    284         self.parse_region_header(region)   285         self.parse_region_type(region)   286    287         return region   288    289     def parse_region_type(self, region):   290    291         """   292         Use configured parsers to parse 'region' based on its type.   293         """   294    295         # Handle potentially inline regions.   296    297         if region.type == "inline":   298             self.parse_region_inline(region)   299             return   300    301         # Find an appropriate parser given the type.   302    303         parser = self.get_parser(region.type)   304    305         if parser:   306             parser.parse_region_content(self.items, region)   307    308         # Otherwise, treat the section as opaque.   309    310         else:   311             self.parse_region_opaque(region)   312    313     def parse_region_header(self, region):   314    315         """   316         Parse the region header, setting it on the 'region' object.   317         """   318    319         if self.read_until(["header"], False) == "": # None means no header   320             region.type = self.match_group()   321    322     def parse_region_opaque(self, region):   323    324         "Parse the data to populate an opaque 'region'."   325    326         region.transparent = False   327         self.parse_region_details(region, ["regionend"])   328    329     def parse_region_inline(self, region):   330    331         "Parse the data to populate an inline 'region'."   332    333         region.transparent = False   334         self.parse_region_details(region, ["regionend"])   335    336         # Reset the type if the region was not inline.   337    338         if region.type == "inline":   339             first = region.nodes and region.nodes[0]   340             if first and isinstance(first, Text) and first.multiline():   341                 region.type = None   342    343     # Parsing utilities.   344    345     def parse_region_details(self, region, pattern_names, strict=False):   346    347         """   348         Search 'region' using the 'pattern_names'. If 'strict' is set to a true   349         value, forbid the accumulation of additional textual padding.   350         """   351    352         try:   353             while True:   354    355                 # Obtain text before any marker or the end of the input.   356    357                 preceding = self.read_until(pattern_names)   358                 if preceding:   359                     if not strict:   360                         region.append_inline(Text(preceding))   361                     else:   362                         break   363    364                 # End of input.   365    366                 if not self.matching_pattern():   367                     break   368    369                 # Obtain any feature.   370    371                 feature = self.match_group()   372                 handler = self.handlers.get(self.matching_pattern())   373    374                 # Handle each feature or add text to the region.   375    376                 if handler:   377                     handler(self, region)   378                 elif not strict:   379                     region.append_inline(Text(feature))   380                 else:   381                     break   382    383         except StopIteration:   384             pass   385    386         region.normalise()   387    388     def add_node(self, region, node):   389    390         "Add to 'region' the given 'node'."   391    392         region.add(node)   393    394     def append_node(self, region, node):   395    396         "Append to 'region' the given 'node'."   397    398         region.append(node)   399    400     def end_region(self, region):   401    402         "End the parsing of 'region', breaking out of the parsing loop."   403    404         raise StopIteration   405    406     def queue_match(self):   407    408         "Queue the current match."   409    410         self.items.queue_match()   411    412     def new_block(self, region):   413    414         "Start a new block in 'region'."   415    416         self.add_node(region, Block([]))   417    418 # vim: tabstop=4 expandtab shiftwidth=4