MoinLight (file moinformat/parsers/common.py at 24e8cc2a2a1e)

     1 #!/usr/bin/env python     2      3 """     4 Moin wiki parsing functionality.     5      6 Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from collections import defaultdict    23 from moinformat.tree import Block, Region, Text    24 import re    25     26 # Pattern management.    27     28 ws_excl_nl = r"[ \f\r\t\v]"    29     30 def get_patterns(syntax):    31     32     """    33     Define patterns for the regular expressions in the 'syntax' mapping. In each    34     pattern, replace \N with a pattern for matching whitespace excluding    35     newlines.    36     """    37     38     patterns = {}    39     for name, value in syntax.items():    40         value = value.replace(r"\N", ws_excl_nl)    41         patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)    42     return patterns    43     44 def get_subset(d, keys):    45     46     "Return a subset of 'd' having the given 'keys'."    47     48     subset = {}    49     for key in keys:    50         subset[key] = d[key]    51     return subset    52     53     54     55 # Tokenising functions.    56     57 class TokenStream:    58     59     "A stream of tokens taken from a string."    60     61     def __init__(self, s, pos=0):    62         self.s = s    63         self.pos = pos    64     65         # Match details.    66     67         self.match = None    68         self.queued = None    69         self.match_start = None    70     71         # Pattern name details.    72     73         self.matching = None    74     75     def rewind(self, length):    76     77         "Rewind in the string by 'length'."    78     79         self.pos -= min(length, self.pos)    80     81     def queue_match(self):    82     83         "Rewind in the string to the start of the last match."    84     85         self.queued = self.match    86     87     def read_until(self, patterns, remaining=True):    88     89         """    90         Find the first match for the given 'patterns'. Return the text preceding    91         any match, the remaining text if no match was found, or None if no match    92         was found and 'remaining' is given as a false value.    93         """    94     95         if self.queued:    96             self.match = self.queued    97             self.queued = None    98         else:    99             self.match_start = None   100             self.matching = None   101    102             # Find the first matching pattern.   103    104             for pattern_name, pattern in patterns.items():   105                 match = pattern.search(self.s, self.pos)   106                 if match:   107                     start, end = match.span()   108                     if self.matching is None or start < self.start:   109                         self.start = start   110                         self.matching = pattern_name   111                         self.match = match   112    113         if self.matching is None:   114             if remaining:   115                 return self.s[self.pos:]   116             else:   117                 return None   118         else:   119             return self.s[self.pos:self.start]   120    121     def read_match(self, group=1):   122    123         """   124         Return the matched text, updating the position in the stream. If 'group'   125         is specified, the indicated group in a match will be returned.   126         Typically, group 1 should contain all pertinent data, but groups defined   127         within group 1 can provide sections of the data.   128         """   129    130         if self.match:   131             _start, self.pos = self.match.span()   132             try:   133                 return self.match.group(group)   134             except IndexError:   135                 return ""   136         else:   137             self.pos = len(self.s)   138             return None   139    140     def match_groups(self):   141    142         "Return the match groups."   143    144         if self.match:   145             return self.match.groups()   146         else:   147             return []   148    149    150    151 # Parser abstractions.   152    153 class ParserBase:   154    155     "Common parsing methods."   156    157     region_pattern_names = None   158    159     def __init__(self, formats=None):   160    161         """   162         Initialise the parser with any given 'formats' mapping from region type   163         names to parser objects.   164         """   165    166         self.formats = formats   167    168     def get_parser(self, format_type):   169    170         """   171         Return a parser for 'format_type' or None if no suitable parser is found.   172         """   173    174         if not self.formats:   175             return None   176    177         cls = self.formats.get(format_type)   178         if cls:   179             return cls(self.formats)   180         else:   181             return None   182    183     def get_patterns(self, pattern_names):   184    185         "Return a mapping of the given 'pattern_names' to patterns."   186    187         return get_subset(self.patterns, pattern_names)   188    189     def get_items(self, s, pos=0):   190    191         "Return a sequence of token items for 's' and 'pos'."   192    193         return TokenStream(s, pos)   194    195     def set_region(self, items, region):   196    197         "Set the 'items' used to populate the given 'region'."   198    199         self.items = items   200         self.region = region   201    202     def read_until(self, pattern_names, remaining=True):   203    204         """   205         Read the next portion of input, matching using 'pattern_names'. Return   206         the text preceding any match, the remaining text if no match was found,   207         or None if no match was found and 'remaining' is given as a false value.   208         """   209    210         return self.items.read_until(self.get_patterns(pattern_names))   211    212     def read_match(self, group=1):   213    214         """   215         Return the group of the matching pattern with the given 'group' number.   216         """   217    218         return self.items.read_match(group)   219    220     def read_matching(self):   221    222         "Return the name of the matching pattern."   223    224         return self.items.matching   225    226     def match_groups(self):   227    228         "Return the number of groups in the match."   229    230         return self.items.match_groups()   231    232     # Parser methods invoked from other objects.   233    234     def parse(self, s):   235    236         """   237         Parse page text 's'. Pages consist of regions delimited by markers.   238         """   239    240         self.items = self.get_items(s)   241         self.region = self.parse_region()   242         return self.region   243    244     def parse_region_content(self, items, region):   245    246         "Parse the data provided by 'items' to populate a 'region'."   247    248         self.set_region(items, region)   249    250         # Define a block to hold text and start parsing.   251    252         self.new_block(region)   253    254         if self.region_pattern_names:   255             self.parse_region_details(region, self.region_pattern_names)   256    257     # Top-level parser handler methods.   258    259     def parse_region(self, level=0, indent=0, type=None):   260    261         """   262         Parse the data to populate a region with the given 'level' at the given   263         'indent' having the given initial 'type'.   264         """   265    266         region = Region([], level, indent, type)   267    268         # Parse section headers, then parse according to region type.   269    270         self.parse_region_header(region)   271         self.parse_region_type(region)   272    273         return region   274    275     def parse_region_type(self, region):   276    277         """   278         Use configured parsers to parse 'region' based on its type.   279         """   280    281         # Handle potentially inline regions.   282    283         if region.type == "inline":   284             self.parse_region_inline(region)   285             return   286    287         # Find an appropriate parser given the type.   288    289         parser = self.get_parser(region.type)   290    291         if parser:   292             parser.parse_region_content(self.items, region)   293    294         # Otherwise, treat the section as opaque.   295    296         else:   297             self.parse_region_opaque(region)   298    299     def parse_region_header(self, region):   300    301         """   302         Parse the region header, setting it on the 'region' object.   303         """   304    305         if self.read_until(["header"], False) == "": # None means no header   306             region.type = self.read_match()   307    308     def parse_region_opaque(self, region):   309    310         "Parse the data to populate an opaque 'region'."   311    312         region.transparent = False   313         self.parse_region_details(region, ["regionend"])   314    315     def parse_region_inline(self, region):   316    317         "Parse the data to populate an inline 'region'."   318    319         region.transparent = False   320         self.parse_region_details(region, ["regionend"])   321    322         # Reset the type if the region was not inline.   323    324         if region.type == "inline":   325             first = region.nodes and region.nodes[0]   326             if first and isinstance(first, Text) and first.multiline():   327                 region.type = None   328    329     # Parsing utilities.   330    331     def parse_region_details(self, region, pattern_names, strict=False):   332    333         """   334         Search 'region' using the 'pattern_names'. If 'strict' is set to a true   335         value, forbid the accumulation of additional textual padding.   336         """   337    338         try:   339             while True:   340    341                 # Obtain text before any marker or the end of the input.   342    343                 preceding = self.read_until(pattern_names)   344                 if preceding:   345                     if not strict:   346                         region.append_inline(Text(preceding))   347                     else:   348                         break   349    350                 # End of input.   351    352                 if not self.read_matching():   353                     break   354    355                 # Obtain any feature.   356    357                 feature = self.read_match()   358                 handler = self.handlers.get(self.read_matching())   359    360                 # Handle each feature or add text to the region.   361    362                 if handler:   363                     handler(self, region)   364                 elif not strict:   365                     region.append_inline(Text(feature))   366                 else:   367                     break   368    369         except StopIteration:   370             pass   371    372         region.normalise()   373    374     def add_node(self, region, node):   375    376         "Add to 'region' the given 'node'."   377    378         region.add(node)   379    380     def append_node(self, region, node):   381    382         "Append to 'region' the given 'node'."   383    384         region.append(node)   385    386     def end_region(self, region):   387    388         "End the parsing of 'region', breaking out of the parsing loop."   389    390         raise StopIteration   391    392     def queue_match(self):   393    394         "Queue the current match."   395    396         self.items.queue_match()   397    398     def new_block(self, region):   399    400         "Start a new block in 'region'."   401    402         self.add_node(region, Block([]))   403    404 # vim: tabstop=4 expandtab shiftwidth=4