#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Region, Text
import re

# Pattern management.

# Whitespace characters excluding newline, substituted for \N in patterns.
ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    # NOTE: raw docstring - a bare \N escape in a non-raw string is a
    # SyntaxError under Python 3.
    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines. Return a mapping from the names in 'syntax' to compiled regular
    expression objects.
    """

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns

def combine_patterns(patterns, syntax):

    "Combine 'patterns' with those defined by the given 'syntax' mapping."

    return combine_dicts([patterns, get_patterns(syntax)])

def combine_dicts(dicts):

    "Combine the given 'dicts', with later entries overriding earlier ones."

    combined = {}
    for d in dicts:
        combined.update(d)
    return combined



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, patterns, pos=0):

        """
        Initialise the stream with the string 's', a mapping of compiled
        'patterns' (as produced by get_patterns), and an optional starting
        position 'pos' within the string.
        """

        self.s = s
        self.patterns = patterns
        self.pos = pos

        # State of the most recent search: the match object and the name of
        # the pattern that produced it.

        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', never moving before the start."

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None
        self.matching = None

        # Find the earliest-starting match amongst the given patterns.

        for pattern_name in pattern_names:
            match = self.patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data. Return None and move
        to the end of the stream if no match is active.
        """

        if self.match:
            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                # The requested group was not defined in the pattern.
                return ""
        else:
            self.pos = len(self.s)
            return None



# Utility functions.

def new_block(region):

    "Start a new block in 'region'."

    region.add(Block([]))



# Parser abstractions.

class ParserBase:

    "Common parsing methods."

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats
        self.replaced_items = None

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        raise NotImplementedError

    def replace_items(self, items):

        "Replace the given 'items' with a sequence employing the same state."

        self.replaced_items = items
        return self.get_items(items.s, items.pos)

    def update_items(self, items):

        "Update the state of the replaced items with that of 'items'."

        self.replaced_items.pos = items.pos

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the resulting top-level region.
        """

        return self.parse_region(self.get_items(s))

    def parse_region(self, items, level=0, indent=0):

        """
        Parse the data provided by 'items' to populate a region with the given
        'level' at the given 'indent'. Return the new region.
        """

        region = Region([], level, indent)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(items, region)
        self.parse_region_type(items, region)

        return region

    def parse_region_type(self, items, region):

        """
        Given data provided by 'items', use configured parsers to parse the
        'region' based on its type.
        """

        # Find an appropriate parser given the type.
        # NOTE: dict.has_key was removed in Python 3; the "in" operator is
        # equivalent and works in both Python 2 and 3.

        if region.type in self.formats:
            self.formats[region.type].parse_region_content(items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(items, region)

    def parse_region_header(self, items, region):

        """
        Parse the region header from the 'items', setting it for the given
        'region'.
        """

        # An empty string means the header pattern matched at the current
        # position; None would mean no header was found at all.

        if items.read_until(["header"], False) == "":
            region.type = items.read_match()

    def parse_region_opaque(self, items, region):

        "Parse the data provided by 'items' to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(items, region, ["regionend"])

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate the given 'region'."

        pass

    # Parsing utilities.

    def parse_region_details(self, items, region, pattern_names):

        """
        Parse 'items' within 'region' searching using 'pattern_names'. Feature
        handlers may raise StopIteration (via end_region) to terminate the
        parsing loop.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = items.read_until(pattern_names)
                if preceding:
                    region.append_inline(Text(preceding))

                # End of input.

                if not items.matching:
                    break

                # Obtain any feature.
                # NOTE: self.handlers is expected to be provided by subclasses
                # - TODO confirm against concrete parser implementations.

                feature = items.read_match()
                handler = self.handlers.get(items.matching)

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, items, region)
                else:
                    region.append_inline(Text(feature))

        except StopIteration:
            pass

        region.normalise()

    def end_region(self, items, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration


# Format mapping initialisation.

def init_formats(formats):

    """
    Convert the given 'formats' mapping from a name-to-class mapping to a
    name-to-instance mapping with each parser instance employing the format
    mapping itself. Return the converted mapping.
    """

    d = {}
    for name, cls in formats.items():
        d[name] = cls(d)
    return d

# vim: tabstop=4 expandtab shiftwidth=4