#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

# NOTE: cgi.escape was deprecated in Python 3.2 and removed in Python 3.8
# (the cgi module itself was removed in 3.13). html.escape is the
# replacement; the quoting behaviour of the old function is selected
# explicitly at each call site below to keep the output unchanged.

from html import escape
import re

# Regular expressions.

syntax = {
    # Page regions:
    "marker" : (r"^\s*([{]{3,}|[}]{3,})", re.MULTILINE | re.DOTALL), # {{{... or }}}...

    # Region contents:
    "header" : (r"#!(.*?)\n", 0),          # #! char-excl-nl
    "break"  : (r"^\s*?\n", re.MULTILINE), # blank line (no capture group)
    }

# Compile the patterns once at import time.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)



# Document nodes.

class Container:

    "A container of document nodes."

    def __init__(self, nodes):

        # 'nodes' is the mutable list of child nodes.

        self.nodes = nodes

    def append(self, node):

        "Append 'node' to this container's children."

        self.nodes.append(node)

    def normalise(self):

        "Combine adjacent text nodes into single nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if not text:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text:
            self.append(text)

class Region(Container):

    "A region of the page."

    # Region types whose contents are parsed as wiki text.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):

        """
        Initialise the region with the given 'nodes', a marker 'level' (the
        number of braces in the opening marker, 0 for the whole page) and an
        optional region 'type' (from a #!type header).
        """

        Container.__init__(self, nodes)
        self.level = level
        self.type = type

    def have_start(self, s):

        "Return whether 's' opens a subregion within this region."

        return self.is_transparent() and s.startswith("{")

    def have_end(self, s):

        "Return whether 's' is the end marker corresponding to this region."

        return self.level and s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        "Return whether this region's contents are parsed as wiki text."

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def to_string(self, out):

        "Serialise this region and its children using the serialiser 'out'."

        out.start_region(self.level, self.type)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(self.level, self.type)

class Block(Container):

    "A block (paragraph) in the page."

    def __init__(self, nodes, final=True):

        # 'final' is False when the block was terminated by a paragraph
        # break, in which case the break is reproduced on serialisation.

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def to_string(self, out):

        "Serialise this block and its children using the serialiser 'out'."

        out.start_block(self.final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(self.final)

class Text:

    "A text node."

    def __init__(self, s):
        self.s = s

    def merge(self, text):

        "Merge the given 'text' node into this one."

        self.s += text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def to_string(self, out):
        out.text(self.s)



# Serialisation.

class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        # 'out' is a callable accepting a string fragment.

        self.out = out

class MoinSerialiser(Serialiser):

    "Serialisation of the page back to Moin wiki format."

    def start_region(self, level, type):
        out = self.out
        if level:
            out("{" * level)       # marker
        if type and level:
            out("#!%s\n" % type)   # header

    def end_region(self, level, type):
        out = self.out
        if level:
            out("}" * level)       # marker

    def start_block(self, final):
        pass

    def end_block(self, final):

        # Reinstate the paragraph break that terminated a non-final block.

        if not final:
            self.out("\n")

    def text(self, s):
        self.out(s)

class HTMLSerialiser(Serialiser):

    "Serialisation of the page as HTML."

    def start_region(self, level, type):
        l = []
        out = l.append
        if level:
            out("level-%d" % level)             # marker

        # NOTE: Encode type details for CSS.

        if type:
            out("type-%s" % escape(type, True)) # header; quote=True for the attribute

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):
        self.out("</span>")

    def start_block(self, final):
        self.out("<p>")

    def end_block(self, final):
        self.out("</p>")

    def text(self, s):

        # quote=False preserves the behaviour of the original cgi.escape,
        # which left quotes untouched in text content.

        self.out(escape(s, False))



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):
        self.s = s          # the complete input text
        self.pos = 0        # current reading position
        self.match = None   # the match object from the last search, if any
        self.matching = None # the name of the pattern that matched, if any

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None

        # Discard any state left over from a previous search: a stale match
        # would otherwise be consumed by a later read_match call, yielding
        # spurious tokens and corrupting the stream position.

        self.match = None
        self.matching = None

        # Find the earliest-starting match amongst the given patterns.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start, end = match.span()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self):

        "Return the matched text, updating the position in the stream."

        if self.match:
            _start, self.pos = self.match.span()

            # Patterns without a capture group (such as "break") yield the
            # whole match; others yield their first group. Unconditionally
            # requesting group 1 would raise IndexError for "break".

            if self.match.re.groups:
                s = self.match.group(1)
            else:
                s = self.match.group(0)
            self.match = None
            return s
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level Region representing the whole page.
    """

    items = TokenStream(s)

    # Define a region for the page and parse it.

    region = Region([])
    parse_region(items, region)
    return region

def parse_region(items, region):

    "Parse the data provided by 'items' to populate 'region'."

    # Parse section headers.

    parse_region_header(items, region)

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    # Process exposed text and sections.

    block = Block([])
    region.append(block)

    while True:

        # Obtain text before any marker or the end of the input.

        match_text = items.read_until(["break", "marker"])
        if match_text:
            block.append(Text(match_text))

        # Obtain any feature.

        feature = items.read_match()

        # End of input.

        if not items.matching:
            break

        # Start a section if an appropriate marker is given.

        if region.have_start(feature):

            # Define the section and parse it.

            _region = Region([], len(feature))
            region.append(_region)
            parse_region(items, _region)

            # Start a new block after the section.

            block = Block([])
            region.append(block)

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        elif region.have_end(feature):
            break

        # Start a new block if a paragraph break is found.

        elif items.matching == "break":
            block.final = False
            block = Block([])
            region.append(block)

        # Add any inappropriate marker to the text.

        else:
            block.append(Text(feature))

    region.normalise()

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Process exposed text and sections.

    while True:

        # Obtain text before any marker or the end of the input.

        match_text = items.read_until(["marker"])
        if match_text:
            region.append(Text(match_text))

        # Obtain any marker.

        marker = items.read_match()

        # End of input.

        if not marker:
            break

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        if region.have_end(marker):
            break

        # Add any inappropriate marker to the text.

        else:
            region.append(Text(marker))

    region.normalise()

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()



# Top-level functions.
# The module-level entry point for parsing page text.

parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    """
    Serialise the document 'doc' with an instance of the given 'serialiser'
    class, returning the complete output as a single string.
    """

    fragments = []
    writer = serialiser(fragments.append)
    doc.to_string(writer)
    return "".join(fragments)

# vim: tabstop=4 expandtab shiftwidth=4