#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

try:
    # The cgi module was deprecated in Python 3.8 and removed in Python 3.13.
    # Its replacement, html.escape, escapes quote characters by default,
    # whereas cgi.escape did not, so wrap it to preserve the original
    # behaviour of escape(s) and escape(s, True).

    from html import escape as _escape

    def escape(s, quote=False):

        "Escape HTML metacharacters in 's', also quotes if 'quote' is set."

        return _escape(s, quote)

except ImportError:
    from cgi import escape

import re

# Regular expressions: each entry maps a token name to a pattern string and
# the extra flags needed to compile it.

syntax = {
    # Page regions:
    "regionstart" : (r"^\s*([{]{3,})", re.MULTILINE | re.DOTALL), # {{{...
    "regionend" : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL), # }}}...
    "header" : (r"#!(.*?)\n", 0), # #! char-excl-nl

    # Region contents:
    "break" : (r"^(\s*?)\n", re.MULTILINE), # blank line
    }

# Compile the patterns once at import time for use by TokenStream.

patterns = dict((name, re.compile(value, re.UNICODE | flags))
                for name, (value, flags) in syntax.items())



# Document nodes.

class Container:

    "A container of document nodes."

    def __init__(self, nodes):
        self.nodes = nodes

    def append(self, node):
        self.nodes.append(node)

    # By default, textual content is appended like any other node; Region
    # overrides this to route text into the current block where appropriate.

    append_text = append

    def normalise(self):

        "Combine adjacent text nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if not text:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text:
            self.append(text)

    def __str__(self):
        return self.prettyprint()

    def prettyprint(self, indent=""):

        # Subclasses provide the real implementation; the base class has no
        # printable form of its own.

        pass

class Region(Container):

    "A region of the page."

    # Region types whose text is considered part of the surrounding wiki
    # content rather than opaque data.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):
        Container.__init__(self, nodes)
        self.level = level      # number of { characters in the marker
        self.type = type        # the #! header type, if any

    def append_text(self, s):

        # In transparent regions, text belongs to the current (last) block;
        # in opaque regions it is kept verbatim at the region level.

        if self.is_transparent():
            self.nodes[-1].append(s)
        else:
            self.append(s)

    def have_end(self, s):

        # An end marker must consist of exactly as many } characters as the
        # region's opening marker had { characters.

        return self.level and s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        # The top-level region (level 0) and wiki-typed regions contain
        # parsed wiki content.

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def prettyprint(self, indent=""):
        l = ["%sRegion: level=%d type=%s" % (indent, self.level, self.type)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):
        out.start_region(self.level, self.type)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(self.level, self.type)

class Block(Container):

    "A block in the page."

    def __init__(self, nodes, final=True):
        Container.__init__(self, nodes)
        self.final = final      # false if a paragraph break follows

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):
        l = ["%sBlock: final=%s" % (indent, self.final)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):
        out.start_block(self.final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(self.final)

class Text:

    "A text node."

    def __init__(self, s):
        self.s = s

    def merge(self, text):

        "Merge the content of the given 'text' node into this node."

        self.s += text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def prettyprint(self, indent=""):
        return "%sText: %r" % (indent, self.s)

    def to_string(self, out):
        out.text(self.s)



# Serialisation.

class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        # 'out' is a callable accepting each emitted string fragment.

        self.out = out

class MoinSerialiser(Serialiser):

    "Serialisation of the page back to Moin wiki markup."

    def start_region(self, level, type):
        out = self.out
        if level:
            out("{" * level) # marker
        if type and level:
            out("#!%s\n" % type) # header

    def end_region(self, level, type):
        out = self.out
        if level:
            out("}" * level) # marker

    def start_block(self, final):
        pass

    def end_block(self, final):

        # A non-final block is followed by a paragraph break.

        if not final:
            self.out("\n")

    def text(self, s):
        self.out(s)

class HTMLSerialiser(Serialiser):

    "Serialisation of the page as HTML."

    def start_region(self, level, type):
        l = []
        out = l.append
        if level:
            out("level-%d" % level) # marker

        # NOTE: Encode type details for CSS.

        if type:
            out("type-%s" % escape(type, True)) # header

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):
        self.out("</span>")

    def start_block(self, final):
        self.out("<p>")

    def end_block(self, final):
        self.out("</p>")

    def text(self, s):
        self.out(escape(s))



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):
        self.s = s
        self.pos = 0            # current position within 's'
        self.match = None       # most recent match object
        self.matching = None    # name of the pattern that matched

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None
        self.matching = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start, end = match.span()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self):

        "Return the matched text, updating the position in the stream."

        if self.match:
            _start, self.pos = self.match.span()
            s = self.match.group(1)
            return s
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0):

    """
    Parse the data provided by 'items' to populate a region at the given
    'level'.
    """

    region = Region([], level)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # An empty preceding string means the header occurs immediately; None
    # means no header was found at all.

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, ["break", "regionstart", "regionend"])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    "Parse 'items' within 'region' searching using 'pattern_names'."

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_text(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_text(Text(feature))

    # Handlers signal the end of the region by raising StopIteration.

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    # Mark any previous block as not being the final one in a sequence.

    block = region.nodes[-1]
    block.final = False
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.

    level = len(items.read_match())
    region.append(parse_region(items, level))
    new_block(region)

def parse_section_end(items, region):

    "Handle the end of a new section within 'region'."

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_text(Text(feature))

# Pattern handlers.
# Dispatch table mapping token names to handler functions; the None entry
# deals with the end of the input.

handlers = {
    None : end_region,
    "break" : parse_break,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    }

def new_block(region):

    "Start a new block in 'region'."

    region.append(Block([]))



# Top-level functions.

parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    "Serialise 'doc' using the given 'serialiser' class, returning a string."

    fragments = []
    doc.to_string(serialiser(fragments.append))
    return "".join(fragments)

# vim: tabstop=4 expandtab shiftwidth=4