MoinLight (file moinformat.py at c3831bd8835f)

     1 #!/usr/bin/env python     2      3 """     4 Moin wiki format parser.     5      6 Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from cgi import escape    23 import re    24     25 # Regular expressions.    26     27 syntax = {    28     # Page regions:    29     "regionstart"   : (r"^\s*([{]{3,})",                re.MULTILINE | re.DOTALL),  # {{{...    30     "regionend"     : (r"^\s*([}]{3,})",                re.MULTILINE | re.DOTALL),  # }}}...    31     "header"        : (r"#!(.*?)\n",                    0),                         # #! char-excl-nl    32     33     # Region contents:    34     "break"         : (r"^(\s*?)\n",                    re.MULTILINE),              # blank line    35     "listitem"      : (r"^((\s+)([*]|\d+[.]))",         re.MULTILINE),              # indent (list-item or number-item)    36     37     # List contents:    38     "listitemend"   : (r"^",                            re.MULTILINE),              # next line    39     }    40     41 # Define patterns for the regular expressions.    42     43 patterns = {}    44 for name, (value, flags) in syntax.items():    45     patterns[name] = re.compile(value, re.UNICODE | flags)    46     47     48     49 # Document nodes.    
class Container:

    "A container of document nodes."

    def __init__(self, nodes):
        self.nodes = nodes

    def append(self, node):

        "Add 'node' to the end of this container."

        self.nodes.append(node)

    # By default, text is added like any other node.

    append_text = append

    def empty(self):

        "Return whether this container has no nodes."

        return not self.nodes

    def normalise(self):

        "Combine adjacent text nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if text is None:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text is not None:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text is not None:
            self.append(text)

    def __str__(self):
        return self.prettyprint()

    def prettyprint(self, indent=""):

        """
        Return a multi-line description of this container and its nodes, with
        each line prefixed by 'indent'. Subclasses provide their own heading.
        """

        # A generic heading is used here so that str() on a plain container
        # yields a string instead of failing on a None result.

        return self._prettyprint_nodes("%s%s:" % (indent, self.__class__.__name__), indent)

    def _prettyprint_nodes(self, heading, indent):

        "Return 'heading' followed by the descriptions of all nodes."

        l = [heading]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def _nodes_to_string(self, out):

        "Serialise each node in turn using the serialiser 'out'."

        for node in self.nodes:
            node.to_string(out)

class Region(Container):

    "A region of the page."

    # Region types whose content is parsed as wiki markup.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):
        Container.__init__(self, nodes)
        self.level = level
        self.type = type

    def append(self, node):

        "Add 'node' to the region, replacing any empty final node."

        last = self.nodes and self.nodes[-1]
        if last and last.empty():
            self.nodes[-1] = node
        else:
            self.nodes.append(node)

    def append_text(self, s):

        """
        Add the text node 's'. In a transparent region the text is added to the
        final node of the region (which must therefore exist); otherwise it is
        added directly to the region.
        """

        if self.is_transparent():
            self.nodes[-1].append(s)
        else:
            self.append(s)

    def have_end(self, s):

        """
        Return whether 's' is an end marker for this region: a marker starting
        with "}" whose length equals the region level.
        """

        return self.level and s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        "Return whether the region has no level or has a transparent type."

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def prettyprint(self, indent=""):
        return self._prettyprint_nodes(
            "%sRegion: level=%d type=%s" % (indent, self.level, self.type), indent)

    def to_string(self, out):
        out.start_region(self.level, self.type)
        self._nodes_to_string(out)
        out.end_region(self.level, self.type)

class Block(Container):

    "A block in the page."

    def __init__(self, nodes, final=True):
        Container.__init__(self, nodes)

        # Whether this block is the last in a sequence of blocks.

        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):
        return self._prettyprint_nodes(
            "%sBlock: final=%s" % (indent, self.final), indent)

    def to_string(self, out):
        out.start_block(self.final)
        self._nodes_to_string(out)
        out.end_block(self.final)

class ListItem(Container):

    "A list item."

    def __repr__(self):
        return "ListItem(%r)" % self.nodes

    def prettyprint(self, indent=""):
        return self._prettyprint_nodes("%sListItem:" % indent, indent)

    def to_string(self, out):
        out.start_listitem()
        self._nodes_to_string(out)
        out.end_listitem()


class Text:

    "A text node."

    def __init__(self, s):
        self.s = s

    def empty(self):

        "Return whether the node holds no text."

        return not self.s

    def merge(self, text):

        "Add the content of the 'text' node to this node."

        self.s += text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def prettyprint(self, indent=""):
        return "%sText: %r" % (indent, self.s)

    def to_string(self, out):
        out.text(self.s)



# Serialisation.

class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        "Initialise the serialiser with 'out', a callable accepting strings."

        self.out = out

class MoinSerialiser(Serialiser):

    "Serialisation of the page."

    def start_region(self, level, type):
        out = self.out
        if level:
            out("{" * level)        # marker
        if type and level:
            out("#!%s\n" % type)    # header

    def end_region(self, level, type):
        out = self.out
        if level:
            out("}" * level)        # marker

    def start_block(self, final):
        pass

    def end_block(self, final):

        # Only non-final blocks are followed by a paragraph break.

        if not final:
            self.out("\n")

    def start_listitem(self):
        self.out(" *")

    def end_listitem(self):
        pass

    def text(self, s):
        self.out(s)

class HTMLSerialiser(Serialiser):

    "Serialisation of the page."

    def start_region(self, level, type):
        l = []
        out = l.append
        if level:
            out("level-%d" % level)                 # marker

        # NOTE: Encode type details for CSS.
        # The attribute below is delimited by single quotes, so apostrophes
        # must be escaped as well: cgi.escape only handles double quotes, and
        # the replacement has no effect where escape has already done this.

        if type:
            out("type-%s" % escape(type, True).replace("'", "&#x27;"))     # header

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):
        self.out("</span>")

    def start_block(self, final):
        self.out("<p>")

    def end_block(self, final):
        self.out("</p>")

    def start_listitem(self):
        self.out("<li>")

    def end_listitem(self):
        self.out("</li>")

    def text(self, s):
        self.out(escape(s))



# Tokenising functions.
class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):
        self.s = s
        self.pos = 0            # position from which searching starts
        self.match = None       # match object from the last search, if any
        self.matching = None    # name of the pattern that last matched

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None
        self.matching = None

        # Discard any earlier match so that read_match cannot rewind to it
        # when this search finds nothing.

        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self):

        """
        Return the matched text, updating the position in the stream. Where the
        pattern has no group, an empty string is returned. Where nothing was
        matched, the position moves to the end of the text and None is
        returned.
        """

        if self.match:
            _start, self.pos = self.match.span()
            try:
                return self.match.group(1)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0):

    """
    Parse the data provided by 'items' to populate a region at the given
    'level'.
    """

    region = Region([], level)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # A header only applies if found immediately at the current position.

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, ["break", "listitem", "regionstart", "regionend"])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'. Handlers
    end the parsing of the region by raising StopIteration.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_text(Text(preceding))

            # End of input: consume the remaining text so that an enclosing
            # region does not read (and duplicate) it again.

            if not items.matching:
                items.read_match()
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_text(Text(feature))

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    # Mark any previous block as not being the final one in a sequence.

    block = region.nodes[-1]
    block.final = False
    new_block(region)

def parse_listitem_end(items, region):

    "Handle the end of a list."

    raise StopIteration

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    # The item extends to the start of the next line or the end of the input.

    item = ListItem([])
    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.
    # Reading the match again yields the same marker, whose length is the
    # level of the new section.

    level = len(items.read_match())
    region.append(parse_region(items, level))
    new_block(region)

def parse_section_end(items, region):

    "Handle the end of a new section within 'region'."

    # Markers not ending this region are retained as text.

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_text(Text(feature))

# Pattern handlers.

handlers = {
    None : end_region,
    "break" : parse_break,
    "listitemend" : parse_listitem_end,
    "listitem" : parse_listitem,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    }

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.append(block)



# Top-level functions.

parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    """
    Return a string produced by serialising 'doc' using an instance of the
    given 'serialiser' class.
    """

    l = []
    doc.to_string(serialiser(l.append))
    return "".join(l)

# vim: tabstop=4 expandtab shiftwidth=4