MoinLight

moinformat/__init__.py

23:3630b0946d2f
2017-05-01 Paul Boddie Added larger and smaller text formatting. Simplified inline pattern handling and document nodes somewhat.
     1 #!/usr/bin/env python     2      3 """     4 Moin wiki format parser.     5      6 Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \    23                             Larger, ListItem, Monospace, Region, Rule, Smaller, \    24                             Subscript, Superscript, Text, Underline    25 import re    26     27 # Regular expressions.    28     29 syntax = {    30     # Page regions:    31     "regionstart"   : r"((^\s*)([{]{3,}))",                         # {{{...    32     "regionend"     : r"^\s*([}]{3,})",                             # }}}...    33     "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl    34     35     # Region contents:    36     # Line-oriented patterns:    37                       # blank line    38     "break"         : r"^(\s*?)\n",    39                       # ws... expecting text ::    40     "defterm"       : r"^(\s+)(?=.+?::)",    41                       # ws... expecting :: ws...    42     "defterm_empty" : r"^(\s+)(?=::\s+)",    43                       # [ws...] =... ws... expecting headingend    44     "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",    45                       # ws... 
list-item [ws...]    46     "listitem"      : r"^(\s+)(\*)(\s*)",    47                       # ws... number-item ws...    48     "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",    49                       # ws... alpha-item ws...    50     "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",    51                       # ws... roman-item ws...    52     "listitem_roman": r"^(\s+)([iI]\.)(\s+)",    53                       # ws... dot-item [ws...]    54     "listitem_dot"  : r"^(\s+)(\.)(\s*)",    55     56     # Region contents:    57     # Inline patterns:    58     "fontstyle"     : r"('{2,6})",    59     "larger"        : r"~\+",    60     "monospace"     : r"`",    61     "rule"          : r"(-----*)",                                  # ----...    62     "smaller"       : r"~-",    63     "sub"           : r",,",    64     "super"         : r"\^",    65     "underline"     : r"__",    66     67     # Inline contents:    68     "largerend"     : r"\+~",    69     "monospaceend"  : r"`",    70     "smallerend"    : r"-~",    71     "subend"        : r",,",    72     "superend"      : r"\^",    73     "underlineend"  : r"__",    74     75     # Heading contents:    76     "headingend"    : r"(\s+)(=+)(\s*\n)",                          # ws... =... [ws...] nl    77     78     # List contents:    79     "deftermend"    : r"::(\s*?\n)",    80     "deftermsep"    : r"::(\s+)",    81     "listitemend"   : r"^",                                         # next line    82     }    83     84 # Define inline pattern details.    85     86 inline_pattern_names = ["fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline"]    87     88 def inline_patterns_for(name):    89     names = inline_pattern_names[:]    90     names[names.index(name)] = "%send" % name    91     return names    92     93 # Define patterns for the regular expressions.    

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream(object):

    """
    A stream of tokens taken from a string.

    The stream records the current position in the string ('pos'), the name of
    the pattern found by the most recent search ('matching') and the
    corresponding match object ('match').
    """

    def __init__(self, s):

        "Initialise the stream to read from the start of the string 's'."

        self.s = s
        self.pos = 0
        self.match = None
        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', never moving before the start of the
        string.
        """

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The position in the stream is not changed: read_match consumes the
        match found here. Where several patterns match at the same position,
        the one appearing earliest in 'pattern_names' is chosen.
        """

        first = None
        self.matching = None

        # Discard any earlier match so that an unsuccessful search cannot leave
        # a stale match object behind for read_match to consume.

        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()

                # The strict comparison keeps the earlier-listed pattern on a tie.

                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the matching pattern does not define the
        requested group. If there is no current match, the position is moved to
        the end of the string and None is returned.
        """

        if self.match:

            # Reading is repeatable: the position is always set to the end of
            # the current match, whichever group is requested.

            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.
# Handlers signal the end of the structure being populated by raising
# StopIteration, which parse_region_details catches.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    A header is only recognised if it occurs at the current position, in which
    case it is consumed and its content is stored as the region type.
    """

    # An empty result means a header starts exactly at the current position;
    # None means no header was found at all.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, inline_pattern_names + [
        "break", "heading",
        "defterm", "defterm_empty",
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        "regionstart", "regionend",
        "rule",
        ])

def parse_region_opaque(items, region):

    """
    Parse the data provided by 'items' to populate an opaque 'region', within
    which only region end markers are sought.
    """

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text found between features is added to 'region' as inline text, and each
    feature is passed to the handler registered for its pattern. Parsing ends
    at the end of the input or when a handler raises StopIteration, after which
    'region' is normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_inline(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_inline(Text(feature))

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region' by raising StopIteration."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region', starting a new block after it."

    region.add(Break())
    new_block(region)

def parse_defitem(items, region, extra=""):

    """
    Handle a definition item within 'region', using the padding from the
    current match and any 'extra' text given. The item extends to the start of
    the next line.
    """

    pad = items.read_match(1)
    item = DefItem([], pad, extra)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_defterm(items, region):

    """
    Handle a definition term within 'region'. If the term is ended by a
    separator rather than by the end of the line, the definition item following
    the separator is also parsed.
    """

    pad = items.read_match(1)
    term = DefTerm([], pad)
    parse_region_details(items, term, ["deftermend", "deftermsep"])
    region.add(term)
    if items.matching == "deftermsep":
        parse_defitem(items, region)

def parse_defterm_empty(items, region):

    """
    Handle an empty definition term within 'region': the leading space is
    passed on as extra text to the definition item following the separator.
    """

    extra = items.read_match(1)

    # Advance to the separator so that its match supplies the item padding.

    parse_region_details(items, region, ["deftermsep"])
    parse_defitem(items, region, extra)

def parse_fontstyle(items, region):

    """
    Handle emphasis and strong styles, where the number of quote characters in
    the current match selects the styles: 2 for emphasis, 3 for strong, 5 for
    both, with 4 and 6 treated as doubled markers.

    Within an existing font style span, the marker first closes any matching
    active styles. StopIteration is raised when the span is no longer active.
    """

    n = len(items.read_match(1))

    # Handle endings.

    if isinstance(region, FontStyle):
        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        active = True

        if region.emphasis and emphasis:
            active = region.close_emphasis()
            n -= 2
        if region.strong and strong:
            active = region.close_strong()
            n -= 3

        if not active:

            # Return any unused quote characters to the stream for the
            # enclosing structure to interpret.

            if n:
                items.rewind(n)
            raise StopIteration

        elif not n:
            return

    # Handle new styles.

    emphasis = n in (2, 4, 5)
    strong = n in (3, 5, 6)
    double = n in (4, 6)

    span = FontStyle([], emphasis, strong)

    # Doubled markers produce a span without any content.

    if not double:
        parse_region_details(items, span, inline_pattern_names)
    region.append_inline(span)

def parse_heading(items, region):

    """
    Handle a heading within 'region', whose level is the number of "="
    characters in the opening marker, starting a new block after it.
    """

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"] + inline_pattern_names)
    region.add(heading)
    new_block(region)

def parse_heading_end(items, heading):

    """
    Handle the end of a heading. A terminator with the same level as 'heading'
    ends the heading by raising StopIteration; any other terminator is kept as
    text within the heading.
    """

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

    # Preserve a terminator of a different level instead of discarding it, in
    # the same way that unmatched region end markers are preserved.

    else:
        heading.append_inline(Text(items.read_match(0)))

def parse_listitem(items, region):

    """
    Handle a list item marker within 'region'. The item extends to the start of
    the next line, and a new block is started after it.
    """

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_rule(items, region):

    """
    Handle a horizontal rule within 'region', recording the number of
    characters in the rule, and start a new block after it.
    """

    length = len(items.read_match(1))
    rule = Rule(length)
    region.add(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.add(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region'. If the marker ends
    'region', StopIteration is raised; otherwise, the marker is added to
    'region' as text.
    """

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_inline(Text(feature))

# Inline formatting handlers.

def parse_inline(items, region, cls, pattern_name):

    """
    Handle an inline region of the given 'cls' within 'region', searching for
    the end pattern corresponding to 'pattern_name' together with the other
    inline patterns.
    """

    span = cls([])
    parse_region_details(items, span, inline_patterns_for(pattern_name))
    region.append_inline(span)

def parse_larger(items, region):

    "Handle larger text within 'region'."

    parse_inline(items, region, Larger, "larger")

def parse_monospace(items, region):

    "Handle monospaced text within 'region'."

    parse_inline(items, region, Monospace, "monospace")

def parse_smaller(items, region):

    "Handle smaller text within 'region'."

    parse_inline(items, region, Smaller, "smaller")

def parse_sub(items, region):

    "Handle subscript text within 'region'."

    parse_inline(items, region, Subscript, "sub")

def parse_super(items, region):

    "Handle superscript text within 'region'."

    parse_inline(items, region, Superscript, "super")

def parse_underline(items, region):

    "Handle underlined text within 'region'."

    parse_inline(items, region, Underline, "underline")

# Pattern handlers.

# Each pattern name is mapped to the function invoked by parse_region_details
# when that pattern is the next feature found in the input.

handlers = {

    # Patterns ending the structure currently being populated.

    None : end_region,
    "deftermend" : end_region,
    "deftermsep" : end_region,
    "largerend" : end_region,
    "listitemend" : end_region,
    "monospaceend" : end_region,
    "smallerend" : end_region,
    "subend" : end_region,
    "superend" : end_region,
    "underlineend" : end_region,

    # Patterns ending a structure only under certain conditions.

    "headingend" : parse_heading_end,
    "regionend" : parse_section_end,

    # Line-oriented and region patterns.

    "break" : parse_break,
    "defterm" : parse_defterm,
    "defterm_empty" : parse_defterm_empty,
    "heading" : parse_heading,
    "listitem" : parse_listitem,
    "listitem_alpha" : parse_listitem,
    "listitem_dot" : parse_listitem,
    "listitem_num" : parse_listitem,
    "listitem_roman" : parse_listitem,
    "regionstart" : parse_section,
    "rule" : parse_rule,

    # Inline patterns.

    "fontstyle" : parse_fontstyle,
    "larger" : parse_larger,
    "monospace" : parse_monospace,
    "smaller" : parse_smaller,
    "sub" : parse_sub,
    "super" : parse_super,
    "underline" : parse_underline,
    }

def new_block(region):

    "Add a new, empty block to 'region'."

    region.add(Block([]))



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4