MoinLight (file moinformat/__init__.py)

     1 #!/usr/bin/env python     2      3 """     4 Moin wiki format parser.     5      6 Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \    23                             ListItem, Monospace, Region, Rule, Subscript, \    24                             Superscript, Text, Underline    25 import re    26     27 # Regular expressions.    28     29 syntax = {    30     # Page regions:    31     "regionstart"   : r"((^\s*)([{]{3,}))",                         # {{{...    32     "regionend"     : r"^\s*([}]{3,})",                             # }}}...    33     "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl    34     35     # Region contents:    36     # Line-oriented patterns:    37                       # blank line    38     "break"         : r"^(\s*?)\n",    39                       # ws... expecting text ::    40     "defterm"       : r"^(\s+)(?=.+?::)",    41                       # ws... expecting :: ws...    42     "defterm_empty" : r"^(\s+)(?=::\s+)",    43                       # [ws...] =... ws... expecting headingend    44     "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",    45                       # ws... 
list-item [ws...]    46     "listitem"      : r"^(\s+)(\*)(\s*)",    47                       # ws... number-item ws...    48     "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",    49                       # ws... alpha-item ws...    50     "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",    51                       # ws... roman-item ws...    52     "listitem_roman": r"^(\s+)([iI]\.)(\s+)",    53                       # ws... dot-item [ws...]    54     "listitem_dot"  : r"^(\s+)(\.)(\s*)",    55     56     # Region contents:    57     # Inline patterns:    58     "fontstyle"     : r"('{2,6})",    59     "monospace"     : r"`",    60     "rule"          : r"(-----*)",                                  # ----...    61     "sub"           : r",,",    62     "super"         : r"\^",    63     "underline"     : r"__",    64     65     # Inline contents:    66     "monospaceend"  : r"`",    67     "subend"        : r",,",    68     "superend"      : r"\^",    69     "underlineend"  : r"__",    70     71     # Heading contents:    72     "headingend"    : r"(\s+)(=+)(\s*\n)",                          # ws... =... [ws...] nl    73     74     # List contents:    75     "deftermend"    : r"::(\s*?\n)",    76     "deftermsep"    : r"::(\s+)",    77     "listitemend"   : r"^",                                         # next line    78     }    79     80 # Define patterns for the regular expressions.    81     82 patterns = {}    83 for name, value in syntax.items():    84     patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)    85     86 inline_pattern_names = ["fontstyle", "monospace", "sub", "super", "underline"]    87     88 def inline_patterns_for(name):    89     names = inline_pattern_names[:]    90     names[names.index(name)] = "%send" % name    91     return names    92     93     94     95 # Tokenising functions.    96     97 class TokenStream:    98     99     "A stream of tokens taken from a string."   
100    101     def __init__(self, s):   102         self.s = s   103         self.pos = 0   104         self.match = None   105         self.matching = None   106    107     def rewind(self, length):   108    109         "Rewind in the string by 'length'."   110    111         self.pos -= min(length, self.pos)   112    113     def read_until(self, pattern_names, remaining=True):   114    115         """   116         Find the first match for the given 'pattern_names'. Return the text   117         preceding any match, the remaining text if no match was found, or None   118         if no match was found and 'remaining' is given as a false value.   119         """   120    121         first = None   122         self.matching = None   123    124         # Find the first matching pattern.   125    126         for pattern_name in pattern_names:   127             match = patterns[pattern_name].search(self.s, self.pos)   128             if match:   129                 start, end = match.span()   130                 if self.matching is None or start < first:   131                     first = start   132                     self.matching = pattern_name   133                     self.match = match   134    135         if self.matching is None:   136             if remaining:   137                 return self.s[self.pos:]   138             else:   139                 return None   140         else:   141             return self.s[self.pos:first]   142    143     def read_match(self, group=1):   144    145         """   146         Return the matched text, updating the position in the stream. If 'group'   147         is specified, the indicated group in a match will be returned.   148         Typically, group 1 should contain all pertinent data, but groups defined   149         within group 1 can provide sections of the data.   
150         """   151    152         if self.match:   153             _start, self.pos = self.match.span()   154             try:   155                 return self.match.group(group)   156             except IndexError:   157                 return ""   158         else:   159             self.pos = len(self.s)   160             return None   161    162    163    164 # Parser functions.   165    166 def parse_page(s):   167    168     """   169     Parse page text 's'. Pages consist of regions delimited by markers.   170     """   171    172     return parse_region(TokenStream(s))   173    174 def parse_region(items, level=0, indent=0):   175    176     """   177     Parse the data provided by 'items' to populate a region with the given   178     'level' at the given 'indent'.   179     """   180    181     region = Region([], level, indent)   182    183     # Parse section headers.   184    185     parse_region_header(items, region)   186    187     # Parse section body.   188    189     if region.is_transparent():   190         parse_region_wiki(items, region)   191     else:   192         parse_region_opaque(items, region)   193    194     return region   195    196 def parse_region_header(items, region):   197    198     """   199     Parse the region header from the 'items', setting it for the given 'region'.   200     """   201    202     if items.read_until(["header"], False) == "": # None means no header   203         region.type = items.read_match()   204    205 def parse_region_wiki(items, region):   206    207     "Parse the data provided by 'items' to populate a wiki 'region'."   
208    209     new_block(region)   210     parse_region_details(items, region, inline_pattern_names + [   211         "break", "heading",   212         "defterm", "defterm_empty",   213         "listitem", "listitem_alpha", "listitem_dot", "listitem_num",   214         "listitem_roman",   215         "regionstart", "regionend",   216         "rule",   217         ])   218    219 def parse_region_opaque(items, region):   220    221     "Parse the data provided by 'items' to populate an opaque 'region'."   222    223     parse_region_details(items, region, ["regionend"])   224    225 def parse_region_details(items, region, pattern_names):   226    227     "Parse 'items' within 'region' searching using 'pattern_names'."   228    229     try:   230         while True:   231    232             # Obtain text before any marker or the end of the input.   233    234             preceding = items.read_until(pattern_names)   235             if preceding:   236                 region.append_inline(Text(preceding))   237    238             # End of input.   239    240             if not items.matching:   241                 break   242    243             # Obtain any feature.   244    245             feature = items.read_match()   246             handler = handlers.get(items.matching)   247    248             # Handle each feature or add text to the region.   249    250             if handler:   251                 handler(items, region)   252             else:   253                 region.append_inline(Text(feature))   254    255     except StopIteration:   256         pass   257    258     region.normalise()   259    260 def end_region(items, region):   261    262     "End the parsing of 'region'."   263    264     raise StopIteration   265    266 def parse_break(items, region):   267    268     "Handle a paragraph break within 'region'."   
269    270     region.add(Break())   271     new_block(region)   272    273 def parse_defitem(items, region, extra=""):   274    275     "Handle a definition item within 'region'."   276    277     pad = items.read_match(1)   278     item = DefItem([], pad, extra)   279     parse_region_details(items, item, ["listitemend"])   280     region.add(item)   281     new_block(region)   282    283 def parse_defterm(items, region):   284    285     "Handle a definition term within 'region'."   286    287     pad = items.read_match(1)   288     term = DefTerm([], pad)   289     parse_region_details(items, term, ["deftermend", "deftermsep"])   290     region.add(term)   291     if items.matching == "deftermsep":   292         parse_defitem(items, region)   293    294 def parse_defterm_empty(items, region):   295    296     "Handle an empty definition term within 'region'."   297    298     extra = items.read_match(1)   299     parse_region_details(items, region, ["deftermsep"])   300     parse_defitem(items, region, extra)   301    302 def parse_fontstyle(items, region):   303    304     "Handle emphasis and strong styles."   305    306     n = len(items.read_match(1))   307    308     # Handle endings.   309    310     if isinstance(region, FontStyle):   311         emphasis = n in (2, 4, 5)   312         strong = n in (3, 5, 6)   313         active = True   314    315         if region.emphasis and emphasis:   316             active = region.close_emphasis()   317             n -= 2   318         if region.strong and strong:   319             active = region.close_strong()   320             n -= 3   321    322         if not active:   323             if n:   324                 items.rewind(n)   325             raise StopIteration   326    327         elif not n:   328             return   329    330     # Handle new styles.   
331    332     emphasis = n in (2, 4, 5)   333     strong = n in (3, 5, 6)   334     double = n in (4, 6)   335    336     span = FontStyle([], emphasis, strong)   337     if not double:   338         parse_region_details(items, span, inline_pattern_names)   339     region.append_inline(span)   340    341 def parse_heading(items, region):   342    343     "Handle a heading."   344    345     start_extra = items.read_match(1)   346     level = len(items.read_match(2))   347     start_pad = items.read_match(3)   348     heading = Heading([], level, start_extra, start_pad)   349     parse_region_details(items, heading, ["headingend"] + inline_pattern_names)   350     region.add(heading)   351     new_block(region)   352    353 def parse_heading_end(items, heading):   354    355     "Handle the end of a heading."   356    357     level = len(items.read_match(2))   358     if heading.level == level:   359         heading.end_pad = items.read_match(1)   360         heading.end_extra = items.read_match(3)   361         raise StopIteration   362    363 def parse_listitem(items, region):   364    365     "Handle a list item marker within 'region'."   366    367     indent = len(items.read_match(1))   368     marker = items.read_match(2)   369     space = items.read_match(3)   370     item = ListItem([], indent, marker, space)   371     parse_region_details(items, item, ["listitemend"])   372     region.add(item)   373     new_block(region)   374    375 def parse_monospace(items, region):   376    377     "Handle monospace."   378    379     span = Monospace([])   380     parse_region_details(items, span, inline_patterns_for("monospace"))   381     region.append_inline(span)   382    383 def parse_rule(items, region):   384    385     "Handle a horizontal rule within 'region'."   
386    387     length = len(items.read_match(1))   388     rule = Rule(length)   389     region.add(rule)   390     new_block(region)   391    392 def parse_section(items, region):   393    394     "Handle the start of a new section within 'region'."   395    396     # Parse the section and start a new block after the section.   397    398     indent = len(items.read_match(2))   399     level = len(items.read_match(3))   400     region.add(parse_region(items, level, indent))   401     new_block(region)   402    403 def parse_section_end(items, region):   404    405     "Handle the end of a new section within 'region'."   406    407     feature = items.read_match()   408     if region.have_end(feature):   409         raise StopIteration   410     else:   411         region.append_inline(Text(feature))   412    413 def parse_sub(items, region):   414    415     "Handle subscript."   416    417     span = Subscript([])   418     parse_region_details(items, span, inline_patterns_for("sub"))   419     region.append_inline(span)   420    421 def parse_super(items, region):   422    423     "Handle superscript."   424    425     span = Superscript([])   426     parse_region_details(items, span, inline_patterns_for("super"))   427     region.append_inline(span)   428    429 def parse_underline(items, region):   430    431     "Handle underline."   432    433     span = Underline([])   434     parse_region_details(items, span, inline_patterns_for("underline"))   435     region.append_inline(span)   436    437 # Pattern handlers.   
438    439 handlers = {   440     None : end_region,   441     "break" : parse_break,   442     "defterm" : parse_defterm,   443     "defterm_empty" : parse_defterm_empty,   444     "deftermend" : end_region,   445     "deftermsep" : end_region,   446     "fontstyle" : parse_fontstyle,   447     "heading" : parse_heading,   448     "headingend" : parse_heading_end,   449     "listitemend" : end_region,   450     "listitem" : parse_listitem,   451     "listitem_alpha" : parse_listitem,   452     "listitem_dot" : parse_listitem,   453     "listitem_num" : parse_listitem,   454     "listitem_roman" : parse_listitem,   455     "monospace" : parse_monospace,   456     "monospaceend" : end_region,   457     "regionstart" : parse_section,   458     "regionend" : parse_section_end,   459     "rule" : parse_rule,   460     "sub" : parse_sub,   461     "subend" : end_region,   462     "super" : parse_super,   463     "superend" : end_region,   464     "underline" : parse_underline,   465     "underlineend" : end_region,   466     }   467    468 def new_block(region):   469    470     "Start a new block in 'region'."   471    472     block = Block([])   473     region.add(block)   474    475    476    477 # Top-level functions.   478    479 parse = parse_page   480    481 # vim: tabstop=4 expandtab shiftwidth=4
MoinLight

moinformat/__init__.py

moinformat/__init__.py