MoinLight (file moinformat/__init__.py)

#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
                            Subscript, Superscript, TableCell, TableRow, Text, \
                            Underline
import re

# Regular expressions.
#
# This table maps pattern names to regular expression source text. The
# expressions are compiled further down into the 'patterns' table using
# re.UNICODE | re.MULTILINE, so "^" matches at the start of every line and not
# only at the start of the text.
#
# The parser functions read numbered groups from the matches, so the grouping
# in each expression is significant:
#
#   regionstart     group 2 is the leading whitespace (its length is the
#                   indent), group 3 is the run of braces (its length is the
#                   level)
#   regionend       group 1 is the run of closing braces
#   header          group 1 is the text after "#!" (stored as the region type)
#   heading         groups 1, 2, 3 are leading whitespace, the "=" run and the
#                   padding after it
#   headingend      groups 1, 2, 3 are the padding, the "=" run and the
#                   trailing whitespace with the newline
#   listitem*       groups 1, 2, 3 are the indent, the marker and the space
#                   after the marker
#   fontstyle, rule group 1 is the marker itself (only its length is used)
#   tableend        group 1 is the whitespace ending the row

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",                         # {{{...
    "regionend"     : r"^\s*([}]{3,})",                             # }}}...
    "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:
                      # blank line
    "break"         : r"^(\s*?)\n",
                      # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
                      # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
                      # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
                      # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
                      # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
                      # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
                      # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
                      # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",
                      # ||
    "tablerow"      : r"^\|\|",

    # Region contents:
    # Inline patterns:
    "fontstyle"     : r"('{2,6})",
    "larger"        : r"~\+",
    "monospace"     : r"`",
    "rule"          : r"(-----*)",                                  # ----...
    "smaller"       : r"~-",
    "sub"           : r",,",
    "super"         : r"\^",
    "underline"     : r"__",

    # Inline contents:
    # (each "...end" name is the closing marker for the inline pattern of the
    # same name without the suffix)
    "largerend"     : r"\+~",
    "monospaceend"  : r"`",
    "smallerend"    : r"-~",
    "subend"        : r",,",
    "superend"      : r"\^",
    "underlineend"  : r"__",

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",                          # ws... =... [ws...] nl

    # List contents:
    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    "listitemend"   : r"^",                                         # next line

    # Table contents:
    "tablecell"     : r"\|\|",
    "tableend"      : r"(\s*?)^",                                   # [ws...] next line
    }

# Define inline pattern details.

inline_pattern_names = ["fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline"]

def inline_patterns_for(name):

    """
    Return a copy of the inline pattern names in which 'name' is replaced by
    the name of its end pattern (for example, "larger" becomes "largerend").
    The result is the set of patterns to search for inside a span opened by
    'name'.

    Raise ValueError if 'name' is not one of the inline pattern names.
    """

    names = inline_pattern_names[:]
    names[names.index(name)] = "%send" % name
    return names

# Define patterns for the regular expressions.

# MULTILINE is required because many expressions anchor on "^" to recognise
# line-oriented features anywhere in the text.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records the current position in the string ('pos'), the most
    recent match object ('match') and the name of the pattern that produced it
    ('matching'). Searching with read_until does not move the position; only
    read_match and rewind do.
    """

    def __init__(self, s):

        "Initialise the stream at the start of the string 's'."

        self.s = s
        self.pos = 0
        self.match = None
        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string
        if 'length' exceeds the current position.
        """

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The name of the winning pattern is stored in 'matching' and its match
        object in 'match'. Both are None when nothing matched. Where several
        patterns match at the same position, the one listed first wins. The
        position in the stream is not changed.
        """

        first = None
        self.matching = None

        # Discard the match from any earlier search. Otherwise a later call to
        # read_match would return stale data instead of signalling the end of
        # the input.

        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()

                # Only a strictly earlier start replaces the current
                # candidate, giving priority to patterns listed first.

                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        If the pattern does not define the requested group, an empty string is
        returned. If the last search found nothing, the position is moved to
        the end of the string and None is returned.

        Calling this method repeatedly for the same match is harmless: the
        position is set to the end of the match each time.
        """

        if self.match:
            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    A header is only recognised if it starts exactly at the current position.
    """

    # An empty string means a header matched with nothing before it. None
    # means no header was found at all.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, inline_pattern_names + [
        "break", "heading",
        "defterm", "defterm_empty",
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        "regionstart", "regionend",
        "rule",
        "tablerow",
        ])

def parse_region_opaque(items, region):

    """
    Parse the data provided by 'items' to populate an opaque 'region'. Only the
    end of the region is searched for; everything else is kept as text.
    """

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text between features is added to 'region' as inline text. Each feature
    found is passed to the handler registered for its pattern name. Parsing
    stops when the input ends or when a handler raises StopIteration. On
    return, 'items.matching' names the pattern that ended the parsing, or is
    None if the input ended.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_inline(Text(preceding))

            # End of input.
            # NOTE(review): the stream position is not advanced here, so the
            # loop of an enclosing parse_region_details call will search this
            # same remaining text again using its own patterns - confirm that
            # this is intended.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_inline(Text(feature))

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    """
    End the parsing of 'region' by raising StopIteration, which is caught by
    the parse_region_details call that is populating the region.
    """

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    region.add(Break())
    new_block(region)

def parse_defitem(items, region, extra=""):

    """
    Handle a definition item within 'region'. The padding is taken from group
    1 of the current match and 'extra' is passed on to the item. The item
    extends to the start of the next line.
    """

    pad = items.read_match(1)
    item = DefItem([], pad, extra)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_defterm(items, region):

    """
    Handle a definition term within 'region'. If the term is ended by a
    separator rather than by the end of the line, the definition item
    following it is parsed as well.
    """

    pad = items.read_match(1)
    term = DefTerm([], pad)
    parse_region_details(items, term, ["deftermend", "deftermsep"])
    region.add(term)
    if items.matching == "deftermsep":
        parse_defitem(items, region)

def parse_defterm_empty(items, region):

    "Handle an empty definition term within 'region'."

    extra = items.read_match(1)

    # The pattern's lookahead guarantees that the separator follows directly,
    # so this only consumes the separator.

    parse_region_details(items, region, ["deftermsep"])
    parse_defitem(items, region, extra)

def parse_fontstyle(items, region):

    """
    Handle emphasis and strong styles.

    The number of quotes in the marker selects the styles: 2 for emphasis, 3
    for strong, 5 for both. Markers of 4 or 6 quotes produce an empty span.
    Inside an existing font style span, the marker first closes the styles it
    can, and any quotes left over either open a new span or are returned to
    the stream.
    """

    n = len(items.read_match(1))

    # Handle endings.

    if isinstance(region, FontStyle):
        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        active = True

        if region.emphasis and emphasis:
            active = region.close_emphasis()
            n -= 2
        if region.strong and strong:
            active = region.close_strong()
            n -= 3

        # With the span no longer active, give back any unused quotes so that
        # the enclosing region can interpret them, and end the span.

        if not active:
            if n:
                items.rewind(n)
            raise StopIteration

        # The span remains active and the whole marker has been used.

        elif not n:
            return

    # Handle new styles.

    emphasis = n in (2, 4, 5)
    strong = n in (3, 5, 6)
    double = n in (4, 6)

    span = FontStyle([], emphasis, strong)
    if not double:
        parse_region_details(items, span, inline_pattern_names)
    region.append_inline(span)

def parse_heading(items, region):

    "Handle a heading."

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"] + inline_pattern_names)
    region.add(heading)
    new_block(region)

def parse_heading_end(items, heading):

    """
    Handle the end of a heading. The heading is only ended if the marker has
    the same level as the heading.
    """

    # NOTE(review): a marker of a different level has already been consumed
    # at this point and is not added to the heading as text - confirm that
    # discarding it is intended.

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

def parse_listitem(items, region):

    """
    Handle a list item marker within 'region'. The item extends to the start
    of the next line.
    """

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_rule(items, region):

    "Handle a horizontal rule within 'region'."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.add(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.add(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region'. If the marker does not
    end 'region', it is added to the region as text.
    """

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_inline(Text(feature))

def parse_table_row(items, region):

    """
    Handle the start of a table row within 'region'.

    Cells are read until the row ends, either at the start of the next line or
    at the end of the input. A row whose final cell separator is followed only
    by whitespace is added to 'region' as a table row. Any other row (one
    without cells, or with text after the final separator) is added to
    'region' as plain text instead.
    """

    row = TableRow([])

    while True:
        cell = TableCell([])
        parse_region_details(items, cell, ["tablecell", "tableend"])

        # A separator completes the cell and another cell follows.

        if items.matching == "tablecell":
            row.append(cell)
            continue

        # Handle the end of the row. The end of the input also ends the row:
        # without this, a row on the last line of text lacking a newline would
        # never terminate, since nothing matches and the position would never
        # advance.

        at_end = items.matching is None

        # At the end of the input, read_match returns None and moves to the
        # end of the stream, consuming the text already added to the cell.

        trailing = items.read_match() or ""

        # If the cell was started but not finished, convert the row into text.

        if not row.nodes or not cell.empty():
            region.append_inline(Text("||"))

            # Convert all cells.

            for node in row.nodes:
                region.append_inline_many(node.nodes)
                region.append_inline(Text("||"))

            region.append_inline_many(cell.nodes)

            # There is no trailing text to restore at the end of the input.

            if not at_end:
                region.append_inline(Text(trailing))

            new_block(region)
            return

        # Otherwise the final cell is empty and is discarded.

        row.trailing = trailing
        break

    region.add(row)
    new_block(region)

# Inline formatting handlers.

def parse_inline(items, region, cls, pattern_name):

    """
    Handle an inline region by creating a span of type 'cls', populating it
    until the end pattern for 'pattern_name' is found, and adding it to
    'region'.
    """

    span = cls([])
    parse_region_details(items, span, inline_patterns_for(pattern_name))
    region.append_inline(span)

parse_larger = lambda items, region: parse_inline(items, region, Larger, "larger")
parse_monospace = lambda items, region: parse_inline(items, region, Monospace, "monospace")
parse_smaller = lambda items, region: parse_inline(items, region, Smaller, "smaller")
parse_sub = lambda items, region: parse_inline(items, region, Subscript, "sub")
parse_super = lambda items, region: parse_inline(items, region, Superscript, "super")
parse_underline = lambda items, region: parse_inline(items, region, Underline, "underline")

# Pattern handlers.

# Each pattern name maps to the function called when that pattern is found.
# The functions all accept the token stream and the region being populated.

# Patterns that simply end the region currently being parsed. None is included
# as the name applying when no pattern has matched.

handlers = dict.fromkeys([
    None,
    "deftermend",
    "deftermsep",
    "largerend",
    "listitemend",
    "monospaceend",
    "smallerend",
    "subend",
    "superend",
    "tablecell",
    "tableend",
    "underlineend",
    ], end_region)

# All kinds of list item marker share a single handler.

handlers.update(dict.fromkeys([
    "listitem",
    "listitem_alpha",
    "listitem_dot",
    "listitem_num",
    "listitem_roman",
    ], parse_listitem))

# Patterns with their own handlers.

handlers.update({
    "break" : parse_break,
    "defterm" : parse_defterm,
    "defterm_empty" : parse_defterm_empty,
    "fontstyle" : parse_fontstyle,
    "heading" : parse_heading,
    "headingend" : parse_heading_end,
    "larger" : parse_larger,
    "monospace" : parse_monospace,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    "rule" : parse_rule,
    "smaller" : parse_smaller,
    "sub" : parse_sub,
    "super" : parse_super,
    "tablerow" : parse_table_row,
    "underline" : parse_underline,
    })

def new_block(region):

    "Add a new, empty block to 'region' to receive subsequent inline content."

    region.add(Block([]))



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4
MoinLight

moinformat/__init__.py

moinformat/__init__.py