ConfluenceConverter

wikiparser.py

96:a95675d52731
2013-07-16 Paul Boddie Added a patch against Moin 1.9 to associate author details with page revisions.
     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then split into regions.    32 """    33     34 from common import *    35 import re    36 import sys    37 import codecs    38 import operator    39     40 # Section extraction.    

# A single pattern recognising, in order: section/macro markers such as
# {code:java} or {code}; the cell separator starting a table row; the cell
# separator ending a table row; and a complete list item line.

sections_regexp_str = (
    r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}"
    r"|"
    r"^(?P<rowstart>[|]{1,2})"
    r"|"
    r"(?P<rowend>[|]{1,2}(\n|$))"
    r"|"
    r"^(?P<listitem>\s*[*#-]+\s+.*?([^|](\n|$)|(?=[|](\n|$))))"
    )

sections_regexp = re.compile(sections_regexp_str, re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is the result of calling
    get_section_details on the region text: a tuple of the form
    ((sectiontype, options), text) for a recognised section, or (None, text)
    for any other region. Empty regions are omitted.
    """

    last = 0
    regions = [""]
    depth = 0
    had_row = False
    had_item = False

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")
        is_item = match.group("listitem")

        # The start of a region is either indicated by a marker with options or
        # by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as a
            # "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # A new row may either continue a table region or start a new
                # table region.

                elif is_row:
                    if had_row and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()
                    else:
                        regions.append(s[start:end])

                # A list item may either continue a list region or start a new
                # list region.

                elif is_item:

                    # If continuing a list, merge the list regions and start a
                    # new potentially separate region.

                    if had_item and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions[-1] = ""

                    # If not continuing a list, make a region for a new list
                    # and start a new potentially separate region.

                    else:
                        regions.append(s[start:end])
                        regions.append("")

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section or is_row:
                depth += 1

        # The end of a region is indicated by a marker with no options or the
        # end of a row. This branch is only reached while a region is active
        # (depth is non-zero), so the end marker and the preceding text always
        # belong to the current region.

        else:
            regions[-1] += s[last:end]

            # Terminate the outermost active region when its own kind of end
            # marker is seen; nested or unrelated markers merely accumulate.

            if depth == 1 and (is_section or is_row):
                regions.append("")

            if is_section or is_row:
                depth -= 1

        had_row = is_row
        had_item = is_item
        last = end

    # Where a region is still active, terminate it.

    regions[-1] += s[last:]

    return [get_section_details(region) for region in regions if region]

def is_section_marker(sectiontype):

    """
    Return whether 'sectiontype' names a marker that opens and closes a region:
    either a key of the module-level 'sectiontypes' mapping or "color".
    """

    return sectiontype in sectiontypes or sectiontype == "color"

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form ((type, options), text)
    where 's' starts with a section marker and contains a matching closing
    marker, or (None, s) otherwise. The options are None where the opening
    marker has none.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (blocktype, text) tuples. Text between (and around) the recognised
    elements is included with a blocktype of None, even where it is empty.
    The blocktype is "list", "table", or the heading/blockquote marker (such as
    "h1" or "bq"), in which case the text is that following the marker.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))

        # Only headings and blockquotes define a "text" group; lists and
        # tables (and markers with no text) yield the entire matched text.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate. Each block is a
    (blocktype, text) tuple, with a blocktype of None for anonymous blocks.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Content inspection.

# NOTE: The opening bracket is written as \[ instead of the equivalent [[]
# NOTE: since the latter is reported as a possible nested set by newer
# NOTE: versions of the re module.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str      = r"(?<!\\)\[(?P<linktext>.*?)\]"
image_regexp_str     = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str     = r"{(?P<macro>.*?)(?::(?P<options>.*?))?}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str    = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str      = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str       = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str       = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + macro_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content is matched before cell separators so that separators inside
# links, images and other inline markup do not split a cell.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text',
    where cellsep is the first cell separator found in a row ("|" or "||") and
    columns is a list of the untranslated cell texts. Rows without any cell
    separator are omitted, and processing stops at the first empty row.
    """

    rows = []

    for row in text.split("|\n"):
        if not row:
            break

        # Restore the separator removed from the end of the row by the split.

        row += "|"
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            # A separator starts a new column; the first one seen defines the
            # kind of row.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Inline content is retained verbatim within the current column.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        # Discard the text before the first separator and after the last one.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# Notation conversion.

# Each mapping is a list of (Confluence notation, replacement) pairs applied
# in order using plain string replacement.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    (r"\[", "<<Verbatim([)>>"),
    (r"\]", "<<Verbatim(])>>"),
    (r"\*", "*"),
    ]

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.
384    385 markers = {   386     "*" : "*",   387     "#" : "1.",   388     "-" : "*",   389     }   390    391 cellseps = {   392     "|" : "\n|| ",   393     "||" : "\n|| ",   394     }   395    396 cellextra = {   397     "|" : "",   398     "||" : "'''",   399     }   400    401 sectiontypes = {   402     "code"      : "",   403     "excerpt"   : "#!wiki",   404     "noformat"  : "",   405     "quote"     : "",   406     "info"      : "#!wiki important",   407     "note"      : "#!wiki caution",   408     "tip"       : "#!wiki tip",   409     "warning"   : "#!wiki warning",   410     }   411    412 preformatted_sectiontypes = (None, "noformat")   413    414 macroargs = {   415     "color"     : "col",   416     }   417    418 macrotypes = {   419     "anchor"    : "<<Anchor(%(args)s)>>",   420     "color"     : "<<Color2(%(content)s, %(args)s)>>",   421     "toc"       : "<<TableOfContents>>",   422     }   423    424 class ConfluenceParser:   425    426     "A parser for Confluence markup."   427    428     def __init__(self):   429         self.max_level = self.level = 0   430         self.in_heading = False   431         self.held_anchors = []   432         self.macro = None   433         self.sections = []   434    435     def translate_marker(self, marker):   436    437         "Translate the given 'marker' to a suitable Moin representation."   438    439         return " " * len(marker) + markers[marker[-1]]   440    441     def translate_cellsep(self, cellsep):   442    443         "Translate the given 'cellsep' to a suitable Moin representation."   444    445         return cellseps[cellsep]   446    447     def translate_cell(self, cellsep, text):   448    449         "Using 'cellsep', translate the cell 'text'."   
450    451         return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]   452    453     def translate_content_match(self, match):   454    455         "Translate the content described by the given 'match', returning a string."   456    457         if match.group("monotext"):   458             self.enter_section(); self.leave_section()   459             return "{{{%s}}}" % match.group("monotext")   460    461         elif match.group("linktext"):   462             parts = match.group("linktext").split("|")   463    464             # NOTE: Proper detection of external links required.   465    466             if len(parts) == 1:   467                 label, target, title = None, parts[0], None   468             elif len(parts) == 2:   469                 (label, target), title = parts, None   470             else:   471                 label, target, title = parts   472    473             target = target.strip()   474    475             # Look for namespace links and rewrite them.   476    477             if target.find(":") != -1:   478                 prefix = ""   479                 space, rest = target.split(":", 1)   480                 if space not in URL_SCHEMES:   481                     rest = get_page_title(rest)   482                     target = "%s/%s" % (space, rest)   483    484             # Detect anchors.   485    486             elif target.startswith("#"):   487                 prefix = ""   488    489             # Detect attachments.   490    491             elif target.startswith("^"):   492                 prefix = "attachment:"   493    494             # Link to other pages within a space.   495    496             else:   497                 prefix = "../"   498    499                 # Make the link tidier by making a target if none was given.   
500    501                 if not label:   502                     label = target   503    504                 target = get_page_title(target)   505    506             if not label and not title:   507                 return "[[%s%s]]" % (prefix, target)   508             elif not title:   509                 return "[[%s%s|%s]]" % (prefix, target, label)   510             else:   511                 return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)   512    513         elif match.group("imagetext"):   514             parts = match.group("imagetext").split("|")   515    516             # NOTE: Proper detection of external links required.   517    518             if parts[0].startswith("http"):   519                 prefix = ""   520             else:   521                 prefix = "attachment:"   522    523             # NOTE: Proper options conversion required.   524    525             if len(parts) == 1:   526                 return "{{%s%s}}" % (prefix, parts[0])   527             else:   528                 return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])   529    530         elif match.group("macro"):   531             macro_name = match.group("macro")   532             if macrotypes.has_key(macro_name):   533                 argname = macroargs.get(macro_name)   534                 result = macrotypes[macro_name] % {   535                     "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + (match.group("options") or ""))   536                     }   537                 if not self.forbids_macros():   538                     return result   539                 if macro_name == "anchor":   540                     self.held_anchors.append(result)   541             return ""   542    543         elif match.group("italictext"):   544             return "''%s''" % self.translate_content(match.group("italictext"))   545    546         elif match.group("boldtext"):   547             return "'''%s'''" % 
self.translate_content(match.group("boldtext"))   548    549         elif match.group("deltext"):   550             return "--(%s)--" % self.translate_content(match.group("deltext"))   551    552         elif match.group("underlinetext"):   553             return "__%s__" % self.translate_content(match.group("underlinetext"))   554    555         elif match.group("subtext"):   556             return ",,%s,," % self.translate_content(match.group("subtext"))   557    558         else:   559             return self.translate_text(match.group())   560    561     def translate_text(self, s, preformatted=False):   562    563         "Translate the plain text string 's', converting notation."   564    565         for before, after in preformatted and preformatted_notation_mapping or notation_mapping:   566             s = s.replace(before, after)   567         return s   568    569     def translate_content(self, text):   570    571         """   572         Return a translation of the given 'text'. If the optional 'sectiontype' is   573         specified, the translation may be modified to a form appropriate to the   574         section being translated.   575         """   576    577         parts = []   578         preformatted = self.is_preformatted()   579    580         last = 0   581         for match in content_regexp.finditer(text):   582             start, end = match.span()   583             parts.append(self.translate_text(text[last:start], preformatted))   584    585             # Handle unformatted sections.   
586    587             if self.sections and self.sections[-1] in ("code", "noformat"):   588                 parts.append(match.group())   589             else:   590                 parts.append(self.translate_content_match(match))   591    592             last = end   593    594         parts.append(self.translate_text(text[last:], preformatted))   595         return "".join(parts)   596    597     def is_preformatted(self):   598         return reduce(operator.or_, [x in preformatted_sectiontypes for x in self.sections], False)   599    600     def translate_block(self, blocktype, blocktext):   601    602         "Translate the block with the given 'blocktype' and 'blocktext'."   603    604         if blocktype in headings:   605             self.in_heading = True   606             self.held_anchors = []   607    608         parts = []   609    610         # Translate headings and blockquotes.   611    612         if blocktypes.has_key(blocktype):   613             text = self.parse_text(blocktext)   614             for anchor in self.held_anchors:   615                 parts.append(anchor)   616             parts.append(blocktypes[blocktype] % text)   617    618         # Translate list items.   619    620         elif blocktype == "list":   621             for listmarker, listitem in get_list_items(blocktext):   622                 parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))   623    624         # Translate table items.   625    626         elif blocktype == "table":   627    628             # Enter the table.   
629    630             self.enter_section("table")   631    632             table_parts = []   633             first = True   634    635             for cellsep, columns in get_table_rows(blocktext):   636                 if not first:   637                     table_parts.append("==")   638                 else:   639                     first = False   640                 moinsep = self.translate_cellsep(cellsep)   641                 table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))   642    643             # Nest the section appropriately.   644    645             opening, closing = self.nest_section()   646    647             parts.append("%s#!table" % opening)   648             parts += table_parts   649             parts.append(closing)   650    651             # Leave the table.   652    653             self.leave_section()   654    655         # Handle anonymous blocks.   656    657         else:   658             parts.append(self.parse_text(blocktext))   659    660         if blocktype in headings:   661             self.in_heading = False   662    663         return "\n".join(parts)   664    665     def translate_section(self, sectiontype, options, text):   666    667         """   668         Translate the section with the given 'sectiontype', 'options' and   669         'text'.   670         """   671    672         parts = []   673    674         # Enter the section.   675    676         self.enter_section(sectiontype)   677    678         # Sections can contain other sections.   679    680         if sectiontype == "noformat":   681             section_content = self.translate_content(text.strip("\n"))   682         else:   683             section_content = self.parse_text(text.strip())   684    685         # Nest the section appropriately.   
686    687         opening, closing = self.nest_section()   688         mointype = sectiontypes.get(sectiontype)   689    690         parts.append("%s%s\n" % (opening, mointype or ""))   691         parts.append(section_content)   692         parts.append("\n%s\n" % closing)   693    694         # Leave the section.   695    696         self.leave_section()   697    698         return parts   699    700     def enter_section(self, sectiontype=None):   701         self.level += 1   702         self.max_level = max(self.level, self.max_level)   703         self.sections.append(sectiontype)   704    705     def leave_section(self):   706         self.level -= 1   707         if not self.level:   708             self.max_level = 0   709         self.sections.pop()   710    711     def nest_section(self):   712         level = 3 + self.max_level - self.level   713         opening = "{" * level   714         closing = "}" * level   715         return opening, closing   716    717     # General parsing.   718    719     def parse_text(self, s, top=False):   720    721         "Parse the content in the string 's', returning the translation."   722    723         parts = []   724    725         # Control spacing between blocks and other blocks or sections.   726    727         preceded_by_block = False   728    729         for type, text in get_regions(s):   730    731             # Handle list, heading, blockquote or anonymous blocks.   732    733             if type is None:   734    735                 # Where the region is the same as the provided text, return   736                 # immediately. This is the base case of the recursive parsing   737                 # process.   738    739                 if text == s and not top:   740                     return self.translate_content(text)   741    742                 # Otherwise, obtain and translate the blocks.   
743    744                 if preceded_by_block:   745                     parts.append("\n")   746    747                 first = True   748                 for blocktype, blocktext in get_blocks(text):   749                     if not first:   750                         parts.append("\n")   751                     else:   752                         first = False   753                     parts.append("%s" % self.translate_block(blocktype, blocktext))   754    755                 if not first:   756                     preceded_by_block = True   757    758             # Handle sections.   759    760             else:   761                 sectiontype, options = type   762    763                 # Direct translations of sections.   764    765                 if sectiontypes.has_key(sectiontype):   766                     if preceded_by_block:   767                         parts.append("\n")   768    769                     parts += self.translate_section(sectiontype, options, text)   770                     preceded_by_block = True   771    772                 # Translations of macros acting as sections.   773    774                 elif macrotypes.has_key(sectiontype):   775    776                     # Prevent the production of macros in places they would   777                     # produce illegal Moin syntax.   
778    779                     if not self.forbids_macros():   780                         self.macro = sectiontype   781                         argname = macroargs.get(sectiontype)   782                         parts.append(macrotypes[sectiontype] % {   783                             "content"   : quote_macro_argument(self.parse_text(text)),   784                             "args"      : quote_macro_argument((argname and ("%s=" % argname) or "") + options)   785                             })   786                         self.macro = None   787    788                     # Include the contents of section-based macros where the   789                     # macros themselves are not allowed.   790    791                     else:   792                         parts.append(self.translate_content(text))   793    794                     preceded_by_block = False   795    796                 # Unrecognised sections.   797    798                 else:   799                     parts += self.translate_section(sectiontype, None, text)   800                     preceded_by_block = False   801    802         return "".join(parts)   803    804     def forbids_macros(self):   805         return self.in_heading or self.macro   806    807 def parse(s, out):   808    809     "Parse the content in the string 's', writing a translation to 'out'."   810    811     parser = ConfluenceParser()   812     out.write(parser.parse_text(s, top=True))   813    814 if __name__ == "__main__":   815     s = codecs.getreader("utf-8")(sys.stdin).read()   816     out = codecs.getwriter("utf-8")(sys.stdout)   817     parse(s, out)   818    819 # vim: tabstop=4 expandtab shiftwidth=4