#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then split into regions.
"""

from common import *
import re
import sys
import codecs
import operator

# Section extraction.

# The alternatives recognise, in order:
# - a {type} or {type:options} section marker (not preceded by another "{"),
# - a table row start ("|" or "||" at the start of a line),
# - a table row end ("|" or "||" before a newline or end of input),
# - a whole list item line (leading "*", "#" or "-" markers plus text).

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}" \
    r"|" \
    r"^(?P<rowstart>[|]{1,2})" \
    r"|" \
    r"(?P<rowend>[|]{1,2}(\n|$))" \
    r"|" \
    r"^(?P<listitem>\s*[*#-]+\s+.*(\n|$))"

sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text), as produced by get_section_details. Plain text
    between markers is gathered into "null" regions whose type is None.
    """

    last = 0                # end offset of the previous match in 's'
    regions = [""]          # regions built up as strings; the last is current
    depth = 0               # nesting level of active sections/tables
    had_row = False         # whether the previous match was a table row marker
    had_item = False        # whether the previous match was a list item

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")
        is_item = match.group("listitem")

        # The start of a region is either indicated by a marker with options or
        # by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as a
            # "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # A new row may either continue a table region (when it starts
                # exactly where the previous row marker ended) or start a new
                # table region.

                elif is_row:
                    if (last != start or not had_row):
                        regions.append(s[start:end])
                    else:
                        # Merge the current region back into its predecessor.
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()

                # A list item may either continue a list region or start a new
                # list region.

                elif is_item:
                    if (last != start or not had_item):
                        regions.append(s[start:end])
                    else:
                        # Merge the current region back into its predecessor.
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section or is_row:
                depth += 1

        # The end of a region is indicated by a marker with no options.

        else:
            # Where no region is active, the text since the last match plus the
            # marker are added to the current "null" region.

            if not depth:

                # Add to the string portion of the "null" region.

                regions[-1] += s[last:end]

            # Where a region is active, the end marker and preceding text is
            # either incorporated into the current region if more than one
            # region is active, or the preceding text is incorporated into the
            # current region and the details of the region are then obtained.

            else:
                if depth > 1 or (not is_section and not is_row):
                    regions[-1] += s[last:end]

                # Terminate the active region, interpreting its contents.

                else:
                    regions[-1] += s[last:end]
                    regions.append("")

                if is_section or is_row:
                    depth -= 1

        had_row = is_row
        had_item = is_item
        last = end

    # Where a region is still active, terminate it.

    regions[-1] += s[last:]

    # Discard empty regions and interpret each remaining one. Note that 's'
    # deliberately shadows the parameter within the comprehension.

    return [get_section_details(s) for s in regions if s]

def is_section_marker(sectiontype):

    # "color" is handled as a macro-like section even though it has no entry
    # in the sectiontypes mapping (defined later in this module).

    return sectiontypes.has_key(sectiontype) or sectiontype == "color"

# Section inspection.

# Matches a {type:options}...{type} span, capturing the type, the optional
# options and the enclosed section text.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        # Plain text: no section type or options apply.
        return None, s

# Heading, table and list extraction.
# Patterns for whole list runs, whole tables and single heading/blockquote
# lines respectively.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "|".join("(%s)" % pattern for pattern in (
        list_regexp_str,
        table_regexp_str,
        blocktext_regexp_str,
        )),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples. The type is "list", "table", a heading or
    blockquote label such as "h1" or "bq", or None for the plain text found
    between recognised elements.
    """

    elements = []
    pos = 0

    for match in blockelement_regexp.finditer(s):
        begin, finish = match.span()

        # Decide which alternative matched.

        if match.group("listtype"):
            elementtype = "list"
        elif match.group("celltype"):
            elementtype = "table"
        else:
            elementtype = match.group("type")

        # Record the preceding plain text, then the element itself. Headings
        # and blockquotes provide their text via the "text" group; other
        # elements use the whole matched span.

        elements.append((None, s[pos:begin]))
        elements.append((elementtype, match.group("text") or s[begin:finish]))
        pos = finish

    elements.append((None, s[pos:]))
    return elements

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [piece for piece in block_regexp.split(s) if piece.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Attempt to find new subblocks in plain regions; collect heading,
        # list and table blocks as they are.

        if blocktype is None:
            blocks.extend((None, basic) for basic in get_basic_blocks(blocktext))
        else:
            blocks.append((blocktype, blocktext))

    return blocks

# List item inspection.
listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(found.group("marker"), found.group("text"))
            for found in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# Combine the individual patterns into one alternation, each wrapped in a
# plain group.

content_regexp_str = "|".join("(%s)" % pattern for pattern in (
    monospace_regexp_str,
    link_regexp_str,
    image_regexp_str,
    macro_regexp_str,
    italic_regexp_str,
    bold_regexp_str,
    del_regexp_str,
    underline_regexp_str,
    sub_regexp_str,
    ))

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content plus cell separators, used when scanning table rows.

table_content_regexp_str = content_regexp_str + "|" "(" + cellsep_regexp_str + ")"

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for segment in text.split("|\n"):
        if not segment:
            break

        # Restore the trailing separator removed by the split.

        segment += "|"

        sep = None          # the first cell separator seen in this row
        cells = [""]        # accumulated cell texts; the first is a sentinel
        pos = 0

        for found in table_content_regexp.finditer(segment):
            begin, finish = found.span()
            cells[-1] += segment[pos:begin]

            # A separator closes the current cell; any other inline markup is
            # kept verbatim so it can be translated later.

            if found.group("celltype"):
                if sep is None:
                    sep = found.group("celltype")
                cells.append("")
            else:
                cells[-1] += found.group()

            pos = finish

        cells[-1] += segment[pos:]

        # The sentinel first entry and the empty final entry are discarded.

        if sep:
            rows.append((sep, cells[1:-1]))

    return rows

# Notation conversion.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.
# Moin list markers for each Confluence list marker character.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Moin cell separators for each Confluence cell separator.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Extra decoration applied to cell contents ("||" denotes a header cell,
# rendered in bold).

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Moin section/parser directives for each Confluence section type. Empty
# values indicate a plain Moin section.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

# Section types whose contents must not be reformatted. None covers anonymous
# (typeless) sections.

preformatted_sectiontypes = (None, "noformat")

# Macro argument names for macros requiring keyword arguments.

macroargs = {
    "color" : "col",
    }

# Moin macro templates for each Confluence macro.

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):

        # level is the current section nesting depth; max_level records the
        # deepest nesting seen so that nest_section can compute bracket runs.

        self.max_level = self.level = 0

        # Whether a heading is currently being translated (macros are not
        # permitted inside Moin headings).

        self.in_heading = False

        # Anchor macros held back until they can be emitted before a heading.

        self.held_anchors = []

        # The macro currently being produced, if any.

        self.macro = None

        # Stack of active section types (None for anonymous sections).

        self.sections = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # Indentation by one space per marker character, then the Moin marker
        # chosen from the last (innermost) Confluence marker character.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        if match.group("monotext"):

            # Enter and leave a section to influence the nesting level of any
            # enclosing sections.

            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):
            parts = match.group("linktext").split("|")

            # NOTE: Proper detection of external links required.

            # Links take the forms target, label|target, label|target|title.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them. URL_SCHEMES is
            # expected from the "common" module (star-imported above).

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    rest = get_page_title(rest)
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

            # Make the link tidier by making a target if none was given.

            if not label:
                label = target

            # NOTE(review): get_page_title is applied to all targets,
            # including URL-scheme targets — confirm this is intended.

            target = get_page_title(target)

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        elif match.group("macro"):
            macro_name = match.group("macro")

            # NOTE(review): a macro absent from macrotypes falls through this
            # branch returning None — presumably prevented upstream by the
            # section handling in parse_text; confirm.

            if macrotypes.has_key(macro_name):
                argname = macroargs.get(macro_name)
                result = macrotypes[macro_name] % {
                    "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + match.group("options"))
                    }
                if not self.forbids_macros():
                    return result

                # Macros are not allowed here: hold anchors back for emission
                # before the enclosing heading; drop everything else.

                if macro_name == "anchor":
                    self.held_anchors.append(result)
                return ""

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. Where
        'preformatted' is set, line-break notation is converted to literal
        newlines instead of Moin <<BR>> macros.
        """

        for before, after in preformatted and preformatted_notation_mapping or notation_mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text', converting inline markup
        except within "code" and "noformat" sections, whose markup is kept
        verbatim.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        "Return whether any active section is a preformatted section type."

        return reduce(operator.or_, [x in preformatted_sectiontypes for x in self.sections], False)

    def translate_block(self, blocktype, blocktext):

        """
        Translate the block with the given 'blocktype' and 'blocktext'.
        'headings' and 'blocktypes' are expected from the "common" module
        (star-imported above).
        """

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktypes.has_key(blocktype):
            text = self.parse_text(blocktext)

            # Emit any anchors collected while translating the heading text
            # before the heading itself.

            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):
                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of Moin text parts.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections.

        section_content = self.parse_text(text.strip())

        # Nest the section appropriately.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        "Record entry into a section of the given 'sectiontype'."

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        "Record departure from the current section."

        self.level -= 1

        # Reset the maximum level once all sections have been left.

        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return (opening, closing) brace runs for the current section, using
        longer runs for outer sections so that nested Moin sections remain
        distinguishable.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        "Parse the content in the string 's', returning the translation."

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for type, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if type is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = type

                # Direct translations of sections.

                if sectiontypes.has_key(sectiontype):
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif macrotypes.has_key(sectiontype):

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype
                        argname = macroargs.get(sectiontype)
                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        "Return whether macros may not currently be produced."

        return self.in_heading or self.macro

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s, top=True))

if __name__ == "__main__":

    # Read Confluence markup from standard input, write Moin markup to
    # standard output, both as UTF-8.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4