ConfluenceConverter (file wikiparser.py at ef640bccabac)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then split into regions.    32 """    33     34 from common import *    35 import re    36 import sys    37 import codecs    38 import operator    39     40 # Section extraction.    
# A single pattern recognising every marker that can delimit a region:
#
#  * "{type}" or "{type:options}" macro/section markers (not preceded by "{",
#    so that "{{monospace}}" is left alone)
#  * one or two "|" characters at the start of a line (start of a table row)
#  * one or two "|" characters at the end of a line (end of a table row)
#  * a complete list item line

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}" \
                      r"|" \
                      r"^(?P<rowstart>[|]{1,2})" \
                      r"|" \
                      r"(?P<rowend>[|]{1,2}(\n|$))" \
                      r"|" \
                      r"^(?P<listitem>\s*[*#-]+\s+.*?([^|](\n|$)|(?=[|](\n|$))))"

sections_regexp = re.compile(sections_regexp_str, re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text), as produced by get_section_details: 'type' is None
    for plain text, tables and lists, and a (sectiontype, options) tuple for
    text enclosed by matching section markers.
    """

    last = 0            # end of the previous match
    regions = [""]      # the last element is always the region being built
    depth = 0           # number of currently open sections/rows
    had_row = False
    had_item = False

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")
        is_item = match.group("listitem")

        # The start of a region is either indicated by a marker with options or
        # by a marker where no region is currently active.

        if is_start or not depth:

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            if depth:
                regions[-1] += s[last:end]

            # Where no region is active, add the text since the last match as a
            # "null" region and then decide what the marker starts.

            else:
                regions[-1] += s[last:start]
                marker = s[start:end]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(marker)

                # A new row may either continue a table region (when it
                # immediately follows the previous row) or start a new one.

                elif is_row:
                    if had_row and last == start:
                        regions[-2] += regions[-1] + marker
                        regions.pop()
                    else:
                        regions.append(marker)

                # A list item may either continue a list region or start a new
                # list region. Either way, a fresh and potentially separate
                # region is started after it.

                elif is_item:
                    if had_item and last == start:
                        regions[-2] += regions[-1] + marker
                        regions[-1] = ""
                    else:
                        regions.append(marker)
                        regions.append("")

                # Certain markers may be standalone macros: keep them as text.

                else:
                    regions[-1] += marker

            if is_section or is_row:
                depth += 1

        # Otherwise a region is active and the marker has no options: it is
        # either the end of a section/row or just text inside the region.

        else:
            regions[-1] += s[last:end]

            if is_section or is_row:

                # Closing the outermost region terminates it; closing a nested
                # one merely leaves it incorporated in the enclosing region.

                if depth == 1:
                    regions.append("")
                depth -= 1

        had_row = is_row
        had_item = is_item
        last = end

    # Where a region is still active, terminate it.

    regions[-1] += s[last:]

    return [get_section_details(region) for region in regions if region]

def is_section_marker(sectiontype):

    """
    Return whether 'sectiontype' names a marker that encloses a region: one of
    the keys of 'sectiontypes' or the "color" macro.
    """

    return sectiontype in sectiontypes or sectiontype == "color"

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text). Where 's'
    starts with a "{type}" or "{type:options}" marker and contains a matching
    "{type}" marker, 'type' is a (sectiontype, options) tuple and 'text' is the
    enclosed content; otherwise 'type' is None and 'text' is 's' itself.
    """

    match = section_regexp.match(s)
    if match is None:
        return None, s
    return (match.group("sectiontype"), match.group("options")), match.group("section")

# Heading, table and list extraction.
# A list is a run of lines starting with the same kind of marker; a table is a
# run of rows delimited by "|" or "||"; block text is a heading ("h1." etc.) or
# a blockquote ("bq.") occupying a single line.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples. Text between the recognised elements (and
    before the first and after the last, even where empty) is included with a
    type of None. Lists and tables have the types "list" and "table" and retain
    their complete text; headings and blockquotes have the matched type (such
    as "h1" or "bq") and only the text following the marker.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))

        # Only block text defines the "text" group; lists and tables are
        # passed on verbatim for later inspection.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate. Each block is a
    (type, text) tuple as described for get_block_elements, except that text
    having a type of None is further split on blank lines.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Content inspection.

# NOTE: The link pattern uses "\[" instead of the equivalent "[[]" because the
# NOTE: latter provokes a "possible nested set" FutureWarning in newer versions
# NOTE: of the re module.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str      = r"\[(?P<linktext>.*?)]"
image_regexp_str     = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str     = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str    = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str      = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str       = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str       = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + macro_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Content patterns are tried before the cell separator so that a "|" appearing
# inside inline markup (for example, a link with a label) does not split a cell.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'. For
    each row, 'cellsep' is the first cell separator found in the row ("|" or
    "||") and 'columns' is a list of the cell texts found between the first and
    last separators. Rows without any separator are omitted.
    """

    rows = []

    for row in text.split("|\n"):

        # Skip empty pieces: these arise after the final row and for any line
        # consisting only of "|". Later rows must still be processed.

        if not row:
            continue

        # Restore the separator consumed by the split.

        row += "|"
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            # A separator starts a new column; the first one found determines
            # the kind of row.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Inline markup is kept verbatim within the current column.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        # Discard the text before the first and after the last separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# Notation conversion.

# Each mapping is a list of (Confluence notation, replacement) pairs applied in
# order using plain string replacement.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.
# Confluence list markers and their Moin equivalents.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Text used to join the cells of a row, for each kind of cell separator.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Markup placed around the content of a cell, for each kind of cell separator.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Recognised section types and the text following the opening braces of the
# corresponding Moin section.

sectiontypes = {
    "code"      : "",
    "noformat"  : "",
    "quote"     : "",
    "info"      : "#!wiki important",
    "note"      : "#!wiki caution",
    "tip"       : "#!wiki tip",
    "warning"   : "#!wiki warning",
    }

# Entries on the section stack that cause text to be treated as preformatted.
# None is the entry pushed by enter_section when no section type is given.

preformatted_sectiontypes = (None, "noformat")

# Argument names to be prefixed to the options of particular macros.

macroargs = {
    "color"     : "col",
    }

# Recognised macros and the templates used to produce their Moin equivalents.

macrotypes = {
    "anchor"    : "<<Anchor(%(args)s)>>",
    "color"     : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    # Inline styles as (regular expression group, Moin template) pairs.

    _inline_styles = (
        ("italictext", "''%s''"),
        ("boldtext", "'''%s'''"),
        ("deltext", "--(%s)--"),
        ("underlinetext", "__%s__"),
        ("subtext", ",,%s,,"),
        )

    def __init__(self):
        self.max_level = self.level = 0     # deepest and current section nesting
        self.in_heading = False             # whether a heading is being translated
        self.held_anchors = []              # anchors deferred while macros are forbidden
        self.macro = None                   # the section-like macro being translated
        self.sections = []                  # the stack of active section types

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # The marker length determines the indentation and the final marker
        # character determines the kind of item.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        extra = cellextra[cellsep]
        return extra + self.parse_text(text).strip() + extra

    def _macro_args(self, macro_name, options):

        """
        Return the quoted argument string for 'macro_name' given its 'options'
        (which may be None), prefixing any argument name from 'macroargs'.
        """

        argname = macroargs.get(macro_name)
        prefix = argname and ("%s=" % argname) or ""
        return quote_macro_argument(prefix + (options or ""))

    def _translate_link(self, linktext):

        "Translate 'linktext', the content of a Confluence link, to a Moin link."

        # Split into at most (label, target, title): any further "|" characters
        # are kept in the title instead of causing an unpacking error.

        parts = linktext.split("|", 2)

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                rest = get_page_title(rest)
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.
        # NOTE(review): the leading "^" is retained in the target - confirm that
        # NOTE(review): this is what the attachment naming scheme expects.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

            # Make the link tidier by making a label if none was given.

            if not label:
                label = target

            target = get_page_title(target)

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)
        elif not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    def _translate_image(self, imagetext):

        "Translate 'imagetext', the content of a Confluence image, to Moin."

        parts = imagetext.split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    def _translate_macro(self, macro_name, options):

        """
        Translate the inline macro 'macro_name' having the given 'options'.
        Unrecognised macros, and macros appearing where macros are forbidden,
        produce an empty string; forbidden anchors are held for later output.
        """

        if macro_name not in macrotypes:
            return ""

        # An inline macro has no content, but some templates refer to it and
        # would otherwise cause a KeyError.

        result = macrotypes[macro_name] % {
            "content" : "",
            "args" : self._macro_args(macro_name, options),
            }

        if not self.forbids_macros():
            return result

        if macro_name == "anchor":
            self.held_anchors.append(result)

        return ""

    def translate_content_match(self, match):

        """
        Translate the content described by the given 'match' (a match of the
        content pattern), returning a string.
        """

        if match.group("monotext"):

            # Register a nested level so that any enclosing section is given
            # enough braces to contain the monospaced text.

            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):
            return self._translate_link(match.group("linktext"))

        elif match.group("imagetext"):
            return self._translate_image(match.group("imagetext"))

        elif match.group("macro"):
            return self._translate_macro(match.group("macro"), match.group("options"))

        # Inline styles have their content translated recursively.

        for group, template in self._inline_styles:
            if match.group(group):
                return template % self.translate_content(match.group(group))

        return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. Where
        'preformatted' is true, the notation mapping for preformatted text is
        used.
        """

        if preformatted:
            mapping = preformatted_notation_mapping
        else:
            mapping = notation_mapping

        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text'. The translation depends on
        the sections currently active: plain text is converted as preformatted
        text where appropriate, and inline markup is left untouched where the
        innermost section is a "code" or "noformat" section.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        "Return whether any active section is a preformatted section type."

        return any(x in preformatted_sectiontypes for x in self.sections)

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        # 'headings' and 'blocktypes' are not defined in this file and are
        # presumably provided by the "common" module.

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes, emitting any anchors that could
        # not be produced within them beforehand.

        if blocktype in blocktypes:
            text = self.parse_text(blocktext)
            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):

                # Separate rows from each other.

                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of strings making up the translation.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections.

        if sectiontype == "noformat":
            section_content = self.translate_content(text.strip("\n"))
        else:
            section_content = self.parse_text(text.strip())

        # Nest the section appropriately. This must follow the translation of
        # the content since that determines the deepest level of nesting.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        "Record the start of a section of the given 'sectiontype'."

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        "Record the end of the innermost section."

        self.level -= 1

        # Forget the deepest level once the outermost section has ended.

        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return an (opening, closing) tuple of brace strings for the current
        section, using at least three braces plus one for each level of nesting
        encountered within the section.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        """
        Parse the content in the string 's', returning the translation. Where
        'top' is true, 's' is always processed as a sequence of blocks, even if
        it consists of a single plain region.
        """

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for region_type, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if region_type is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = region_type

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif sectiontype in macrotypes:

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype
                        parts.append(macrotypes[sectiontype] % {
                            "content"   : quote_macro_argument(self.parse_text(text)),

                            # The options may be absent (None) where the macro
                            # was opened without any.

                            "args"      : self._macro_args(sectiontype, options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        """
        Return a true value if macros must not be produced: within headings and
        within the content of section-like macros.
        """

        return self.in_heading or self.macro

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s, top=True))

if __name__ == "__main__":

    # Use the underlying byte streams where the standard streams are text
    # streams, so that the UTF-8 codecs are always applied to bytes.

    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)

    s = codecs.getreader("utf-8")(stdin).read()
    out = codecs.getwriter("utf-8")(stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4