#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013, 2015 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then split into regions.
"""

from common import *
import re
import sys
import codecs
import operator

# Section extraction.

# The alternatives match, in order:
# - a section or macro marker such as {code} or {code:language=python}, but
#   not a {{monospace}} opening (hence the (?<!{) look-behind assertion);
# - the start of a table row at the beginning of a line;
# - the end of a table row at the end of a line;
# - a complete list item line.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}" \
                      r"|" \
                      r"^(?P<rowstart>[|]{1,2})" \
                      r"|" \
                      r"(?P<rowend>[|]{1,2}(\n|$))" \
                      r"|" \
                      r"^(?P<listitem>\s*[*#-]+\s+.*?([^|](\n|$)|(?=[|](\n|$))))"

sections_regexp = re.compile(sections_regexp_str, re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple
    of the form (type, text), as produced by get_section_details: type is
    either None (plain text) or a (sectiontype, options) tuple.
    """

    last = 0            # end of the previous match in 's'
    regions = [""]      # each region is accumulated as a string
    depth = 0           # nesting level of active section/table regions
    had_row = False     # whether the previous match was a table row
    had_item = False    # whether the previous match was a list item

    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Classify the match. Only markers carrying options, or row starts,
        # can open a region while another region is already active.

        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")
        is_item = match.group("listitem")

        # The start of a region is either indicated by a marker with options or
        # by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as a
            # "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # A new row may either continue a table region or start a new
                # table region.

                elif is_row:

                    # Adjacent rows (no intervening text) are merged into the
                    # preceding table region.

                    if had_row and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()
                    else:
                        regions.append(s[start:end])

                # A list item may either continue a list region or start a new
                # list region.

                elif is_item:

                    # If continuing a list, merge the list regions and start a
                    # new potentially separate region.

                    if had_item and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions[-1] = ""

                    # If not continuing a list, make a region for a new list and
                    # start a new potentially separate region.

                    else:
                        regions.append(s[start:end])
                        regions.append("")

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section or is_row:
                depth += 1

        # The end of a region is indicated by a marker with no options or the
        # end of a row.

        else:
            # Where no region is active, the text since the last match plus the
            # marker are added to the current "null" region.
            # NOTE(review): depth is always non-zero when this branch is
            # reached (see the condition above), so this case appears
            # unreachable — confirm before relying on it.

            if not depth:

                # Add to the string portion of the "null" region.

                regions[-1] += s[last:end]

            # Where a region is active, the end marker and preceding text is
            # either incorporated into the current region if more than one
            # region is active, or the preceding text is incorporated into the
            # current region and the details of the region are then obtained.

            else:
                if depth > 1 or (not is_section and not is_row):
                    regions[-1] += s[last:end]

                # Terminate the active region, interpreting its contents.

                else:
                    regions[-1] += s[last:end]
                    regions.append("")

            if is_section or is_row:
                depth -= 1

        had_row = is_row
        had_item = is_item
        last = end

    # Add any remaining text; where a region is still active, this
    # terminates it.

    regions[-1] += s[last:]

    return [get_section_details(s) for s in regions if s]

def is_section_marker(sectiontype):

    # Note: sectiontypes is defined later in this module; the lookup works
    # because it only occurs at call time. (Python 2 dict.has_key is used
    # throughout this module.)

    return sectiontypes.has_key(sectiontype) or sectiontype == "color"

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text): for text
    wrapped in matching section markers, type is a (sectiontype, options)
    tuple and text is the enclosed content; otherwise type is None and 's' is
    returned unchanged as text.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.
# Regular expressions matching whole lists, whole tables, and single-line
# heading/blockquote elements.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "|".join("(%s)" % pattern for pattern in (
        list_regexp_str,
        table_regexp_str,
        blocktext_regexp_str,
        )),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples. The type is "list", "table", a heading or
    blockquote identifier such as "h1" or "bq", or None for the plain text
    found between the recognised elements.
    """

    elements = []
    pos = 0

    for match in blockelement_regexp.finditer(s):
        begin, finish = match.span()

        # Classify the match according to which alternative participated.

        if match.group("listtype"):
            elementtype = "list"
        elif match.group("celltype"):
            elementtype = "table"
        else:
            elementtype = match.group("type")

        # Record the preceding plain text, then the element itself. Headings
        # and blockquotes supply their text via the "text" group; lists and
        # tables supply the whole match.

        elements.append((None, s[pos:begin]))
        elements.append((elementtype, match.group("text") or s[begin:finish]))
        pos = finish

    elements.append((None, s[pos:]))
    return elements

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split the given string 's' on runs of blank lines, returning the list of
    non-blank blocks.
    """

    return [block for block in block_regexp.split(s) if block.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return (type, text) blocks from the given string 's', splitting the plain
    text found between headings, lists and tables into basic blocks.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Plain text regions are split further into basic blocks.

        if blocktype is None:
            for basic in get_basic_blocks(blocktext):
                blocks.append((None, basic))

        # Heading, list and table blocks are kept as they are.

        else:
            blocks.append((blocktype, blocktext))

    return blocks

# List item inspection.
listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples, one for each list item line in
    the given list 'text'.
    """

    return [(item.group("marker"), item.group("text"))
            for item in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"(?<!\\)[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?)(?::(?P<options>.*?))?}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# All inline content patterns combined into a single alternation.

content_regexp_str = "|".join("(%s)" % pattern for pattern in (
    monospace_regexp_str,
    link_regexp_str,
    image_regexp_str,
    macro_regexp_str,
    italic_regexp_str,
    bold_regexp_str,
    del_regexp_str,
    underline_regexp_str,
    sub_regexp_str,
    ))

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Table cells may contain any of the inline content plus cell separators.

table_content_regexp_str = "%s|(%s)" % (content_regexp_str, cellsep_regexp_str)

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for line in text.split("|\n"):

        # An empty line indicates the end of the table text.

        if not line:
            break

        # Restore the trailing separator removed by the split.

        line += "|"

        sep = None
        cells = [""]
        pos = 0

        for match in table_content_regexp.finditer(line):
            begin, finish = match.span()
            cells[-1] += line[pos:begin]

            # A separator finishes the current cell; the first separator seen
            # determines the row type (plain or header).

            marker = match.group("celltype")
            if marker:
                if sep is None:
                    sep = marker
                cells.append("")

            # Other inline content is kept within the current cell.

            else:
                cells[-1] += match.group()

            pos = finish

        cells[-1] += line[pos:]

        # Discard the (empty) text before the first and after the last
        # separator.

        if sep:
            rows.append((sep, cells[1:-1]))

    return rows

# Notation conversion.

# Confluence escape/notation -> Moin equivalent in normal text.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    (r"\[", "<<Verbatim([)>>"),
    (r"\]", "<<Verbatim(])>>"),
    (r"\*", "*"),
    ]

# Confluence escape/notation -> Moin equivalent in preformatted text.

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

# Confluence list marker character -> Moin list marker.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Confluence cell separator -> Moin cell separator.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Extra decoration applied to cell content: header cells ("||") are made bold.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Confluence section type -> Moin parser/format declaration.

sectiontypes = {
    "code" : "",
    "excerpt" : "#!wiki",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

# Section types whose content is not further formatted. None covers anonymous
# sections (see enter_section's default argument).

preformatted_sectiontypes = (None, "noformat")

# Macro name -> name of the leading argument in the corresponding Moin macro.

macroargs = {
    "color" : "col",
    }

# Macro name -> Moin macro template.

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    "toc" : "<<TableOfContents>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self, is_comment_page=False):

        """
        Initialise the parser. If 'is_comment_page' is given as a true value,
        page links are prefixed with an extra "../" to compensate for the
        location of comment pages in the page hierarchy.
        """

        self.is_comment_page = is_comment_page

        # Section nesting state: the current level and the deepest level seen
        # so far, used by nest_section to size Moin section brackets.

        self.max_level = self.level = 0

        # Macro suppression state: macros are not legal Moin syntax inside
        # headings or inside other macros. Anchors encountered in headings are
        # held and emitted before the heading instead.

        self.in_heading = False
        self.held_anchors = []
        self.macro = None

        # Stack of active section types (None for anonymous sections).

        self.sections = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # Indent by the marker length, translating only the final character.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        # Header cells gain bold decoration via cellextra.

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        # Monospace text.

        if match.group("monotext"):
            # NOTE(review): the enter/leave pair updates max_level (and resets
            # it at the top level) without leaving an active section — confirm
            # this side-effect on section bracket sizing is intended.
            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        # Links of the form [label|target|title] with optional parts.

        elif match.group("linktext"):
            parts = match.group("linktext").split("|")

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them.

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    rest = get_page_title(rest)
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"
                if self.is_comment_page:
                    prefix += "../"

            # Make the link tidier by providing a label if none was given.

            if not label:
                label = target

            target = get_page_title(target)

            # NOTE(review): label is defaulted to target above, so the first
            # branch here only fires for empty labels — confirm that this is
            # the intended behaviour.

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        # Images of the form !target|options!.

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        # Inline macros of the form {name} or {name:options}.

        elif match.group("macro"):
            macro_name = match.group("macro")
            if macrotypes.has_key(macro_name):
                argname = macroargs.get(macro_name)
                # NOTE(review): macrotypes["color"] also references a
                # "content" key, which would raise KeyError here — confirm
                # that {color} is always captured as a section instead.
                result = macrotypes[macro_name] % {
                    "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + (match.group("options") or ""))
                    }
                if not self.forbids_macros():
                    return result

                # In a heading, hold anchors for emission before the heading.

                if macro_name == "anchor":
                    self.held_anchors.append(result)
                return ""
            # NOTE(review): a macro name absent from macrotypes falls through
            # and returns None implicitly, which would break the join in
            # translate_content — confirm such input cannot occur here.

        # Simple formatting, translated recursively.

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        # Anything else is passed through as plain text.

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. Where
        'preformatted' is set, the reduced preformatted mapping is used.
        """

        for before, after in preformatted and preformatted_notation_mapping or notation_mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text', translating inline content
        matches and converting the plain text notation around them.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections: matches are passed through intact.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        # True if any active section is a preformatted one. (reduce is a
        # builtin in Python 2; this is equivalent to an any(...) test.)

        return reduce(operator.or_, [x in preformatted_sectiontypes for x in self.sections], False)

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        # Entering a heading: suppress macros, collect anchors for emission
        # before the heading. ('headings' and 'blocktypes' come from common.)

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktypes.has_key(blocktype):
            text = self.parse_text(blocktext)
            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section("table")

            table_parts = []
            first = True

            # Rows after the first are separated by a Moin row separator.

            for cellsep, columns in get_table_rows(blocktext):
                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of output parts.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections; noformat content is only
        # subject to inline content translation.

        if sectiontype == "noformat":
            section_content = self.translate_content(text.strip("\n"))
        else:
            section_content = self.parse_text(text.strip())

        # Nest the section appropriately.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        # Record the section type and track the deepest nesting level.

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        # Reset the depth record upon leaving the outermost section.

        self.level -= 1
        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        # Produce Moin section brackets, with the outermost section getting
        # the most braces so that nested sections stay distinguishable.

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        """
        Parse the content in the string 's', returning the translation. The
        'top' flag indicates the top-level invocation, where the early-return
        base case must not apply.
        """

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for type, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if type is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = type

                # Direct translations of sections.

                if sectiontypes.has_key(sectiontype):
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif macrotypes.has_key(sectiontype):

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype
                        argname = macroargs.get(sectiontype)
                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        # Macros are forbidden inside headings and inside other macros.

        return self.in_heading or self.macro

def parse(s, out, is_comment_page=False):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser(is_comment_page)
    out.write(parser.parse_text(s, top=True))

if __name__ == "__main__":
    # Read UTF-8 from standard input and write UTF-8 to standard output
    # (Python 2 idiom).
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4