ConfluenceConverter (file wikiparser.py at e014b3b56995)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then parsed.    32 """    33     34 from common import *    35 import re    36 import sys    37 import codecs    38     39 # Section extraction.    40     41 sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"    42 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)    43     44 def get_regions(s):    45     46     """    47     Return a list of regions from 's'. Each region is specified using a tuple of    48     the form (type, text).    
49     """    50     51     last = 0    52     regions = []    53     for match in sections_regexp.finditer(s):    54         start, end = match.span()    55         regions.append((None, s[last:start]))    56         regions.append(get_section_details(s[start:end]))    57         last = end    58     regions.append((None, s[last:]))    59     return regions    60     61 # Section inspection.    62     63 section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"    64 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)    65     66 def get_section_details(s):    67     68     "Return the details of a section 's' in the form (type, text)."    69     70     match = section_regexp.match(s)    71     if match:    72         return (match.group("sectiontype"), match.group("options")), match.group("section")    73     else:    74         return None, s    75     76 # Heading, table and list extraction.    77     78 list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"    79 table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"    80 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"    81     82 blockelement_regexp = re.compile(    83     "(" + list_regexp_str + ")"    84     "|"    85     "(" + table_regexp_str + ")"    86     "|"    87     "(" + blocktext_regexp_str + ")",    88     re.MULTILINE    89     )    90     91 def get_block_elements(s):    92     93     """    94     Extract headings, tables and lists from the given string 's'.    
95     """    96     97     last = 0    98     blocks = []    99     for match in blockelement_regexp.finditer(s):   100         start, end = match.span()   101         matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")   102         blocks.append((None, s[last:start]))   103         blocks.append((matchtype, match.group("text") or s[start:end]))   104         last = end   105     blocks.append((None, s[last:]))   106     return blocks   107    108 # Block extraction.   109    110 block_regexp_str = r"^(?:\s*\n)+"   111 block_regexp = re.compile(block_regexp_str, re.MULTILINE)   112    113 def get_basic_blocks(s):   114    115     """   116     Return blocks from the given string 's' by splitting the text on blank lines   117     and eliminating those lines.   118     """   119    120     return [b for b in block_regexp.split(s) if b.strip()]   121    122 # Block inspection.   123    124 def get_blocks(s):   125    126     """   127     Return blocks from the given string 's', inspecting the basic blocks and   128     generating additional block-level text where appropriate.   129     """   130    131     blocks = []   132    133     for blocktype, blocktext in get_block_elements(s):   134    135         # Collect heading, list and table blocks.   136    137         if blocktype is not None:   138             blocks.append((blocktype, blocktext))   139    140         # Attempt to find new subblocks in other regions.   141    142         else:   143             for block in get_basic_blocks(blocktext):   144                 blocks.append((None, block))   145    146     return blocks   147    148 # List item inspection.   149    150 listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"   151 listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)   152    153 def get_list_items(text):   154    155     "Return a list of (marker, text) tuples for the given list 'text'."   
156    157     items = []   158    159     for match in listitem_regexp.finditer(text):   160         items.append((match.group("marker"), match.group("text")))   161    162     return items   163    164 # Content inspection.   165    166 monospace_regexp_str = r"{{(?P<monotext>.*?)}}"   167 link_regexp_str      = r"[[](?P<linktext>.*?)]"   168 image_regexp_str     = r"!(?P<imagetext>\w.*?)!"   169 macro_regexp_str     = r"{(?P<macro>.*?):(?P<options>.*?)}"   170    171 # Word-dependent patterns.   172 # Here, the unbracketed markers must test for the absence of surrounding word   173 # characters.   174    175 italic_regexp_str    = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"   176 bold_regexp_str      = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"   177 del_regexp_str       = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"   178 underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"   179 sub_regexp_str       = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"   180    181 content_regexp_str = (   182     "(" + monospace_regexp_str + ")"   183     "|"   184     "(" + link_regexp_str + ")"   185     "|"   186     "(" + image_regexp_str + ")"   187     "|"   188     "(" + macro_regexp_str + ")"   189     "|"   190     "(" + italic_regexp_str + ")"   191     "|"   192     "(" + bold_regexp_str + ")"   193     "|"   194     "(" + del_regexp_str + ")"   195     "|"   196     "(" + underline_regexp_str + ")"   197     "|"   198     "(" + sub_regexp_str + ")"   199     )   200    201 # Table row inspection.   

# A cell separator is a single bar (ordinary cell) or a double bar (heading
# cell).

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content is matched in preference to separators so that bars inside
# links, images and similar constructs do not split cells.

table_content_regexp_str = content_regexp_str + "|" + "(" + cellsep_regexp_str + ")"

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Each tuple describes one row: 'cellsep' is the first separator found in
    the row and 'columns' is the list of cell texts lying between the row's
    first and last separators. Rows in which no separator is found are
    omitted, and processing stops at the first empty row.
    """

    rows = []

    for line in text.split("|\n"):
        if not line:
            break

        # Restore the trailing separator consumed by the split.

        line += "|"

        separator = None
        cells = [""]
        position = 0

        for found in table_content_regexp.finditer(line):
            begin, finish = found.span()

            # Plain text before the match belongs to the current cell.

            cells[-1] += line[position:begin]

            # Inline content stays in the current cell; a separator starts a
            # new one.

            if not found.group("celltype"):
                cells[-1] += found.group()
            else:
                if separator is None:
                    separator = found.group("celltype")
                cells.append("")

            position = finish

        cells[-1] += line[position:]

        # Drop the text before the first separator and after the last one.

        if separator:
            rows.append((separator, cells[1:-1]))

    return rows

# Notation conversion.

# Each entry is a (before, after) pair applied using plain string replacement,
# in the order given.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

# List marker characters and their Moin equivalents.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Text used to join the cells of a row, keyed by Confluence cell separator.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Markup placed around the content of each cell (bold for heading cells).

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Confluence section types and the Moin region header used for each.

sectiontypes = {
    "code"      : "",
    "noformat"  : "",
    "quote"     : "",
    "info"      : "#!wiki important",
    "note"      : "#!wiki caution",
    "tip"       : "#!wiki tip",
    "warning"   : "#!wiki warning",
    }

# NOTE(review): None appears here, so content translated without a section
# type uses the preformatted notation mapping - confirm that this is intended.

preformatted_sectiontypes = (None, "noformat")

# Names given to the otherwise anonymous option of a macro.

macroargs = {
    "color"     : "col",
    }

# Moin macro templates, keyed by Confluence macro name.

macrotypes = {
    "anchor"    : "<<Anchor(%(args)s)>>",
    "color"     : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):

        # The current section nesting depth and the greatest depth reached
        # inside the current outermost section.

        self.max_level = self.level = 0

        # Whether a heading is being translated, and the anchors found inside
        # it that could not be written in place.

        self.in_heading = False
        self.held_anchors = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # The length of the marker (including any leading spaces) gives the
        # indentation; its final character selects the list type.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        extra = cellextra[cellsep]
        return extra + self.parse_text(text).strip() + extra

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        if match.group("monotext"):

            # Record an extra level of nesting so that any enclosing section
            # is given more braces than the monospace markers used here.

            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):
            return self._translate_link(match.group("linktext"))

        elif match.group("imagetext"):
            return self._translate_image(match.group("imagetext"))

        elif match.group("macro"):
            return self._translate_macro(match.group("macro"), match.group("options"))

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        # Matches with empty content are treated as plain text.

        else:
            return self.translate_text(match.group())

    def _translate_link(self, linktext):

        "Translate the 'linktext' found between link brackets to a Moin link."

        parts = linktext.split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:

            # Ignore anything beyond the third part instead of failing to
            # unpack the parts.

            label, target, title = parts[:3]

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments, removing the marker so that the target is the
        # name of the attachment itself.

        elif target.startswith("^"):
            prefix = "attachment:"
            target = target[1:]

        # Link to other pages within a space.

        else:
            prefix = "../"

            # Make the link tidier by making a label if none was given.

            if not label:
                label = target

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)
        elif not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    def _translate_image(self, imagetext):

        "Translate the 'imagetext' found between image markers to a Moin image."

        parts = imagetext.split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    def _translate_macro(self, macro_name, options):

        """
        Translate the inline macro having the given 'macro_name' and 'options',
        returning an empty string for macros that cannot be written in place.
        """

        if macro_name not in macrotypes:
            return ""

        argname = macroargs.get(macro_name)
        if argname:
            args = "%s=%s" % (argname, options)
        else:
            args = options
        args = quote_macro_argument(args)

        # Inline macros have no content, so templates requiring content (such
        # as that of "color") cannot be completed here: produce nothing rather
        # than failing with a KeyError.

        try:
            result = macrotypes[macro_name] % {"args" : args}
        except KeyError:
            return ""

        if not self.forbids_macros():
            return result

        # Anchors found where macros are forbidden are held back for the
        # enclosing block to emit.

        if macro_name == "anchor":
            self.held_anchors.append(result)

        return ""

    def translate_text(self, s, preformatted=False):

        "Translate the plain text string 's', converting notation."

        if preformatted:
            mapping = preformatted_notation_mapping
        else:
            mapping = notation_mapping

        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text, sectiontype=None):

        """
        Return a translation of the given 'text'. If the optional 'sectiontype' is
        specified, the translation may be modified to a form appropriate to the
        section being translated.
        """

        parts = []
        preformatted = sectiontype in preformatted_sectiontypes

        # Inline markup is copied through untouched in unformatted sections.

        unformatted = sectiontype in ("code", "noformat")

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if unformatted:
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        is_heading = blocktype in headings

        if is_heading:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            text = self.translate_content(blocktext)

            # Emit any anchors held back during translation ahead of the block
            # itself, then forget them so that a later block cannot emit them
            # a second time.

            parts.extend(self.held_anchors)
            self.held_anchors = []

            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.translate_content(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):

                # Rows after the first are preceded by a row separator.

                if not first:
                    table_parts.append("==")
                else:
                    first = False

                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately. This must follow the translation
            # of the cells, which may raise the maximum nesting level.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.translate_content(blocktext))

        if is_heading:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of strings to be joined by the caller.
        """

        parts = []

        # Enter the section.

        self.enter_section()

        mointype = sectiontypes.get(sectiontype)
        section_content = self.translate_content(text.strip(), sectiontype)

        # Nest the section appropriately. This must follow the translation of
        # the content, which may raise the maximum nesting level.

        opening, closing = self.nest_section()

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self):

        "Record entry into a nested section, noting the greatest depth reached."

        self.level += 1
        self.max_level = max(self.level, self.max_level)

    def leave_section(self):

        """
        Record exit from a section, resetting the greatest depth once the
        outermost section has been left.
        """

        self.level -= 1
        if not self.level:
            self.max_level = 0

    def nest_section(self):

        """
        Return an (opening, closing) tuple of brace strings for the current
        section, using more braces the more deeply other sections have been
        nested within it.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s):

        "Parse the content in the string 's', returning the translation."

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for region_type, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if region_type is None:
                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                # Only regions that actually produced blocks affect spacing.

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = region_type

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros (which can look like sections).

                elif sectiontype in macrotypes and not self.forbids_macros():
                    argname = macroargs.get(sectiontype)

                    # Options are None when the opening tag has none, so avoid
                    # concatenating None with the argument name.

                    args = (argname and ("%s=" % argname) or "") + (options or "")

                    parts.append(macrotypes[sectiontype] % {
                        "content"   : quote_macro_argument(self.translate_content(text, sectiontype)),
                        "args"      : quote_macro_argument(args)
                        })
                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        "Return whether macros cannot be written in place (inside headings)."

        return self.in_heading

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s))

if __name__ == "__main__":

    # Decode and encode as UTF-8 using the underlying byte streams where the
    # standard streams provide them, falling back to the streams themselves.

    s = codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin)).read()
    out = codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout))
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4