ConfluenceConverter (file wikiparser.py at fa3c16972921)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then parsed.    32 """    33     34 from common import *    35 import re    36 import sys    37 import codecs    38     39 # Section extraction.    40     41 sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}"    42 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)    43     44 def get_regions(s):    45     46     """    47     Return a list of regions from 's'. Each region is specified using a tuple of    48     the form (type, text).    
49     """    50     51     last = 0    52     regions = []    53     depth = 0    54     55     for match in sections_regexp.finditer(s):    56         start, end = match.span()    57         is_start = match.group("options")    58     59         # The start of a region is either indicated by a marker with options or    60         # by a marker where no region is currently active.    61     62         if is_start or not depth:    63     64             # Where no region is active, add the text since the last match as a    65             # "null" region.    66     67             if not depth:    68                 regions.append((None, s[last:start]))    69     70                 # A new region is maintained as a string.    71     72                 regions.append(s[start:end])    73     74             # Where a region is active, add the text since the last match as    75             # well as the text in this match to the region.    76     77             else:    78                 regions[-1] += s[last:end]    79     80             # Any start marker will cause an increase in the depth of the region    81             # nesting.    82     83             depth += 1    84     85         # The end of a region is indicated by a marker with no options.    86     87         else:    88             # Where no region is active, the text since the last match plus the    89             # marker are added to the current "null" region.    90     91             if not depth:    92     93                 # Add to the string portion of the "null" region.    94     95                 regions[-1][1] += s[last:end]    96     97             # Where a region is active, the end marker and preceding text is    98             # either incorporated into the current region if more than one    99             # region is active, or the preceding text is incorporated into the   100             # current region and the details of the region are then obtained.   
101    102             else:   103                 if depth > 1:   104                     regions[-1] += s[last:end]   105    106                 # Terminate the active region, interpreting its contents.   107    108                 else:   109                     regions[-1] = get_section_details(regions[-1] + s[last:end])   110                 depth -= 1   111    112         last = end   113    114     # Where a region is still active, terminate it.   115    116     if depth:   117         regions[-1] = get_section_details(regions[-1] + s[last:])   118    119     # Otherwise, add a "null" region.   120    121     else:   122         regions.append((None, s[last:]))   123    124     return regions   125    126 # Section inspection.   127    128 section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"   129 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)   130    131 def get_section_details(s):   132    133     "Return the details of a section 's' in the form (type, text)."   134    135     match = section_regexp.match(s)   136     if match:   137         return (match.group("sectiontype"), match.group("options")), match.group("section")   138     else:   139         return None, s   140    141 # Heading, table and list extraction.   142    143 list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"   144 table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"   145 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"   146    147 blockelement_regexp = re.compile(   148     "(" + list_regexp_str + ")"   149     "|"   150     "(" + table_regexp_str + ")"   151     "|"   152     "(" + blocktext_regexp_str + ")",   153     re.MULTILINE   154     )   155    156 def get_block_elements(s):   157    158     """   159     Extract headings, tables and lists from the given string 's'.   
160     """   161    162     last = 0   163     blocks = []   164     for match in blockelement_regexp.finditer(s):   165         start, end = match.span()   166         matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")   167         blocks.append((None, s[last:start]))   168         blocks.append((matchtype, match.group("text") or s[start:end]))   169         last = end   170     blocks.append((None, s[last:]))   171     return blocks   172    173 # Block extraction.   174    175 block_regexp_str = r"^(?:\s*\n)+"   176 block_regexp = re.compile(block_regexp_str, re.MULTILINE)   177    178 def get_basic_blocks(s):   179    180     """   181     Return blocks from the given string 's' by splitting the text on blank lines   182     and eliminating those lines.   183     """   184    185     return [b for b in block_regexp.split(s) if b.strip()]   186    187 # Block inspection.   188    189 def get_blocks(s):   190    191     """   192     Return blocks from the given string 's', inspecting the basic blocks and   193     generating additional block-level text where appropriate.   194     """   195    196     blocks = []   197    198     for blocktype, blocktext in get_block_elements(s):   199    200         # Collect heading, list and table blocks.   201    202         if blocktype is not None:   203             blocks.append((blocktype, blocktext))   204    205         # Attempt to find new subblocks in other regions.   206    207         else:   208             for block in get_basic_blocks(blocktext):   209                 blocks.append((None, block))   210    211     return blocks   212    213 # List item inspection.   214    215 listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"   216 listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)   217    218 def get_list_items(text):   219    220     "Return a list of (marker, text) tuples for the given list 'text'."   
221    222     items = []   223    224     for match in listitem_regexp.finditer(text):   225         items.append((match.group("marker"), match.group("text")))   226    227     return items   228    229 # Content inspection.   230    231 monospace_regexp_str = r"{{(?P<monotext>.*?)}}"   232 link_regexp_str      = r"[[](?P<linktext>.*?)]"   233 image_regexp_str     = r"!(?P<imagetext>\w.*?)!"   234 macro_regexp_str     = r"{(?P<macro>.*?):(?P<options>.*?)}"   235    236 # Word-dependent patterns.   237 # Here, the unbracketed markers must test for the absence of surrounding word   238 # characters.   239    240 italic_regexp_str    = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"   241 bold_regexp_str      = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"   242 del_regexp_str       = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"   243 underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"   244 sub_regexp_str       = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"   245    246 content_regexp_str = (   247     "(" + monospace_regexp_str + ")"   248     "|"   249     "(" + link_regexp_str + ")"   250     "|"   251     "(" + image_regexp_str + ")"   252     "|"   253     "(" + macro_regexp_str + ")"   254     "|"   255     "(" + italic_regexp_str + ")"   256     "|"   257     "(" + bold_regexp_str + ")"   258     "|"   259     "(" + del_regexp_str + ")"   260     "|"   261     "(" + underline_regexp_str + ")"   262     "|"   263     "(" + sub_regexp_str + ")"   264     )   265    266 # Table row inspection.   

# A cell separator: "|" for ordinary cells, "||" for heading cells.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Table content is inline content or, failing that, a cell separator.

table_content_regexp_str = "%s|(%s)" % (content_regexp_str, cellsep_regexp_str)

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    The separator reported for a row is the first one found in it, and rows
    without any separator are omitted. Separator characters occurring inside
    inline content (for example, within a link) do not split cells.
    """

    rows = []

    for line in text.split("|\n"):

        # An empty piece ends the inspection of the table.

        if line == "":
            break

        # Restore the closing separator removed by the split above.

        line = line + "|"

        separator = None
        cells = [""]
        position = 0

        for found in table_content_regexp.finditer(line):
            begin, finish = found.span()
            celltype = found.group("celltype")

            # Plain text before the match always belongs to the current cell.

            cells[-1] += line[position:begin]

            # A separator closes the current cell and opens another.

            if celltype:
                if separator is None:
                    separator = celltype
                cells.append("")

            # Inline content is kept verbatim within the current cell.

            else:
                cells[-1] += found.group()

            position = finish

        cells[-1] += line[position:]

        # Discard the text before the first separator and after the last.

        if separator:
            rows.append((separator, cells[1:-1]))

    return rows

# Notation conversion.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

# Moin list markers for the final symbol of a Confluence list marker.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Moin text used to join the cells of a row, for each Confluence separator.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Moin markup wrapped around each cell's text ("||" cells are made bold).

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Moin section headers for the directly supported Confluence section types.

sectiontypes = {
    "code"      : "",
    "noformat"  : "",
    "quote"     : "",
    "info"      : "#!wiki important",
    "note"      : "#!wiki caution",
    "tip"       : "#!wiki tip",
    "warning"   : "#!wiki warning",
    }

# Section types whose text uses preformatted_notation_mapping. Since None is
# included, content translated without a section type is also handled this way.

preformatted_sectiontypes = (None, "noformat")

# Argument names prepended (as "name=") to the options of particular macros.

macroargs = {
    "color"     : "col",
    }

# Moin macro templates for the supported Confluence macros.

macrotypes = {
    "anchor"    : "<<Anchor(%(args)s)>>",
    "color"     : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    """
    A parser for Confluence markup.

    NOTE: 'headings', 'blocktypes', 'URL_SCHEMES' and 'quote_macro_argument'
    NOTE: are not defined in this module and are presumably provided by the
    NOTE: 'from common import *' statement.
    """

    def __init__(self):

        # Current and deepest section nesting levels (see nest_section).

        self.max_level = self.level = 0

        # Whether a heading is being translated, and the anchor macros which
        # were withheld from the heading's own text.

        self.in_heading = False
        self.held_anchors = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # Indent by the length of the marker, then use the Moin equivalent of
        # the marker's last symbol.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        extra = cellextra[cellsep]
        return extra + self.parse_text(text).strip() + extra

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        if match.group("monotext"):

            # Account for the section opened and closed by the monospaced text.

            self.enter_section()
            self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):
            return self._translate_link(match.group("linktext"))

        elif match.group("imagetext"):
            return self._translate_image(match.group("imagetext"))

        elif match.group("macro"):
            return self._translate_macro(match.group("macro"), match.group("options"))

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        # Matches with empty content are translated as plain text.

        else:
            return self.translate_text(match.group())

    def _translate_link(self, linktext):

        "Translate the 'linktext' found between link brackets to a Moin link."

        # Interpret at most three fields, since additional "|" characters
        # would otherwise make the unpacking below fail.

        parts = linktext.split("|")[:3]

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()
        prefix = ""

        # Look for namespace links and rewrite them.

        if ":" in target:
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors, which need no prefix.

        elif target.startswith("#"):
            pass

        # Detect attachments.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

            # Make the link tidier by using the target as the label if none
            # was given.

            if not label:
                label = target

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)
        elif not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    def _translate_image(self, imagetext):

        "Translate the 'imagetext' found between image markers to a Moin image."

        parts = imagetext.split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    def _translate_macro(self, macro_name, options):

        """
        Translate the inline macro having the given 'macro_name' and 'options',
        returning an empty string for macros which cannot be represented.
        """

        if macro_name not in macrotypes:
            return ""

        argname = macroargs.get(macro_name)
        if argname:
            args = "%s=%s" % (argname, options)
        else:
            args = options

        quoted_args = quote_macro_argument(args)

        # An inline macro has no content, so templates requiring content (such
        # as the one for "color") cannot be filled in: such macros are dropped
        # like unrecognised ones instead of raising KeyError.

        try:
            result = macrotypes[macro_name] % {"args" : quoted_args}
        except KeyError:
            return ""

        if not self.forbids_macros():
            return result

        # Anchors in headings are held back for translate_block to emit.

        if macro_name == "anchor":
            self.held_anchors.append(result)

        return ""

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. Where
        'preformatted' is set to a true value, the notation mapping for
        preformatted text is used.
        """

        if preformatted:
            mapping = preformatted_notation_mapping
        else:
            mapping = notation_mapping

        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text, sectiontype=None):

        """
        Return a translation of the given 'text'. If the optional 'sectiontype' is
        specified, the translation may be modified to a form appropriate to the
        section being translated.
        """

        parts = []
        preformatted = sectiontype in preformatted_sectiontypes

        # Inline markup is left untouched in unformatted sections.

        unformatted = sectiontype in ("code", "noformat")

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            if unformatted:
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        is_heading = blocktype in headings

        if is_heading:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            text = self.translate_content(blocktext)

            # Emit any anchors held back from the heading text before the
            # heading itself, then forget them so that later blocks (such as
            # blockquotes) do not emit them again.

            parts += self.held_anchors
            self.held_anchors = []

            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.translate_content(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []

            for cellsep, columns in get_table_rows(blocktext):

                # Separate each row from the previous one.

                if table_parts:
                    table_parts.append("==")

                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately. This is done after the cells
            # have been translated since they may contain nested sections.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.translate_content(blocktext))

        if is_heading:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of strings providing the translation.
        """

        parts = []

        # Enter the section.

        self.enter_section()

        mointype = sectiontypes.get(sectiontype)
        section_content = self.translate_content(text.strip(), sectiontype)

        # Nest the section appropriately.

        opening, closing = self.nest_section()

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self):

        "Record the entry into a section, noting the deepest level reached."

        self.level += 1
        self.max_level = max(self.level, self.max_level)

    def leave_section(self):

        "Record the exit from a section, resetting the depth at the top level."

        self.level -= 1
        if not self.level:
            self.max_level = 0

    def nest_section(self):

        """
        Return an (opening, closing) tuple of brace sequences for the current
        section. At least three braces are used, with one more for each level
        of nesting recorded beneath the current level.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s):

        "Parse the content in the string 's', returning the translation."

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for regiontype, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if regiontype is None:
                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append(self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = regiontype

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros (which can look like sections).

                elif sectiontype in macrotypes and not self.forbids_macros():
                    argname = macroargs.get(sectiontype)

                    # The options are None where the opening marker had none.

                    args = options or ""
                    if argname:
                        args = "%s=%s" % (argname, args)

                    parts.append(macrotypes[sectiontype] % {
                        "content"   : quote_macro_argument(self.parse_text(text)),
                        "args"      : quote_macro_argument(args)
                        })
                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        "Return whether macros must not be emitted (as is the case in headings)."

        return self.in_heading

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s))

if __name__ == "__main__":

    # Use the underlying byte streams where the standard streams are text
    # streams (Python 3), falling back to the streams themselves otherwise.

    s = codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin)).read()
    out = codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout))
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4