ConfluenceConverter (file wikiparser.py at 81aaba648b87)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then parsed.    32 """    33     34 from common import *    35 import re    36 import sys    37 import codecs    38     39 # Section extraction.    40     41 sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"    42 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)    43     44 def get_regions(s):    45     46     """    47     Return a list of regions from 's'. Each region is specified using a tuple of    48     the form (type, text).    49     """    50     51     last = 0    52     regions = []    53     for match in sections_regexp.finditer(s):    54         start, end = match.span()    55         regions.append((None, s[last:start]))    56         regions.append(get_section_details(s[start:end]))    57         last = end    58     regions.append((None, s[last:]))    59     return regions    60     61 # Section inspection.    62     63 section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"    64 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)    65     66 def get_section_details(s):    67     68     "Return the details of a section 's' in the form (type, text)."    69     70     match = section_regexp.match(s)    71     if match:    72         return (match.group("sectiontype"), match.group("options")), match.group("section")    73     else:    74         return None, s    75     76 # Heading, table and list extraction.    77     78 list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"    79 table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"    80 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"    81     82 blockelement_regexp = re.compile(    83     "(" + list_regexp_str + ")"    84     "|"    85     "(" + table_regexp_str + ")"    86     "|"    87     "(" + blocktext_regexp_str + ")",    88     re.MULTILINE    89     )    90     91 def get_block_elements(s):    92     93     """    94     Extract headings, tables and lists from the given string 's'.    95     """    96     97     last = 0    98     blocks = []    99     for match in blockelement_regexp.finditer(s):   100         start, end = match.span()   101         matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")   102         blocks.append((None, s[last:start]))   103         blocks.append((matchtype, match.group("text") or s[start:end]))   104         last = end   105     blocks.append((None, s[last:]))   106     return blocks   107    108 # Block extraction.   109    110 block_regexp_str = r"^(?:\s*\n)+"   111 block_regexp = re.compile(block_regexp_str, re.MULTILINE)   112    113 def get_basic_blocks(s):   114    115     """   116     Return blocks from the given string 's' by splitting the text on blank lines   117     and eliminating those lines.   118     """   119    120     return [b for b in block_regexp.split(s) if b.strip()]   121    122 # Block inspection.   123    124 def get_blocks(s):   125    126     """   127     Return blocks from the given string 's', inspecting the basic blocks and   128     generating additional block-level text where appropriate.   129     """   130    131     blocks = []   132    133     for blocktype, blocktext in get_block_elements(s):   134    135         # Collect heading, list and table blocks.   136    137         if blocktype is not None:   138             blocks.append((blocktype, blocktext))   139    140         # Attempt to find new subblocks in other regions.   141    142         else:   143             for block in get_basic_blocks(blocktext):   144                 blocks.append((None, block))   145    146     return blocks   147    148 # List item inspection.   149    150 listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"   151 listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)   152    153 def get_list_items(text):   154    155     "Return a list of (marker, text) tuples for the given list 'text'."   156    157     items = []   158    159     for match in listitem_regexp.finditer(text):   160         items.append((match.group("marker"), match.group("text")))   161    162     return items   163    164 # Content inspection.   165    166 monospace_regexp_str = r"{{(?P<monotext>.*?)}}"   167 link_regexp_str      = r"[[](?P<linktext>.*?)]"   168 image_regexp_str     = r"!(?P<imagetext>\w.*?)!"   169    170 # Word-dependent patterns.   171 # Here, the unbracketed markers must test for the absence of surrounding word   172 # characters.   173    174 italic_regexp_str    = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"   175 bold_regexp_str      = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"   176 del_regexp_str       = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"   177 underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"   178 sub_regexp_str       = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"   179    180 content_regexp_str = (   181     "(" + monospace_regexp_str + ")"   182     "|"   183     "(" + link_regexp_str + ")"   184     "|"   185     "(" + image_regexp_str + ")"   186     "|"   187     "(" + italic_regexp_str + ")"   188     "|"   189     "(" + bold_regexp_str + ")"   190     "|"   191     "(" + del_regexp_str + ")"   192     "|"   193     "(" + underline_regexp_str + ")"   194     "|"   195     "(" + sub_regexp_str + ")"   196     )   197    198 # Table row inspection.   199    200 cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"   201    202 table_content_regexp_str = (   203     content_regexp_str +   204     "|"   205     "(" + cellsep_regexp_str + ")"   206     )   207    208 content_regexp = re.compile(content_regexp_str)   209 table_content_regexp = re.compile(table_content_regexp_str)   210    211 def get_table_rows(text):   212    213     "Return a list of (cellsep, columns) tuples for the given table 'text'."   214    215     rows = []   216    217     for row in text.split("|\n"):   218         if not row:   219             break   220    221         row += "|"   222         cellsep = None   223         columns = [""]   224         last = 0   225         for match in table_content_regexp.finditer(row):   226             start, end = match.span()   227             columns[-1] += row[last:start]   228    229             if match.group("celltype"):   230                 if cellsep is None:   231                     cellsep = match.group("celltype")   232                 columns.append("")   233             else:   234                 columns[-1] += match.group()   235    236             last = end   237    238         columns[-1] += row[last:]   239    240         if cellsep:   241             rows.append((cellsep, columns[1:-1]))   242    243     return rows   244    245 # Notation conversion.   246    247 notation_mapping = [   248     (r"\!", "!"),   249     (r"\-", "-"),   250     (r"\\""\n", "<<BR>>"),   251     (r"\\ ", "<<BR>>"),   252     (r"\~", "~"),   253     ]   254    255 preformatted_notation_mapping = [   256     (r"\!", "!"),   257     (r"\-", "-"),   258     (r"\\""\n", "\n"),   259     (r"\\ ", "\n"),   260     (r"\~", "~"),   261     ]   262    263 # Translation helpers.   264    265 markers = {   266     "*" : "*",   267     "#" : "1.",   268     "-" : "*",   269     }   270    271 cellseps = {   272     "|" : "\n|| ",   273     "||" : "\n|| ",   274     }   275    276 cellextra = {   277     "|" : "",   278     "||" : "'''",   279     }   280    281 sectiontypes = {   282     "code"      : "",   283     "noformat"  : "",   284     "quote"     : "",   285     "info"      : "#!wiki important",   286     "note"      : "#!wiki caution",   287     "tip"       : "#!wiki tip",   288     "warning"   : "#!wiki warning",   289     }   290    291 preformatted_sectiontypes = (None, "noformat")   292    293 macrotypes = {   294     "anchor"    : "<<Anchor(%s)>>",   295     "color"     : "<<Color(%s)>>",   296     }   297    298 class ConfluenceParser:   299    300     "A parser for Confluence markup."   301    302     def __init__(self):   303         self.max_level = self.level = 0   304    305     def translate_marker(self, marker):   306    307         "Translate the given 'marker' to a suitable Moin representation."   308    309         return " " * len(marker) + markers[marker[-1]]   310    311     def translate_cellsep(self, cellsep):   312    313         "Translate the given 'cellsep' to a suitable Moin representation."   314    315         return cellseps[cellsep]   316    317     def translate_cell(self, cellsep, text):   318    319         "Using 'cellsep', translate the cell 'text'."   320    321         return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]   322    323     def translate_content_match(self, match):   324    325         "Translate the content described by the given 'match', returning a string."   326    327         if match.group("monotext"):   328             self.enter_section(); self.leave_section()   329             return "{{{%s}}}" % match.group("monotext")   330    331         elif match.group("linktext"):   332             parts = match.group("linktext").split("|")   333    334             # NOTE: Proper detection of external links required.   335    336             if len(parts) == 1:   337                 label, target, title = None, parts[0], None   338             elif len(parts) == 2:   339                 (label, target), title = parts, None   340             else:   341                 label, target, title = parts   342    343             target = target.strip()   344    345             # Look for namespace links and rewrite them.   346    347             if target.find(":") != -1:   348                 prefix = ""   349                 space, rest = target.split(":", 1)   350                 if space not in URL_SCHEMES:   351                     target = "%s/%s" % (space, rest)   352    353             # Detect anchors.   354    355             elif target.startswith("#"):   356                 prefix = ""   357    358             # Detect attachments.   359    360             elif target.startswith("^"):   361                 prefix = "attachment:"   362    363             # Link to other pages within a space.   364    365             else:   366                 prefix = "../"   367    368                 # Make the link tidier by making a target if none was given.   369    370                 if not label:   371                     label = target   372    373             if not label and not title:   374                 return "[[%s%s]]" % (prefix, target)   375             elif not title:   376                 return "[[%s%s|%s]]" % (prefix, target, label)   377             else:   378                 return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)   379    380         elif match.group("imagetext"):   381             parts = match.group("imagetext").split("|")   382    383             # NOTE: Proper detection of external links required.   384    385             if parts[0].startswith("http"):   386                 prefix = ""   387             else:   388                 prefix = "attachment:"   389    390             # NOTE: Proper options conversion required.   391    392             if len(parts) == 1:   393                 return "{{%s%s}}" % (prefix, parts[0])   394             else:   395                 return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])   396    397         elif match.group("italictext"):   398             return "''%s''" % self.translate_content(match.group("italictext"))   399    400         elif match.group("boldtext"):   401             return "'''%s'''" % self.translate_content(match.group("boldtext"))   402    403         elif match.group("deltext"):   404             return "--(%s)--" % self.translate_content(match.group("deltext"))   405    406         elif match.group("underlinetext"):   407             return "__%s__" % self.translate_content(match.group("underlinetext"))   408    409         elif match.group("subtext"):   410             return ",,%s,," % self.translate_content(match.group("subtext"))   411    412         else:   413             return self.translate_text(match.group())   414    415     def translate_text(self, s, preformatted=False):   416    417         "Translate the plain text string 's', converting notation."   418    419         for before, after in preformatted and preformatted_notation_mapping or notation_mapping:   420             s = s.replace(before, after)   421         return s   422    423     def translate_content(self, text, sectiontype=None):   424    425         """   426         Return a translation of the given 'text'. If the optional 'sectiontype' is   427         specified, the translation may be modified to a form appropriate to the   428         section being translated.   429         """   430    431         parts = []   432         preformatted = sectiontype in preformatted_sectiontypes   433    434         last = 0   435         for match in content_regexp.finditer(text):   436             start, end = match.span()   437             parts.append(self.translate_text(text[last:start], preformatted))   438    439             # Handle unformatted sections.   440    441             if sectiontype in ("code", "noformat"):   442                 parts.append(match.group())   443             else:   444                 parts.append(self.translate_content_match(match))   445    446             last = end   447    448         parts.append(self.translate_text(text[last:], preformatted))   449         return "".join(parts)   450    451     def translate_block(self, blocktype, blocktext):   452    453         "Translate the block with the given 'blocktype' and 'blocktext'."   454    455         parts = []   456    457         # Translate headings and blockquotes.   458    459         if blocktypes.has_key(blocktype):   460             parts.append(blocktypes[blocktype] % blocktext)   461    462         # Translate list items.   463    464         elif blocktype == "list":   465             for listmarker, listitem in get_list_items(blocktext):   466                 parts.append("%s %s" % (self.translate_marker(listmarker), self.translate_content(listitem)))   467    468         # Translate table items.   469    470         elif blocktype == "table":   471    472             # Enter the table.   473    474             self.enter_section()   475    476             table_parts = []   477             first = True   478    479             for cellsep, columns in get_table_rows(blocktext):   480                 if not first:   481                     table_parts.append("==")   482                 else:   483                     first = False   484                 moinsep = self.translate_cellsep(cellsep)   485                 table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))   486    487             # Nest the section appropriately.   488    489             opening, closing = self.nest_section()   490    491             parts.append("%s#!table" % opening)   492             parts += table_parts   493             parts.append(closing)   494    495             # Leave the table.   496    497             self.leave_section()   498    499         # Handle anonymous blocks.   500    501         else:   502             parts.append(self.translate_content(blocktext))   503    504         return "\n".join(parts)   505    506     def translate_section(self, sectiontype, options, text):   507    508         """   509         Translate the section with the given 'sectiontype', 'options' and   510         'text'.   511         """   512    513         parts = []   514    515         # Enter the section.   516    517         self.enter_section()   518    519         mointype = sectiontypes.get(sectiontype)   520         section_content = self.translate_content(text.strip(), sectiontype)   521    522         # Nest the section appropriately.   523    524         opening, closing = self.nest_section()   525    526         parts.append("%s%s\n" % (opening, mointype or ""))   527         if options:   528             parts.append("## %s\n" % options)   529         parts.append(section_content)   530         parts.append("\n%s\n" % closing)   531    532         # Leave the section.   533    534         self.leave_section()   535    536         return parts   537    538     def enter_section(self):   539         self.level += 1   540         self.max_level = max(self.level, self.max_level)   541    542     def leave_section(self):   543         self.level -= 1   544         if not self.level:   545             self.max_level = 0   546    547     def nest_section(self):   548         level = 3 + self.max_level - self.level   549         opening = "{" * level   550         closing = "}" * level   551         return opening, closing   552    553     # General parsing.   554    555     def parse_text(self, s):   556    557         "Parse the content in the string 's', returning the translation."   558    559         parts = []   560    561         # Control spacing between blocks and other blocks or sections.   562    563         preceded_by_block = False   564    565         for type, text in get_regions(s):   566    567             # Handle list, heading, blockquote or anonymous blocks.   568    569             if type is None:   570                 if preceded_by_block:   571                     parts.append("\n")   572    573                 first = True   574                 for blocktype, blocktext in get_blocks(text):   575                     if not first:   576                         parts.append("\n")   577                     else:   578                         first = False   579                     parts.append("%s" % self.translate_block(blocktype, blocktext))   580    581                 if not first:   582                     preceded_by_block = True   583    584             # Handle sections.   585    586             else:   587                 sectiontype, options = type   588    589                 # Direct translations of sections.   590    591                 if sectiontypes.has_key(sectiontype):   592                     if preceded_by_block:   593                         parts.append("\n")   594    595                     parts += self.translate_section(sectiontype, options, text)   596                     preceded_by_block = True   597    598                 # Translations of macros (which can look like sections).   599    600                 elif macrotypes.has_key(sectiontype):   601                     parts.append(macrotypes[sectiontype] % self.translate_content(text, sectiontype))   602                     preceded_by_block = False   603    604                 # Unrecognised sections.   605    606                 else:   607                     parts += self.translate_section(sectiontype, None, text)   608                     preceded_by_block = False   609    610         return "".join(parts)   611    612 def parse(s, out):   613    614     "Parse the content in the string 's', writing a translation to 'out'."   615    616     parser = ConfluenceParser()   617     out.write(parser.parse_text(s))   618    619 if __name__ == "__main__":   620     s = codecs.getreader("utf-8")(sys.stdin).read()   621     out = codecs.getwriter("utf-8")(sys.stdout)   622     parse(s, out)   623    624 # vim: tabstop=4 expandtab shiftwidth=4