#!/usr/bin/env python
# ConfluenceConverter (file parser.py at 702a040785d7)

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

import re

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

# A section is "{type}...{type}" or "{type:options}...{type}"; the lookbehind
# keeps the "{{monospace}}" notation from being mistaken for a section.
sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).

    Text outside any section is reported with a type of None. Sections are
    reported as produced by 'get_section_details', meaning that their type is a
    (sectiontype, options) tuple. Regions appear in document order and the
    text between, before and after sections is always included, even when it is
    empty.
    """

    last = 0
    regions = []

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        regions.append((None, s[last:start]))
        regions.append(get_section_details(match.group()))
        last = end

    regions.append((None, s[last:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    When 's' has the form of a section, the type is a (sectiontype, options)
    tuple, with options being None if absent, and the text is the section body
    without its delimiters. Otherwise, (None, s) is returned.
    """

    match = section_regexp.match(s)
    if match is None:
        return None, s

    return (match.group("sectiontype"), match.group("options")), match.group("section")

# Heading, table and list extraction.

# The first list line may be terminated by a newline or by the end of the text,
# so that a single-item list ending the input is still recognised.
list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(?:\n|$)(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "|".join([
        "(" + list_regexp_str + ")",
        "(" + table_regexp_str + ")",
        "(" + blocktext_regexp_str + ")",
        ]),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.

    Return a list of (type, text) tuples in document order. Recognised elements
    have a type of "list", "table" or the matched heading/blockquote prefix
    (such as "h1" or "bq"); the text between, before and after them is given a
    type of None. For headings and blockquotes, the text is the content after
    the prefix; for lists and tables, it is the complete matched text.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))

        # Only the heading/blockquote pattern defines "text"; lists and tables
        # (and headings with no captured text) use the whole match.

        blocks.append((matchtype, match.group("text") or match.group()))
        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines. Blocks consisting only of whitespace are
    discarded.
    """

    return [block for block in block_regexp.split(s) if block.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples: headings, lists and tables are
    passed through from 'get_block_elements', and the remaining text is split
    into anonymous blocks having a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Table row inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
# An escaped bracket is used instead of "[[]" since the latter provokes a
# "possible nested set" FutureWarning from newer versions of the re module.
link_regexp_str = r"\[(?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

# Inline content is matched before cell separators so that a "|" inside a link
# or image is not treated as a column boundary.
table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def _translate_link(linktext):

    "Return the Moin link notation for the Confluence link 'linktext'."

    # Confluence links are "target", "label|target" or "label|target|title".
    # Splitting at most twice keeps any further "|" within the title instead of
    # causing the unpacking below to fail.

    parts = linktext.split("|", 2)

    label = title = None
    if len(parts) == 1:
        target = parts[0]
    elif len(parts) == 2:
        label, target = parts
    else:
        label, target, title = parts

    # NOTE: Proper detection of external links required.

    if ":" in target:
        prefix = ""
        space, rest = target.split(":", 1)

        # Anything before the colon that is not a URL scheme is treated as a
        # space name and becomes a leading path component.

        if space not in URL_SCHEMES:
            target = "%s/%s" % (space, rest)
    elif target.startswith("#"):
        prefix = ""
    elif target.startswith("^"):
        # NOTE(review): the leading "^" is retained in the target - confirm
        # NOTE(review): whether it should be removed for attachment links.
        prefix = "attachment:"
    else:
        prefix = "../"

    if label is None:
        return "[[%s%s]]" % (prefix, target)
    elif title is None:
        return "[[%s%s|%s]]" % (prefix, target, label)
    else:
        return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

def _translate_image(imagetext):

    "Return the Moin embedding notation for the Confluence image 'imagetext'."

    parts = imagetext.split("|")

    # NOTE: Proper detection of external links required.

    if parts[0].startswith("http"):
        prefix = ""
    else:
        prefix = "attachment:"

    # NOTE: Proper options conversion required.

    if len(parts) == 1:
        return "{{%s%s}}" % (prefix, parts[0])
    else:
        return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    The 'match' is expected to originate from 'content_regexp' or
    'table_content_regexp'. Monospaced text, links and images are converted to
    Moin notation; a match with empty content (or any other kind of match) is
    returned unchanged.
    """

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    elif match.group("linktext"):
        return _translate_link(match.group("linktext"))

    elif match.group("imagetext"):
        return _translate_image(match.group("imagetext"))

    else:
        return match.group()

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Each line of 'text' yields one row, with cellsep being the first separator
    found on the line ("|" or "||") and columns being the untranslated text
    found between separators. Lines without any separator are ignored.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            # A separator starts a new column; the first one seen defines the
            # kind of row.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Inline content is retained verbatim for later translation.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Discard the text before the first and after the last separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.

    For "code" and "noformat" sections, the text is reproduced without any
    translation being applied.
    """

    # Handle unformatted sections.

    verbatim = sectiontype in ("code", "noformat")

    parts = []
    last = 0

    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])
        parts.append(match.group() if verbatim else translate_content_match(match))
        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.
# Moin templates for headings and blockquotes, indexed by Confluence prefix.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# Moin list markers, indexed by Confluence list marker character.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation.

    The result is indented by one space for each character in 'marker', and the
    last character of 'marker' selects the kind of Moin list item produced.
    """

    return " " * len(marker) + markers[marker[-1]]

# Moin cell separators, indexed by Confluence cell separator.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Markup placed around cell content: "||" cells are emphasised using '''.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    extra = cellextra[cellsep]
    return extra + translate_content(text) + extra

# Moin parser/format names, indexed by Confluence section type. Section types
# with an empty value (and unknown section types) produce a plain {{{...}}}
# region.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# General parsing.

def _needs_separator(text):

    """
    Return whether a space separates 'text' from whatever follows it.

    This reproduces the behaviour of the Python 2 "print text," form, which
    omits the separating space only where the text written ends with a
    whitespace character other than a space.
    """

    return not text or text[-1] == " " or not text[-1].isspace()

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The 'out' object only needs to provide a 'write' method accepting strings.
    Output is produced using that method directly, instead of the Python 2
    print statement, so that this function works unchanged with both Python 2
    and Python 3 while producing the same text.
    """

    write = out.write

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktype in blocktypes:
                    write(blocktypes[blocktype] % blocktext + "\n")

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        write("%s %s\n" % (translate_marker(listmarker), translate_content(listitem)))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        cells = [translate_cell(cellsep, column) for column in columns]
                        write(moinsep + moinsep.join(cells) + moinsep + "\n")

                # Handle anonymous blocks.

                else:
                    write(translate_content(blocktext.rstrip()) + "\n")

                # Separate blocks using a blank line.

                write("\n")

        # Handle sections.

        else:
            sectiontype, options = regiontype

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                write("{{{#!%s\n" % mointype)
                if options:
                    write("## %s\n" % options)
            else:
                # The opening marker is followed by a space, not a newline.
                write("{{{ ")

            body = translate_content(text, sectiontype)
            write(body)
            if _needs_separator(body):
                write(" ")
            write("}}}\n")
            write("\n")

if __name__ == "__main__":
    import sys

    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4