#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

from common import *
import re
import sys
import codecs

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text), with a type of None indicating plain wiki text
    outside any section.
    """

    regions = []
    pos = 0

    # Each section match yields two regions: the plain text preceding it and
    # the section itself.

    for found in sections_regexp.finditer(s):
        begin, finish = found.span()
        regions.append((None, s[pos:begin]))
        regions.append(get_section_details(s[begin:finish]))
        pos = finish

    # Plain text after the final section.

    regions.append((None, s[pos:]))
    return regions

# Section inspection.
section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    found = section_regexp.match(s)

    # Text that does not look like a section is passed through untyped.

    if not found:
        return None, s

    return (found.group("sectiontype"), found.group("options")), found.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where the type is None for unrecognised text.
    """

    blocks = []
    pos = 0

    for found in blockelement_regexp.finditer(s):
        begin, finish = found.span()

        # Classify the match by which named group participated.

        if found.group("listtype"):
            matchtype = "list"
        elif found.group("celltype"):
            matchtype = "table"
        else:
            matchtype = found.group("type")

        # Record the preceding untyped text, then the matched element. For
        # headings and blockquotes only the text payload is kept; lists and
        # tables keep the whole matched span.

        blocks.append((None, s[pos:begin]))
        blocks.append((matchtype, found.group("text") or s[begin:finish]))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    pieces = block_regexp.split(s)
    return [piece for piece in pieces if piece.strip()]

# Block inspection.
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate. Each block is a
    (type, text) tuple; anonymous paragraphs have a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks as they are.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions by splitting them on
        # blank lines.

        else:
            for block in get_basic_blocks(blocktext):
                blocks.append((None, block))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    items = []

    for match in listitem_regexp.finditer(text):
        items.append((match.group("marker"), match.group("text")))

    return items

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# Combined pattern matching any single inline content construct.

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

# Notation conversion.
# Escaped Confluence notation mapped to its Moin equivalent.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>> "),
    (r"\\ ", " "),
    ]

# Translation helpers.

# Confluence list markers mapped to Moin bullet/numbering markers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation. The marker
    length determines the indentation, preserving list nesting depth.
    """

    return " " * len(marker) + markers[marker[-1]]

# Confluence cell separators mapped to the Moin separator.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Extra emphasis wrapped around header ("||") cell contents.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text', parsing the cell contents as
    general wiki text and emboldening header cells.
    """

    return cellextra[cellsep] + parse_text(text) + cellextra[cellsep]

def translate_content_match(match):

    "Translate the content described by the given 'match', returning a string."

    # Monospaced text.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links of the form [label|target|title] with optional label and title.

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Emit the tidiest form when neither label nor title was given.
        # (This test must precede the label fallback below: previously the
        # label was assigned first, making this branch unreachable.)

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Make the link tidier by using the target as label if none was given.

        if not label:
            label = target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images of the form !target|options!.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Inline emphasis, translating nested content recursively.

    elif match.group("italictext"):
        return "''%s''" % translate_content(match.group("italictext"))

    elif match.group("boldtext"):
        return "'''%s'''" % translate_content(match.group("boldtext"))

    elif match.group("deltext"):
        return "--(%s)--" % translate_content(match.group("deltext"))

    elif match.group("underlinetext"):
        return "__%s__" % translate_content(match.group("underlinetext"))

    elif match.group("subtext"):
        return ",,%s,," % translate_content(match.group("subtext"))

    # Anything else is treated as plain text.

    else:
        return translate_text(match.group())

def translate_text(s):

    "Translate the plain text string 's', converting escaped notation."

    for before, after in notation_mapping:
        s = s.replace(before, after)
    return s

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(translate_text(text[last:start]))

        # Handle unformatted sections: markup is passed through verbatim.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(translate_text(text[last:]))
    return "".join(parts)

def translate_block(blocktype, blocktext):

    """
    Translate the block with the given 'blocktype' and 'blocktext', returning
    the Moin representation terminated by a newline.
    """

    parts = []

    # Translate headings and blockquotes.
    # NOTE(review): blocktypes is presumably defined by the common module
    # (star-imported above) - confirm against common.py.
    # (Membership test replaces dict.has_key, which was removed in Python 3.)

    if blocktype in blocktypes:
        parts.append(blocktypes[blocktype] % blocktext)

    # Translate list items.

    elif blocktype == "list":
        for listmarker, listitem in get_list_items(blocktext):
            parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem)))

    # Translate table items.

    elif blocktype == "table":
        parts.append("{{{#!table")
        first = True
        for cellsep, columns in get_table_rows(blocktext):

            # Rows after the first are separated by "==".

            if not first:
                parts.append("==")
            else:
                first = False
            moinsep = translate_cellsep(cellsep)
            parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns]))
        parts.append("}}}")

    # Handle anonymous blocks.

    else:
        parts.append(translate_content(blocktext).rstrip())

    return "\n".join(parts) + "\n"

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for row in text.split("|\n"):
        if not row:
            break

        # Restore the separator removed by the split above.

        row += "|"
        cellsep = None
        columns = [""]
        last = 0
        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            # A cell separator starts a new column; the first separator seen
            # determines the row's type (data "|" or header "||").

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Other inline content is kept verbatim for later translation.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        # Discard the empty fragments before the first and after the last
        # separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# Confluence section types mapped to Moin processor arguments; empty values
# produce plain verbatim regions.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# General parsing.

def parse_text(s):

    "Parse the content in the string 's', returning the translation."

    parts = []

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):
                parts.append("%s\n" % translate_block(blocktype, blocktext))

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                parts.append("{{{#!%s\n" % mointype)
                if options:
                    parts.append("## %s\n" % options)
            else:
                parts.append("{{{")
            parts.append(translate_content(text, sectiontype))
            parts.append("}}}\n")

    return "".join(parts)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    out.write(parse_text(s))

if __name__ == "__main__":
    s = sys.stdin.read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4