ConfluenceConverter (file wikiparser.py at f9771c857a29)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then parsed.    32 """    33     34 from common import *    35 import re    36 import sys    37     38 # Section extraction.    39     40 sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"    41 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)    42     43 def get_regions(s):    44     45     """    46     Return a list of regions from 's'. Each region is specified using a tuple of    47     the form (type, text).    
48     """    49     50     last = 0    51     regions = []    52     for match in sections_regexp.finditer(s):    53         start, end = match.span()    54         regions.append((None, s[last:start]))    55         regions.append(get_section_details(s[start:end]))    56         last = end    57     regions.append((None, s[last:]))    58     return regions    59     60 # Section inspection.    61     62 section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"    63 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)    64     65 def get_section_details(s):    66     67     "Return the details of a section 's' in the form (type, text)."    68     69     match = section_regexp.match(s)    70     if match:    71         return (match.group("sectiontype"), match.group("options")), match.group("section")    72     else:    73         return None, s    74     75 # Heading, table and list extraction.    76     77 list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(\n\s*(?P=listtype).*?)*(?:\n|$)"    78 table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"    79 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"    80     81 blockelement_regexp = re.compile(    82     "(" + list_regexp_str + ")"    83     "|"    84     "(" + table_regexp_str + ")"    85     "|"    86     "(" + blocktext_regexp_str + ")",    87     re.MULTILINE    88     )    89     90 def get_block_elements(s):    91     92     """    93     Extract headings, tables and lists from the given string 's'.    
94     """    95     96     last = 0    97     blocks = []    98     for match in blockelement_regexp.finditer(s):    99         start, end = match.span()   100         matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")   101         blocks.append((None, s[last:start]))   102         blocks.append((matchtype, match.group("text") or s[start:end]))   103         last = end   104     blocks.append((None, s[last:]))   105     return blocks   106    107 # Block extraction.   108    109 block_regexp_str = r"^(?:\s*\n)+"   110 block_regexp = re.compile(block_regexp_str, re.MULTILINE)   111    112 def get_basic_blocks(s):   113    114     """   115     Return blocks from the given string 's' by splitting the text on blank lines   116     and eliminating those lines.   117     """   118    119     return [b for b in block_regexp.split(s) if b.strip()]   120    121 # Block inspection.   122    123 def get_blocks(s):   124    125     """   126     Return blocks from the given string 's', inspecting the basic blocks and   127     generating additional block-level text where appropriate.   128     """   129    130     blocks = []   131    132     for blocktype, blocktext in get_block_elements(s):   133    134         # Collect heading, list and table blocks.   135    136         if blocktype is not None:   137             blocks.append((blocktype, blocktext))   138    139         # Attempt to find new subblocks in other regions.   140    141         else:   142             for block in get_basic_blocks(blocktext):   143                 blocks.append((None, block))   144    145     return blocks   146    147 # List item inspection.   148    149 listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"   150 listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)   151    152 def get_list_items(text):   153    154     "Return a list of (marker, text) tuples for the given list 'text'."   
155    156     items = []   157    158     for match in listitem_regexp.finditer(text):   159         items.append((match.group("marker"), match.group("text")))   160    161     return items   162    163 # Content inspection.   164    165 monospace_regexp_str = r"{{(?P<monotext>.*?)}}"   166 link_regexp_str      = r"[[](?P<linktext>.*?)]"   167 image_regexp_str     = r"!(?P<imagetext>\w.*?)!"   168    169 # Word-dependent patterns.   170 # Here, the unbracketed markers must test for the absence of surrounding word   171 # characters.   172    173 italic_regexp_str    = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"   174 bold_regexp_str      = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"   175 del_regexp_str       = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"   176 underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"   177 sub_regexp_str       = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"   178    179 content_regexp_str = (   180     "(" + monospace_regexp_str + ")"   181     "|"   182     "(" + link_regexp_str + ")"   183     "|"   184     "(" + image_regexp_str + ")"   185     "|"   186     "(" + italic_regexp_str + ")"   187     "|"   188     "(" + bold_regexp_str + ")"   189     "|"   190     "(" + del_regexp_str + ")"   191     "|"   192     "(" + underline_regexp_str + ")"   193     "|"   194     "(" + sub_regexp_str + ")"   195     )   196    197 # Table row inspection.   198    199 cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"   200    201 table_content_regexp_str = (   202     content_regexp_str +   203     "|"   204     "(" + cellsep_regexp_str + ")"   205     )   206    207 content_regexp = re.compile(content_regexp_str)   208 table_content_regexp = re.compile(table_content_regexp_str)   209    210 # Notation conversion.   

# Pairs of (Confluence notation, replacement text), applied in the order given
# by translate_text.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>> "),
    (r"\\ ", " "),
    ]

# Translation helpers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation: one space
    of indentation for each character in 'marker' followed by the Moin marker
    corresponding to the final character of 'marker'.

    A KeyError is raised if the final character is not a known list marker.
    """

    return " " * len(marker) + markers[marker[-1]]

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Markup placed on both sides of a cell's content, selected by the separator.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text'. The text is parsed as complete
    wiki content and wrapped in the extra markup associated with 'cellsep'.
    """

    extra = cellextra[cellsep]
    return extra + parse_text(text) + extra

def _translate_link(linktext):

    "Translate the Confluence link body 'linktext' into a Moin link."

    # Split into at most three fields so that surplus "|" characters end up in
    # the title instead of causing an unpacking error.

    parts = linktext.split("|", 2)

    # NOTE: Proper detection of external links required.

    if len(parts) == 1:
        label, target, title = None, parts[0], None
    elif len(parts) == 2:
        (label, target), title = parts, None
    else:
        label, target, title = parts

    target = target.strip()

    # Look for namespace links and rewrite them.
    # NOTE(review): URL_SCHEMES is not defined in this file; presumably it is
    # provided by "from common import *" - confirm.

    if ":" in target:
        prefix = ""
        space, rest = target.split(":", 1)
        if space not in URL_SCHEMES:
            target = "%s/%s" % (space, rest)

    # Detect anchors.

    elif target.startswith("#"):
        prefix = ""

    # Detect attachments.

    elif target.startswith("^"):
        prefix = "attachment:"

    # Link to other pages within a space.

    else:
        prefix = "../"

        # Make the link tidier by using the target as the label if no label
        # was given.

        if not label:
            label = target

    if not label and not title:
        return "[[%s%s]]" % (prefix, target)
    elif not title:
        return "[[%s%s|%s]]" % (prefix, target, label)
    else:
        return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

def _translate_image(imagetext):

    "Translate the Confluence image body 'imagetext' into a Moin embedding."

    parts = imagetext.split("|")

    # NOTE: Proper detection of external links required.

    if parts[0].startswith("http"):
        prefix = ""
    else:
        prefix = "attachment:"

    # NOTE: Proper options conversion required.

    if len(parts) == 1:
        return "{{%s%s}}" % (prefix, parts[0])
    else:
        return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    The 'match' is inspected for the named groups of the inline content
    patterns in turn (monotext, linktext, imagetext, italictext, boldtext,
    deltext, underlinetext, subtext). If none of them holds any text, the whole
    match is treated as plain text.
    """

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    elif match.group("linktext"):
        return _translate_link(match.group("linktext"))

    elif match.group("imagetext"):
        return _translate_image(match.group("imagetext"))

    # Styled text may itself contain further inline markup.

    elif match.group("italictext"):
        return "''%s''" % translate_content(match.group("italictext"))

    elif match.group("boldtext"):
        return "'''%s'''" % translate_content(match.group("boldtext"))

    elif match.group("deltext"):
        return "--(%s)--" % translate_content(match.group("deltext"))

    elif match.group("underlinetext"):
        return "__%s__" % translate_content(match.group("underlinetext"))

    elif match.group("subtext"):
        return ",,%s,," % translate_content(match.group("subtext"))

    else:
        return translate_text(match.group())

def translate_text(s):

    """
    Translate the plain text string 's', converting notation by applying each
    replacement in notation_mapping in order.
    """

    for before, after in notation_mapping:
        s = s.replace(before, after)
    return s

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated: within "code" and "noformat" sections, matched
    inline markup is copied through without being translated.
    """

    unformatted = sectiontype in ("code", "noformat")
    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()

        # Text between matches only has its notation converted.

        parts.append(translate_text(text[last:start]))

        # Handle unformatted sections.

        if unformatted:
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(translate_text(text[last:]))
    return "".join(parts)

def translate_block(blocktype, blocktext):

    """
    Translate the block with the given 'blocktype' and 'blocktext', returning
    the translated text terminated by a newline.
    """

    parts = []

    # Translate headings and blockquotes.
    # NOTE(review): blocktypes is not defined in this file; presumably it is
    # provided by "from common import *" - confirm.

    if blocktype in blocktypes:
        parts.append(blocktypes[blocktype] % blocktext)

    # Translate list items.

    elif blocktype == "list":
        for listmarker, listitem in get_list_items(blocktext):
            parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem)))

    # Translate table items, separating consecutive rows with "==".

    elif blocktype == "table":
        parts.append("{{{#!table")
        for rownumber, (cellsep, columns) in enumerate(get_table_rows(blocktext)):
            if rownumber:
                parts.append("==")
            moinsep = translate_cellsep(cellsep)
            parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns]))
        parts.append("}}}")

    # Handle anonymous blocks.

    else:
        parts.append(translate_content(blocktext).rstrip())

    return "\n".join(parts) + "\n"

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text',
    where 'cellsep' is the first cell separator found in a row and 'columns' is
    the list of cell texts found between the row's separators.
    """

    rows = []

    for row in text.split("|\n"):

        # An empty row ends the table.

        if not row:
            break

        # Restore the separator consumed by the split above.

        row += "|"
        cellsep = None
        columns = [""]
        last = 0
        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            # A separator starts a new column; the first one seen determines
            # the row's cell type.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Inline markup is retained verbatim so that separators inside it
            # do not split the cell.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        # Discard the text before the first separator and after the last.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# Moin parser arguments for each section type; section types with an empty
# entry, and unknown section types, are emitted as plain "{{{" regions.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# General parsing.

def parse_text(s):

    "Parse the content in the string 's', returning the translation."

    parts = []

    for region, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if region is None:
            for blocktype, blocktext in get_blocks(text):
                parts.append("%s\n" % translate_block(blocktype, blocktext))

        # Handle sections.

        else:
            sectiontype, options = region

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                parts.append("{{{#!%s\n" % mointype)
                if options:
                    parts.append("## %s\n" % options)
            else:
                parts.append("{{{")
            parts.append(translate_content(text, sectiontype))
            parts.append("}}}\n")

    return "".join(parts)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    out.write(parse_text(s))

if __name__ == "__main__":
    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4