#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple
    of the form (type, text), where a type of None indicates plain wiki text
    outside any {section}...{section} construct.
    """

    regions = []
    pos = 0

    # Alternate between plain text preceding each section and the section
    # itself, always keeping the two kinds of region in document order.

    for match in sections_regexp.finditer(s):
        begin, finish = match.span()
        regions.append((None, s[pos:begin]))
        regions.append(get_section_details(s[begin:finish]))
        pos = finish

    # Any trailing text after the final section is also a region.

    regions.append((None, s[pos:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    match = section_regexp.match(s)

    # Sections which cannot be parsed are returned as untyped text.

    if not match:
        return None, s

    return (match.group("sectiontype"), match.group("options")), match.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where the type is "list", "table", a heading or
    blockquote type ("h1".."h6", "bq"), or None for unclassified text.
    """

    blocks = []
    pos = 0

    for match in blockelement_regexp.finditer(s):
        begin, finish = match.span()

        # Classify the match by whichever named group participated.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        # Keep the unclassified text preceding the match, then the match
        # itself (headings/blockquotes supply their own "text" group).

        blocks.append((None, s[pos:begin]))
        blocks.append((matchtype, match.group("text") or s[begin:finish]))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank
    lines and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            for block in get_basic_blocks(blocktext):
                blocks.append((None, block))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    items = []

    for match in listitem_regexp.finditer(text):
        items.append((match.group("marker"), match.group("text")))

    return items

# Table row inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def _translate_link(linktext):

    """
    Translate Confluence link text of the form "[label|]target[|title]",
    returning a MoinMoin link.
    """

    parts = linktext.split("|")

    # NOTE: Proper detection of external links required.
    # NOTE: More than three |-separated parts will raise ValueError here,
    # NOTE: as in the original implementation.

    if len(parts) == 1:
        label, target, title = None, parts[0], None
    elif len(parts) == 2:
        (label, target), title = parts, None
    else:
        label, target, title = parts

    target = target.strip()

    # Look for namespace links and rewrite them.

    if target.find(":") != -1:
        prefix = ""
        space, rest = target.split(":", 1)
        if space not in URL_SCHEMES:
            target = "%s/%s" % (space, rest)

    # Detect anchors.

    elif target.startswith("#"):
        prefix = ""

    # Detect attachments.

    elif target.startswith("^"):
        prefix = "attachment:"

    # Link to other pages within a space.

    else:
        prefix = "../"

    # Emit the plain form before defaulting the label: previously the label
    # was defaulted to the target first, which made this branch unreachable.

    if not label and not title:
        return "[[%s%s]]" % (prefix, target)

    # Make the link tidier by using the target if no label was given.

    if not label:
        label = target

    if not title:
        return "[[%s%s|%s]]" % (prefix, target, label)
    else:
        return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

def _translate_image(imagetext):

    """
    Translate Confluence image text of the form "filename[|options]",
    returning a MoinMoin transclusion.
    """

    parts = imagetext.split("|")

    # NOTE: Proper detection of external links required.

    if parts[0].startswith("http"):
        prefix = ""
    else:
        prefix = "attachment:"

    # NOTE: Proper options conversion required.

    if len(parts) == 1:
        return "{{%s%s}}" % (prefix, parts[0])
    else:
        return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

def translate_content_match(match):

    "Translate the content described by the given 'match', returning a string."

    # Monospaced text: {{text}} becomes {{{text}}}.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links: [label|target|title] in its various forms.

    elif match.group("linktext"):
        return _translate_link(match.group("linktext"))

    # Images and attachments: !imagetext!.

    elif match.group("imagetext"):
        return _translate_image(match.group("imagetext"))

    # Anything else (an empty group, for example) passes through unchanged.

    else:
        return match.group()

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        # Find cell separators, but treat monospace/link/image content as
        # opaque so that "|" characters inside it do not split cells.

        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            if match.group("celltype"):

                # The first separator on a row decides the row type.

                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Only lines containing separators are table rows; the sentinel
        # columns before the first and after the last separator are dropped.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        # Handle unformatted sections: content is passed through verbatim.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.
# Translation helpers.

# Confluence heading/blockquote type -> MoinMoin template.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# Confluence list marker character -> MoinMoin list marker.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # The nesting depth is given by the marker length; the final character
    # selects the Moin marker itself.

    return " " * len(marker) + markers[marker[-1]]

# Confluence cell separator -> MoinMoin cell separator.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Extra emphasis applied to cell contents ("||" introduces header cells).

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    extra = cellextra[cellsep]
    return extra + translate_content(text) + extra

# Confluence section type -> MoinMoin processing instruction suffix.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# XML dialect syntax parsing.

tags = {
    # XHTML tag              MoinMoin syntax
    "strong" : "'''%s'''",
    "em" : "''%s''",
    "u" : "__%s__",
    "del" : "--(%s)--",
    "sup" : "^%s^",
    "sub" : ",,%s,,",
    "code" : "`%s`",
    "pre" : "{{{%s}}}",
    "blockquote" : " %s",
    "small" : "~-%s-~",
    "big" : "~+%s+~",
    "p" : "\n%s\n",
    "ol" : "\n%s",
    "ul" : "\n%s",
    "ac:plain-text-body" : "{{{%s}}}",
    "ac:link" : "[[%s%s|%s]]",
    }

# Headings and blockquotes used as XHTML tags become their own paragraphs.

tags.update([(tag, "\n%s\n" % translation) for tag, translation in blocktypes.items()])

simple_tags = {
    # XHTML tag              MoinMoin syntax
    "br" : "<<BR>>",
    }

list_tags = {
    # XHTML list tag         MoinMoin list item syntax
    "ol" : "1. %s\n",
    "ul" : "* %s\n",
    }

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element     Attribute providing the target
    "ri:page" : "ri:content-title",
    "ri:attachment" : "ri:filename",
    "ri:user" : "ri:username",
    }

macro_rich_text_styles = {
    # Confluence style       MoinMoin admonition style
    "note" : "caution",
    "warning" : "warning",
    "info" : "important",
    "tip" : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

normalise_end_regexp_str = r"\s\s+$"
normalise_end_regexp = re.compile(normalise_end_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):
        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation depth and preformatted-region nesting counters.

        self.indent = 0
        self.states = {"pre" : 0, "ac:plain-text-body" : 0}

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        # Track list nesting and entry into preformatted regions before
        # delegating to the underlying parser.

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        # Delegate first, then unwind the nesting counters symmetrically.

        Parser.endElement(self, name)
        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

    def characters(self, content):

        # Whitespace is normalised except inside preformatted regions.

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        # Convert named XHTML entities to their character equivalents.

        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint:
            self.text[-1].append(unichr(codepoint))

    # Parser-related methods.
    def handleElement(self, name):

        """
        Convert the element 'name', whose collected child text sits on top of
        the text stack, into MoinMoin markup, appending the result to the
        parent element's text nodes (or writing it out at the top level).
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            elif self.target_type == "ri:user":
                prefix = ""
            else:
                prefix = "../"

            # The "ac:link" template takes prefix, target and label; the
            # target doubles as the label when no link text was collected.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            preceding = "".join(self.text[-2])

            # Collapse trailing whitespace on the parent's accumulated text
            # before appending, unless inside a preformatted region.

            if not self.is_preformatted():
                preceding = self.normalise_end(preceding, self.elements[-2])

            self.text[-2] = [preceding]
            self.text[-2].append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        # True if any preformatted-region counter is non-zero.
        # NOTE: reduce is a builtin in Python 2; operator is imported at the
        # NOTE: top of the module.

        return reduce(operator.or_, self.states.values(), False)

    def get_replacement(self, name, end=False):

        """
        Return the whitespace replacement appropriate for element 'name':
        nothing inside lists (a trailing newline with 'end' set), a paragraph
        break directly under the body, and a single space otherwise.
        """

        if list_tags.has_key(name):
            if end:
                return "\n"
            else:
                return ""
        elif name == "body":
            return "\n\n"
        else:
            return " "

    def normalise(self, text, name):

        "Collapse whitespace runs in 'text' according to element 'name'."

        return normalise_regexp.sub(self.get_replacement(name), text)

    def normalise_end(self, text, name):

        "Collapse trailing whitespace in 'text' according to element 'name'."

        return normalise_end_regexp.sub(self.get_replacement(name, True), text)

def xmlparse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    # Wrap the fragment in an XHTML document so that a standard XML parser
    # can process it; the "]] >" repair addresses the broken CDATA endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

# General parsing.

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

            # Blank line between regions.

            print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                print >>out, "{{{",

            # NOTE: The trailing commas rely on Python 2 print "softspace"
            # NOTE: behaviour: no newline is emitted, and a separating space
            # NOTE: may be written by the following print statement.

            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out

if __name__ == "__main__":
    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4