#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then parsed.
"""

# Prefer the faster C implementation of StringIO where available
# (Python 2 only; both fall back gracefully).

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs

# Schemes treated as external URLs; anything else before a colon in a link
# target is interpreted as a wiki namespace prefix and rewritten.

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

# Matches a Confluence section such as {code:language=java}...{code}:
# an opening {type} or {type:options} tag (not preceded by another brace),
# lazily followed by the section body, then the matching closing {type} tag.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text). Plain (non-section) text is returned with a type of
    None; section text is returned with the (sectiontype, options) details
    produced by get_section_details.
    """

    last = 0
    regions = []
    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Keep the plain text preceding the section, then the section itself.

        regions.append((None, s[last:start]))
        regions.append(get_section_details(s[start:end]))
        last = end

    # Plain text after the final section (or the whole of 's' if no section
    # was found).

    regions.append((None, s[last:]))
    return regions

# Section inspection.

# Re-parses a single extracted section, separating the type, the optional
# colon-delimited options and the enclosed body text.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text), where type is
    itself a (sectiontype, options) tuple. Where 's' does not look like a
    section after all, (None, s) is returned.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

# A list: consecutive lines introduced by the same kind of marker (*, # or -).

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"

# A table: consecutive lines of |cell| or ||heading|| rows.

table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"

# A heading (h1. to h6.) or blockquote (bq.) line.

blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where type is "list", "table", a heading or
    blockquote type (such as "h1" or "bq"), or None for the plain text between
    such elements.
    """

    last = 0
    blocks = []
    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one of the named groups can have matched; inspect them in turn
        # to classify the element.

        matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")
        blocks.append((None, s[last:start]))

        # Headings and blockquotes provide their text in the "text" group;
        # lists and tables use the whole matched region.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end
    blocks.append((None, s[last:]))
    return blocks

# Block extraction.
# One or more blank (or whitespace-only) lines acting as a block separator.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split 's' on runs of blank lines and return the resulting blocks,
    discarding any pieces that contain only whitespace.
    """

    pieces = block_regexp.split(s)
    return [piece for piece in pieces if piece.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return (type, text) blocks from the given string 's': recognised block
    elements (headings, lists, tables) are kept intact, while the plain
    regions between them are further divided into basic blocks.
    """

    result = []

    for kind, chunk in get_block_elements(s):

        # Plain regions may contain several paragraphs: split them up.

        if kind is None:
            for basic in get_basic_blocks(chunk):
                result.append((None, basic))

        # Headings, lists and tables pass through unchanged.

        else:
            result.append((kind, chunk))

    return result

# List item inspection.

# A single list item: its marker (possibly nested, e.g. "**") and its text.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(m.group("marker"), m.group("text"))
            for m in listitem_regexp.finditer(text)]

# Table row inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
# A table cell separator: | for data cells, || for heading cells.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content of interest: monospace spans, links and images.

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

# Inside tables, cell separators must also be recognised so that inline
# markup containing "|" does not prematurely end a cell.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string
    employing MoinMoin syntax. Monospace, link and image constructs are
    handled; any other match is returned unchanged.
    """

    # Monospace: {{text}} becomes {{{text}}}.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links: [target], [label|target] or [label|target|title].

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.
        # NOTE(review): the leading "^" is kept in the target - confirm
        # whether Moin expects it to be stripped.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Emit the tidiest link first: with neither label nor title, a plain
        # target suffices. (Previously the label was defaulted to the target
        # before this test, making this branch unreachable and producing the
        # redundant [[target|target]] form.)

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Make the link tidier by making a label if none was given.

        if not label:
            label = target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images: !target! or !target|options!.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Anything else passes through untranslated.

    else:
        return match.group()

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.
    Each line is scanned for cell separators and inline markup; markup
    containing "|" (such as links) is kept within a single cell. Lines
    without any separator are ignored.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0
        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            # A separator starts a new column; the first one seen determines
            # the row's cell type (data or heading).

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Inline markup is appended verbatim to the current column.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Drop the first and last entries: the text outside the leading and
        # trailing separators of a well-formed row.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated: inside "code" and "noformat" sections, inline
    markup is left untranslated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        # Handle unformatted sections.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.
# Confluence heading/blockquote types mapped to Moin markup templates.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# Confluence list item markers mapped to Moin markers: "#" produces an
# ordered list, "*" and "-" both produce bullet points.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation. The
    nesting depth is conveyed by indentation equal to the marker's length,
    and the final marker character determines the Moin list style.
    """

    return " " * len(marker) + markers[marker[-1]]

# Cell separators: Moin uses "||" for all cells, so heading cells ("||" in
# Confluence) are distinguished by bold cell content instead (see cellextra).

cellseps = {
    "|" : "||",
    "||" : "||",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text', wrapping heading cells in
    bold markup.
    """

    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]

# Confluence section types mapped to Moin processing instruction names; an
# empty string means a plain {{{...}}} block with no "#!" line.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# XML dialect syntax parsing.

tags = {
    # XHTML tag           MoinMoin syntax
    "strong"              : "'''%s'''",
    "em"                  : "''%s''",
    "u"                   : "__%s__",
    "del"                 : "--(%s)--",
    "sup"                 : "^%s^",
    "sub"                 : ",,%s,,",
    "code"                : "`%s`",
    "pre"                 : "{{{%s}}}",
    "blockquote"          : " %s",
    "small"               : "~-%s-~",
    "big"                 : "~+%s+~",
    "p"                   : "%s",
    "ol"                  : "%s",
    "ul"                  : "%s",
    "ac:plain-text-body"  : "{{{%s}}}",
    # Takes (prefix, target, label) - see handleElement.
    "ac:link"             : "[[%s%s|%s]]",
    }

# Heading and blockquote conversions also apply to the XML dialect.

for tag, translation in blocktypes.items():
    tags[tag] = translation

simple_tags = {
    # XHTML tag           MoinMoin syntax
    "br"                  : "<<BR>>",
    }

list_tags = {
    # XHTML list tag      MoinMoin list item syntax
    "ol"                  : "1. %s",
    "ul"                  : "* %s",
    }

# Elements whose text is indented according to the current list depth.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element  Attribute providing the target
    "ri:page"             : "ri:content-title",
    "ri:attachment"       : "ri:filename",
    "ri:user"             : "ri:username",
    }

macro_rich_text_styles = {
    # Confluence style    MoinMoin admonition style
    "note"                : "caution",
    "warning"             : "warning",
    "info"                : "important",
    "tip"                 : "tip",
    }

# Collapses runs of whitespace during text normalisation.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions, translating the XHTML
    dialect to Moin syntax written to the given output stream.
    """

    def __init__(self, out):

        "Initialise the parser to write translated output to 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and preformatted states.

        # indent tracks the current list nesting depth; states counts open
        # preformatted elements (non-zero means normalisation is suppressed).

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        "Track list depth and preformatted state on entering 'name'."

        if list_tags.has_key(name):
            self.indent += 1
        elif self.states.has_key(name):
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        "Track list depth and preformatted state on leaving 'name'."

        Parser.endElement(self, name)
        if list_tags.has_key(name):
            self.indent -= 1
        elif self.states.has_key(name):
            self.states[name] -= 1

    def characters(self, content):

        "Record 'content', normalising whitespace outside preformatted text."

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        "Replace a skipped HTML entity 'name' with its Unicode character."

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the completed element 'name' and its accumulated text to Moin
        syntax, appending the result to the parent element's text nodes or,
        at the top level, writing it to the output stream.
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        # List items are converted using the template of their enclosing
        # list element (ol or ul).

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            elif self.target_type == "ri:user":
                prefix = ""
            else:
                prefix = "../"

            # Fall back to the target itself when no label text was given.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Where the parent already has text, insert a suitable separator
            # first: paragraph breaks in the body, line breaks in lists.

            if "".join(self.text[-2]):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif list_tags.has_key(parent):
                    nodes.append("\n")
                elif list_tags.has_key(name) and parent == "li":
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return reduce(operator.or_, self.states.values(), False)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the whitespace replacement for text within element 'name':
        structural elements discard whitespace entirely, others collapse it
        to a single space.
        """

        if name in ("html", "body") or list_tags.has_key(name):
            return ""
        else:
            return " "

    def normalise(self, text, name):

        "Normalise whitespace in 'text' according to the enclosing 'name'."

        return normalise_regexp.sub(self.get_replacement(name), text)

def xmlparse(s, out):

    """
    Parse the XML dialect content in the string 's', writing a translation to
    'out'. The content is wrapped in an XHTML document so that a single,
    well-formed document is presented to the parser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
    PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

# General parsing.

def parse(s, out):

    "Parse the wiki syntax content in the string 's', writing a translation to 'out'."

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

            print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                print >>out, "{{{",
            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out

if __name__ == "__main__":
    s = sys.stdin.read()
    if "--xml" in sys.argv:
        xmlparse(s, sys.stdout)
    else:
        parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4