ConfluenceConverter (file xmlparser.py at a79c3559e45c)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    
# Format strings used to render the text collected for an XHTML/Confluence
# element as MoinMoin markup. The link and image entries take four values
# (prefix, target, anchor fragment, label); "a" takes two (target, label).

tags = {
    "strong": "'''%s'''",
    "em": "''%s''",
    "u": "__%s__",
    "del": "--(%s)--",
    "sup": "^%s^",
    "sub": ",,%s,,",
    "code": "`%s`",
    "tbody": "%s",
    "tr": "%s",
    "th": "'''%s'''",
    "td": "%s",
    "blockquote": " %s",
    "small": "~-%s-~",
    "big": "~+%s+~",
    "p": "%s",
    "ol": "%s",
    "ul": "%s",
    "ac:link": "[[%s%s%s|%s]]",
    "ac:image": "{{%s%s%s|%s}}",
    "a": "[[%s|%s]]",
    }

# Merge in the block-level translations. 'blocktypes' is not defined in this
# file (presumably provided by "from common import *"); its entries replace
# any entry above that has the same tag name.

tags.update(blocktypes)

# Elements without content that become a fixed piece of markup.

simple_tags = {
    "br": "<<BR>>",
    }

# The same elements when they occur inside a preformatted region.

simple_preformatted_tags = {
    "br": "\n",
    }

# List item syntax, selected by the kind of list enclosing the item.

list_tags = {
    "ol": "1. %s",
    "ul": "* %s",
    }

# Groups of element names consulted by the parser.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

# Resource elements mapped to the attributes that supply the link target.

link_target_tags = {
    "ri:page": ("ri:space-key", "ri:content-title"),
    "ri:attachment": ("ri:filename",),
    "ri:user": ("ri:username",),
    }

# Attributes whose presence adds a ".." component to the front of the target
# so that the resulting link is relative.

link_target_prefixes = {
    "ri:space-key": "..",
    "ri:content-title": "..",
    }

link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.
link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

macroargs = {
    # Confluence macro        Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro        MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    The translated MoinMoin markup is written to the 'out' object given to
    the initialiser.

    NOTE(review): this class relies on the Parser base class (imported from
    xmlread, not shown here) providing the 'elements', 'attributes' and 'text'
    stacks used below and calling handleElement when an element is complete.
    """

    def __init__(self, out):

        "Initialise the parser to write its translation to 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the element with
        the given 'name' and 'attrs' as it is opened.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors are only held for the heading currently being processed.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Complete the element with the given 'name', restoring the state that
        was changed when the element was opened.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Leaving the outermost region: start counting depth afresh.

            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

    def characters(self, content):

        """
        Record the given text 'content', normalising its whitespace unless it
        appears inside a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Entities with unknown names are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)
                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # A default parameter is stored under the name of the enclosing
        # element (the second-to-last entry on the attribute stack).

        elif name == "ac:default-parameter":
            self.macro_parameters[self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences of the same tag are
        # passed through without repeating the markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately: the deepest region gets three
            # braces and each enclosing region gets one more.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""

            # Links without a resource element (anchor-only links) have no
            # target: use an empty target rather than formatting None into
            # the output as the text "None".

            target = self.target or ""
            label = self.label or text.strip() or target or anchor
            fragment = ("#%s" % anchor) if anchor else ""
            text = conversion % (prefix, target, fragment, label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Elements without a usable href cannot form a link: emit only
            # their text instead of a link to a page called "None".

            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or ""
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macro)
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters)
                argnames = macroargs.get(self.macro)
                if argnames:
                    confargname, moinargname = argnames
                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[confargname]))
                text = conversion % parameters

                # Anchors cannot be emitted where macros are forbidden: hold
                # them back so that they can be emitted before the heading.

                if self.macro == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or self.have_block and name not in span_override_tags:
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any currently open element is a heading or an "a"
        element, these being the contexts in which anchor macros are held back.
        """

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing runs of whitespace found directly inside
        the element with the given 'name': nothing for structural containers
        and lists, a single space for everything else.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the enclosing element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in a minimal XHTML document before being parsed.
    """

    # The template is spelled out with adjacent literals so that the space
    # following "<!DOCTYPE html" is visible and cannot be stripped by accident.

    template = (
        u'<?xml version="1.0"?>\n'
        u'<!DOCTYPE html \n'
        u'     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n'
        u'     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
        u'<html xmlns="http://www.w3.org/1999/xhtml">\n'
        u'<body>\n'
        u'%s\n'
        u'</body>\n'
        u'</html>'
        )

    # NOTE: CDATA sections appear to have erroneous endings.

    document = template % s.replace("]] >", "]]>")

    # An explicit try/finally is used because cStringIO objects do not support
    # the with statement.

    f = StringIO(document.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Translate UTF-8 encoded content from standard input to standard output.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4