ConfluenceConverter (file xmlparser.py at a95675d52731)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    
# Translation tables mapping XHTML/Confluence element names to MoinMoin
# syntax. Templates containing "%s" are filled in with the converted content
# of the element concerned.

tags = {
    # XHTML tag               MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Merge in the block type translations provided by the common module. Using
# update avoids leaving loop variables behind in the module namespace.

tags.update(blocktypes)

# Elements without content that are replaced by fixed text outside
# preformatted regions...

simple_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# ...and inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "\n",
    }

# Templates applied to list items according to the enclosing list element.

list_tags = {
    # XHTML list tag          MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Element classifications used when tracking nesting and producing layout.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags    = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags

# The key collections are converted to lists explicitly so that the
# concatenation does not depend on keys() returning a list.

block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

# Elements describing link targets, with the attributes whose values are
# combined to make the target.

link_target_tags = {
    # Confluence element      Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details  Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

# Names that may provide a link label when no label has been recorded yet.

link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.
# Prefixes applied to link targets according to the kind of element that
# provided the target.

link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

macroargs = {
    # Confluence macro        Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro        MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "toc"                   : "<<TableOfContents>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    Converted text for each element is appended to the text nodes of its
    parent element; text produced when no parent text list exists is written
    to the 'out' stream given on construction.
    """

    def __init__(self, out):

        "Initialise the parser, writing converted content to 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the nesting, indentation and macro state for the element with
        the given 'name' and 'attrs' before and after delegating to the base
        parser.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Restore the nesting, indentation and macro state upon leaving the
        element with the given 'name'.
        """

        is_region = name in preformatted_tags or name in formatted_tags

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if is_region:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if is_region:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless
        inside a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)
                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name == "ac:link-body":
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information. Parameters are only recorded when macro
        # state exists; the parameter text is never emitted as content.

        elif name == "ac:parameter":
            if self.macro_parameters:
                self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the ac:name attribute of the
        # enclosing element.

        elif name == "ac:default-parameter":
            if self.macro_parameters:
                self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string, so any percent
                # characters in a title must be escaped.

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (
                    opening, details.replace("%", "%%"), closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information. A missing target is treated as an
        # empty string so that "None" never appears in the output.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""
            if anchor:
                anchor = "#%s" % anchor
            target = self.target or ""
            label = self.label or text.strip() or target
            text = conversion % (prefix, target, anchor, label)
            self.target = self.target_type = self.label = None

        # Conventional links without a href have no target to link to, so only
        # their label is retained.

        elif name == "a":
            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or ""
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            macro = self.macros[-1]
            conversion = macrotypes.get(macro)
            if conversion:
                supplied = self.macro_parameters[-1]
                parameters = {"content" : text}
                parameters.update(supplied)
                argnames = macroargs.get(macro)
                if argnames:
                    confargname, moinargname = argnames
                    if confargname in supplied:
                        parameters["args"] = quote_macro_argument(
                            "%s=%s" % (moinargname, supplied[confargname]))

                # Where a parameter needed by the macro syntax was not
                # supplied, leave the macro content unconverted.

                try:
                    converted = conversion % parameters
                except KeyError:
                    converted = None

                if converted is not None:
                    text = converted
                    if macro == "anchor" and self.forbids_macros():
                        self.held_anchors.append(text)
                        text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        "Return whether a heading or 'a' element is among the open elements."

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace found directly within the
        element with the given 'name': nothing for the document, table and list
        container elements tested here, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced according to the
        element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # The content is wrapped in a minimal XHTML document before being parsed.
    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4