# ConfluenceConverter (file xmlparser.py at dac27f1544a5)

#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from MoinMoin import wikiutil
from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

# Templates used to rewrite the text content of XHTML/Confluence elements as
# MoinMoin markup. Each "%s" receives the already converted element content;
# the link and image templates take (prefix, target, label) instead.

tags = {
    # XHTML tag               MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s|%s]]",
    "ac:image"              : "{{%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Block type templates provided by the common module are merged in, replacing
# any entry above that has the same tag name.

for tag, translation in blocktypes.items():
    tags[tag] = translation

# Empty elements replaced by fixed text outside preformatted regions.

simple_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# Empty elements replaced by fixed text inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "\n",
    }

# Templates applied to "li" elements according to the enclosing list element.

list_tags = {
    # XHTML list tag          MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags    = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags

# The key collections are converted to lists explicitly so that the
# concatenation does not depend on keys() returning a list.

block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element      Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details  Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.
link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

macroargs = {
    # Confluence macro        Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro        MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    Element content is converted to MoinMoin markup as each element is
    completed, and the result for the outermost element is written to the
    'out' object supplied on construction.

    NOTE(review): the 'elements', 'attributes' and 'text' stacks used
    throughout are assumed to be maintained by the Parser base class, which is
    not visible in this file.
    """

    def __init__(self, out):

        """
        Initialise the parser with 'out', an object whose write method receives
        the converted text.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the opening of the
        element with the given 'name' and 'attrs' before and after delegating
        to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Undo the state changes made by startElement for the element with the
        given 'name', delegating to the base class in between.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Once the outermost region has been left, the depth record is
            # discarded so that the next region starts afresh.

            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

    def characters(self, content):

        """
        Record the character data 'content', collapsing whitespace first unless
        a preformatted element is currently open.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Entities without a known code point are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.

        Converted text is appended to the text nodes of the enclosing element
        or, where there is no enclosing element, written to the output object.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name given by the enclosing
        # element's attributes.

        elif name == "ac:default-parameter":
            self.macro_parameters[self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: a tag nested within itself passes its
        # content through without adding further markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately: the deeper the current element is
            # relative to the deepest region seen, the fewer braces it gets.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:

                    # The title ends up inside a format template, so literal
                    # percent signs are escaped to prevent them from being
                    # interpreted as conversion specifiers later on.

                    details = "%s\n\n%s" % (details, title.replace("%", "%%"))

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if "\n" in text and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")
            text = conversion % (prefix, anchor or self.target, self.label or text.strip() or self.target)
            self.target = self.target_type = self.label = None

        elif name == "a":
            text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro" and not self.forbids_macros():
            conversion = macrotypes.get(self.macro)
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters)
                argnames = macroargs.get(self.macro)
                if argnames:
                    confargname, moinargname = argnames

                    # NOTE(review): a macro lacking the expected Confluence
                    # parameter causes a KeyError here.

                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[confargname]))
                text = conversion % parameters

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            # A pending block state is carried through non-block elements.

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any currently open element is a heading or a
        conventional link, in which case macros are not converted.
        """

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace found directly inside the
        element with the given 'name': nothing for purely structural elements,
        a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    # The fragment is wrapped in a complete XHTML document before parsing, and
    # "]] >" sequences are rewritten as proper CDATA terminators.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Convert UTF-8 encoded content from standard input to standard output.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4