ConfluenceConverter (file xmlparser.py at b716feccdeba)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    39     40 tags = {    41     # XHTML tag               MoinMoin syntax    42     "strong"                : "'''%s'''",    43     "em"                    : "''%s''",    44     "u"                     : "__%s__",    45     "del"                   : "--(%s)--",    46     "sup"                   : "^%s^",    47     "sub"                   : ",,%s,,",    48     "code"                  : "`%s`",    49     "tbody"                 : "%s",    50     "tr"                    : "%s",    51     "th"                    : "'''%s'''",    52     "td"                    : "%s",    53     "blockquote"            : " %s",    54     "small"                 : "~-%s-~",    55     "big"                   : "~+%s+~",    56     "p"                     : "%s",    57     "ol"                    : "%s",    58     "ul"                    : "%s",    59     "ac:link"               : "[[%s%s|%s]]",    60     "ac:image"              : "{{%s%s|%s}}",    61     "a"                     : "[[%s|%s]]",    62     }    63     64 for tag, translation in blocktypes.items():    65     tags[tag] = translation    66     67 simple_tags = {    68     # XHTML tag               MoinMoin syntax    69     "br"                    : "<<BR>>",    70     }    71     72 simple_preformatted_tags = {    73     # XHTML tag               MoinMoin syntax    74     "br"                    : "\n",    75     }    76     77 list_tags = {    78     # XHTML list tag          MoinMoin list item syntax    79     "ol"                    : "1. %s",    80     "ul"                    : "* %s",    81     }    82     83 preformatted_tags = ["pre", "ac:plain-text-body"]    84 single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]    85 formatted_tags    = ["ac:rich-text-body", "table"]    86     87 indented_tags = ["li", "p"] + preformatted_tags + formatted_tags    88 block_tags = indented_tags + blocktypes.keys() + list_tags.keys()    89 span_override_tags = ["ac:link"]    90     91 link_target_tags = {    92     # Confluence element      Attributes providing the target    93     "ri:page"               : ("ri:space-key", "ri:content-title"),    94     "ri:attachment"         : ("ri:filename",),    95     "ri:user"               : ("ri:username",),    96     }    97     98 link_target_prefixes = {    99     # Attribute with details  Prefix ensuring correct relative link   100     "ri:space-key"          : "..",   101     "ri:content-title"      : "..",   102     }   103    104 link_label_attributes = "ri:content-title", "ac:link-body"   105    106 # NOTE: User links should support the intended user namespace prefix.   107    108 link_target_types = {   109     # Confluence element      MoinMoin link prefix   110     "ri:attachment"         : "attachment:",   111     "ri:user"               : "",   112     "ac:link-body"          : "#",   113     }   114    115 macro_rich_text_styles = {   116     # Confluence style        MoinMoin admonition style   117     "note"                  : "caution",   118     "warning"               : "warning",   119     "info"                  : "important",   120     "tip"                   : "tip",   121     }   122    123 normalise_regexp_str = r"\s+"   124 normalise_regexp = re.compile(normalise_regexp_str)   125    126 class ConfluenceXMLParser(Parser):   127    128     "Handle content from Confluence 4 page revisions."   129    130     def __init__(self, out):   131         Parser.__init__(self)   132         self.out = out   133    134         # Link target and label information.   135    136         self.target = None   137         self.target_type = None   138         self.label = None   139    140         # Macro information.   141    142         self.macro = None   143         self.macro_parameters = {}   144    145         # Indentation and element nesting states.   146    147         self.indents = [0]   148         self.states = {}   149         self.max_level = self.level = 0   150    151         for name in preformatted_tags + single_level_tags:   152             self.states[name] = 0   153    154         # Table states.   155    156         self.table_rows = 0   157         self.table_columns = 0   158    159         # Block states.   160    161         self.have_block = False   162    163     # ContentHandler-related methods.   164    165     def startElement(self, name, attrs):   166    167         # Track indentation for lists.   168    169         if list_tags.has_key(name):   170             self.indents.append(self.indents[-1] + 1)   171    172         # Track element nesting.   173    174         if self.states.has_key(name):   175             self.states[name] += 1   176    177         # Track cumulative element nesting in order to produce appropriate depth   178         # indicators in the formatted output.   179    180         if name in preformatted_tags or name in formatted_tags:   181             self.level += 1   182             self.max_level = max(self.level, self.max_level)   183    184             # Reset indentation within regions.   185    186             self.indents.append(0)   187    188         Parser.startElement(self, name, attrs)   189    190         # Remember macro information for use within the element.   191    192         if name == "ac:macro":   193             self.macro = self.attributes[-1].get("ac:name")   194    195     def endElement(self, name):   196    197         # Reset the indent for any preformatted/formatted region so that it may   198         # itself be indented.   199    200         if name in preformatted_tags or name in formatted_tags:   201             self.indents.pop()   202    203         Parser.endElement(self, name)   204    205         if list_tags.has_key(name):   206             self.indents.pop()   207    208         if self.states.has_key(name):   209             self.states[name] -= 1   210    211         if name in preformatted_tags or name in formatted_tags:   212             self.level -= 1   213             if not self.level:   214                 self.max_level = 0   215    216         # Discard macro state.   217    218         if name == "ac:macro":   219             self.macro = None   220             self.macro_parameters = {}   221    222     def characters(self, content):   223         if not self.is_preformatted():   224             content = self.normalise(content, self.elements[-1])   225         Parser.characters(self, content)   226    227     def skippedEntity(self, name):   228         ch = htmlentitydefs.name2codepoint.get(name)   229         if ch:   230             self.text[-1].append(unichr(ch))   231    232     # Parser-related methods.   233    234     def handleElement(self, name):   235    236         """   237         Handle the completion of the element with the given 'name'. Any content   238         will either be recorded for later use (by an enclosing element, for   239         example) or emitted in some form.   240         """   241    242         text = u"".join(self.text[-1])   243    244         # Handle state.   245    246         if name == "table":   247             self.table_rows = 0   248         elif name == "tr":   249             self.table_columns = 0   250    251         # Find conversions.   252    253         conversion = None   254    255         # Handle list elements.   256    257         if name == "li" and len(self.elements) > 1:   258             list_tag = self.elements[-2]   259             conversion = list_tags.get(list_tag)   260    261         # Remember link target information.   262    263         elif link_target_tags.has_key(name):   264             target_details = []   265    266             # Get target details from the element's attributes.   267    268             for attrname in link_target_tags[name]:   269                 attrvalue = self.attributes[-1].get(attrname)   270                 if attrvalue:   271                     target_details.append(attrvalue)   272                     prefix = link_target_prefixes.get(attrname)   273                     if prefix:   274                         target_details.insert(0, prefix)   275                     if attrname in link_label_attributes and not self.label:   276                         self.label = attrvalue   277    278             # Make a link based on the details.   279    280             self.target = u"/".join(target_details)   281             self.target_type = name   282             text = ""   283    284         # For anchor links, just use the raw text and let Moin do the formatting.   285    286         elif name == "ac:link-body":   287             if not self.target_type:   288                 self.target_type = name   289             self.label = text.strip()   290             text = ""   291    292         # For conventional links, remember the href attribute as the target.   293    294         elif name == "a":   295             self.target = self.attributes[-1].get("href")   296             self.label = text.strip()   297             text = ""   298    299         # Remember macro information.   300    301         elif name == "ac:parameter":   302             self.macro_parameters[self.attributes[-1].get("ac:name")] = text   303             text = ""   304    305         elif name == "ac:default-parameter":   306             self.macro_parameters[self.attributes[-2].get("ac:name")] = text   307             text = ""   308    309         # Handle single-level tags.   310    311         elif name in single_level_tags and self.states[name] > 1:   312             conversion = "%s"   313    314         # Handle preformatted sections.   315    316         elif name in preformatted_tags or name in formatted_tags:   317    318             # Nest the section appropriately.   319    320             level = 3 + self.max_level - self.level   321             opening = "{" * level   322             closing = "}" * level   323    324             # Macro name information is used to style rich text body regions.   325    326             if name != "table" and self.macro and macro_rich_text_styles.has_key(self.macro):   327                 details = macro_rich_text_styles[self.macro]   328                 title = self.macro_parameters.get("title")   329                 if title:   330                     details = "%s\n\n%s" % (details, title)   331    332                 conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)   333    334             elif name == "table":   335                 conversion = "%s#!table\n%%s\n%s" % (opening, closing)   336    337             else:   338                 # Preformatted sections containing newlines must contain an initial   339                 # newline.   340    341                 if text.find("\n") != -1 and not text.startswith("\n"):   342                     opening += "\n"   343    344                 conversion = "%s%%s%s" % (opening, closing)   345    346         # Handle the common case and simpler special cases.   347    348         if not conversion:   349             conversion = tags.get(name)   350    351    352    353         # Attempt to convert the text.   354    355         # Links require target information.   356    357         if name in ("ac:link", "ac:image"):   358             prefix = link_target_types.get(self.target_type, "")   359             anchor = self.attributes[-1].get("ac:anchor")   360             text = conversion % (prefix, anchor or self.target, self.label or text.strip() or self.target)   361             self.target = self.target_type = self.label = None   362    363         elif name == "a":   364             text = conversion % (self.target, self.label or self.target)   365             self.target = self.target_type = self.label = None   366    367         # Macros require various kinds of information.   368    369         elif name == "ac:macro":   370             macro_name = self.attributes[-1]["ac:name"]   371    372         # Handle the common cases for parameterised and unparameterised   373         # substitutions.   374    375         elif text and conversion:   376             text = conversion % text   377         elif simple_tags.has_key(name) and not self.is_preformatted():   378             text = simple_tags[name]   379         elif simple_preformatted_tags.has_key(name) and self.is_preformatted():   380             text = simple_preformatted_tags[name]   381    382    383    384         # Postprocess table columns and rows.   385    386         if name in ("th", "td"):   387             if self.table_columns:   388                 text = "\n|| %s" % text   389             self.table_columns += 1   390         elif name == "tr":   391             if self.table_rows:   392                 text = "\n==\n%s" % text   393             self.table_rows += 1   394    395    396    397         # Normalise leading whitespace and indent the text if appropriate.   398    399         if name in indented_tags:   400             text = " " * self.indents[-1] + text.lstrip()   401    402         # Add the converted text to the end of the parent element's text nodes.   403    404         if len(self.text) > 1:   405             nodes = self.text[-2]   406             parent = self.elements[-2]   407    408             # Where preceding text exists, add any blank line separators.   409    410             if u"".join(nodes):   411    412                 # All top-level elements are separated with blank lines.   413    414                 if parent == "body":   415                     nodes.append("\n")   416    417                 # Block elements always cause a new line to be started.   418    419                 if name in block_tags or self.have_block and name not in span_override_tags:   420                     nodes.append("\n")   421    422                 self.have_block = False   423    424             # Lists inside lists require separation.   425    426             elif list_tags.has_key(name) and parent == "li":   427                 nodes.append("\n")   428    429             # Without preceding text, save any block node state for non-block   430             # elements so that newline separators can be added at another   431             # level.   432    433             elif name in block_tags and parent not in block_tags:   434                 self.have_block = True   435    436             elif name not in block_tags and self.have_block and name not in span_override_tags:   437                 self.have_block = True   438    439             else:   440                 self.have_block = False   441    442             nodes.append(text)   443    444         # Otherwise, emit the text (at the top level of the document).   445    446         else:   447             self.out.write(text)   448    449     def is_preformatted(self):   450         return reduce(operator.or_, [self.states[tag] for tag in preformatted_tags], False)   451    452     # Whitespace normalisation.   453    454     def get_replacement(self, name):   455         if name in ("html", "body", "table", "tbody", "tr") or list_tags.has_key(name):   456             return ""   457         else:   458             return " "   459    460     def normalise(self, text, name):   461         return normalise_regexp.sub(self.get_replacement(name), text)   462    463 def parse(s, out):   464    465     "Parse the content in the string 's', writing a translation to 'out'."   466    467     # NOTE: CDATA sections appear to have erroneous endings.   468    469     s = u"""\   470 <?xml version="1.0"?>   471 <!DOCTYPE html    472      PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"   473      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">   474 <html xmlns="http://www.w3.org/1999/xhtml">   475 <body>   476 %s   477 </body>   478 </html>""" % s.replace("]] >", "]]>")   479    480     f = StringIO(s.encode("utf-8"))   481     try:   482         parser = ConfluenceXMLParser(out)   483         parser.parse(f)   484     finally:   485         f.close()   486    487 if __name__ == "__main__":   488     s = codecs.getreader("utf-8")(sys.stdin).read()   489     out = codecs.getwriter("utf-8")(sys.stdout)   490     parse(s, out)   491    492 # vim: tabstop=4 expandtab shiftwidth=4