ConfluenceConverter (file xmlparser.py at d9f722364ba2)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    
# Templates used to wrap the text of an element in MoinMoin markup. Most take
# a single value (the element text); the link and image templates take three.

tags = {
    # Inline styling.
    "strong": "'''%s'''",
    "em": "''%s''",
    "u": "__%s__",
    "del": "--(%s)--",
    "sup": "^%s^",
    "sub": ",,%s,,",
    "code": "`%s`",
    "small": "~-%s-~",
    "big": "~+%s+~",

    # Table structure.
    "tbody": "%s",
    "tr": "%s",
    "th": "'''%s'''",
    "td": "%s",

    # Block-level content.
    "blockquote": " %s",
    "p": "%s",
    "ol": "%s",
    "ul": "%s",

    # Confluence links and images.
    "ac:link": "[[%s%s|%s]]",
    "ac:image": "{{%s%s|%s}}",
}

# Merge in the block type translations, letting them override any entry
# defined above for the same tag.

tags.update(blocktypes.items())

# Elements replaced by fixed MoinMoin markup regardless of their content.

simple_tags = {
    "br": "<<BR>>",
}

# List elements mapped to the template applied to each of their items.

list_tags = {
    "ol": "1. %s",
    "ul": "* %s",
}

# Elements whose converted text is indented according to the list depth.

indented_tags = ["li", "p"]

# Element classifications used when tracking nesting.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

# Confluence resource elements mapped to the attribute naming the target.

link_target_tags = {
    "ri:page": "ri:content-title",
    "ri:attachment": "ri:filename",
    "ri:user": "ri:username",
}

# NOTE: User links should support the intended user namespace prefix.
link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

# Runs of whitespace are collapsed when normalising non-preformatted text.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions, converting the XML/XHTML
    markup to MoinMoin syntax and writing the result to an output stream.

    NOTE(review): this class relies on 'self.text', 'self.elements' and
    'self.attributes' being per-element stacks maintained by the Parser base
    class, and on Parser.endElement invoking handleElement - confirm against
    the xmlread module.
    """

    def __init__(self, out):

        "Initialise the parser to write converted text to the 'out' stream."

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indent = 0
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting counters and section level for the
        element with the given 'name' before handing the element and its
        'attrs' to the base class.
        """

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1
        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Complete the element with the given 'name' and then undo the state
        changes made by startElement.
        """

        # The base class is invoked first so that the counters still include
        # the element being completed while it is processed.

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1
        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Leaving the outermost section resets the recorded depth.

            if not self.level:
                self.max_level = 0

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless
        the content appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Entities without a known codepoint are
        ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = "".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            self.target_type = name
            self.label = text
            text = ""

        # Discard macro state.

        elif name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

        # Remember macro information.

        elif name in ("ac:parameter", "ac:default-parameter"):
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Handle single-level tags: a counter above one means that the element
        # is nested within another of the same kind, so no markup is added.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately: outer sections get more braces
            # than the sections they contain.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string, so any percent
                # signs originating from the title must be escaped.

                details = details.replace("%", "%%")

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "../")
            anchor = self.attributes[-1].get("ac:anchor")
            text = conversion % (prefix, anchor or self.target, self.label or text or self.target)
            self.target = self.target_type = self.label = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows: separators are only inserted
        # before the second and subsequent cells and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate the text from any preceding content in the parent.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags:
                    nodes.append("\n")
                elif name in list_tags:
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return a true value if the parser is inside any preformatted element."

        return reduce(operator.or_, [self.states[tag] for tag in preformatted_tags], False)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing whitespace runs found directly within the
        element with the given 'name': nothing for purely structural elements,
        a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return the given 'text' with each run of whitespace replaced according
        to the enclosing element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document before being parsed, and the
    complete document is encoded as UTF-8, so 's' should be a Unicode string.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Decode the input explicitly: parse() interpolates the content into a
    # Unicode template, and an undecoded byte string containing non-ASCII
    # characters would raise UnicodeDecodeError at that point.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4