ConfluenceConverter (file xmlparser.py at 8fdb1b047d0d)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    39     40 tags = {    41     # XHTML tag               MoinMoin syntax    42     "strong"                : "'''%s'''",    43     "em"                    : "''%s''",    44     "u"                     : "__%s__",    45     "del"                   : "--(%s)--",    46     "sup"                   : "^%s^",    47     "sub"                   : ",,%s,,",    48     "code"                  : "`%s`",    49     "tbody"                 : "%s",    50     "tr"                    : "%s",    51     "th"                    : "'''%s'''",    52     "td"                    : "%s",    53     "blockquote"            : " %s",    54     "small"                 : "~-%s-~",    55     "big"                   : "~+%s+~",    56     "p"                     : "%s",    57     "ol"                    : "%s",    58     "ul"                    : "%s",    59     "ac:link"               : "[[%s%s|%s]]",    60     "ac:image"              : "{{%s%s|%s}}",    61     "a"                     : "[[%s|%s]]",    62     }    63     64 for tag, translation in blocktypes.items():    65     tags[tag] = translation    66     67 simple_tags = {    68     # XHTML tag               MoinMoin syntax    69     "br"                    : "<<BR>>",    70     }    71     72 list_tags = {    73     # XHTML list tag          MoinMoin list item syntax    74     "ol"                    : "1. %s",    75     "ul"                    : "* %s",    76     }    77     78 preformatted_tags = ["pre", "ac:plain-text-body"]    79 single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]    80 formatted_tags    = ["ac:rich-text-body", "table"]    81     82 indented_tags = ["li", "p"] + preformatted_tags + formatted_tags    83 block_tags = indented_tags + blocktypes.keys() + list_tags.keys()    84     85 link_target_tags = {    86     # Confluence element      Attributes providing the target    87     "ri:page"               : ("ri:space-key", "ri:content-title"),    88     "ri:attachment"         : ("ri:filename",),    89     "ri:user"               : ("ri:username",),    90     }    91     92 link_target_prefixes = {    93     # Attribute with details  Prefix ensuring correct relative link    94     "ri:space-key"          : "..",    95     "ri:content-title"      : "..",    96     }    97     98 link_label_attributes = "ri:content-title", "ac:link-body"    99    100 # NOTE: User links should support the intended user namespace prefix.   101    102 link_target_types = {   103     # Confluence element      MoinMoin link prefix   104     "ri:attachment"         : "attachment:",   105     "ri:user"               : "",   106     "ac:link-body"          : "#",   107     }   108    109 macro_rich_text_styles = {   110     # Confluence style        MoinMoin admonition style   111     "note"                  : "caution",   112     "warning"               : "warning",   113     "info"                  : "important",   114     "tip"                   : "tip",   115     }   116    117 normalise_regexp_str = r"\s+"   118 normalise_regexp = re.compile(normalise_regexp_str)   119    120 class ConfluenceXMLParser(Parser):   121    122     "Handle content from Confluence 4 page revisions."   123    124     def __init__(self, out):   125         Parser.__init__(self)   126         self.out = out   127    128         # Link target and label information.   129    130         self.target = None   131         self.target_type = None   132         self.label = None   133    134         # Macro information.   135    136         self.macro = None   137         self.macro_parameters = {}   138    139         # Indentation and element nesting states.   140    141         self.indent = 0   142         self.states = {}   143         self.max_level = self.level = 0   144    145         for name in preformatted_tags + single_level_tags:   146             self.states[name] = 0   147    148         # Table states.   149    150         self.table_rows = 0   151         self.table_columns = 0   152    153         # Block states.   154    155         self.have_block = False   156    157     # ContentHandler-related methods.   158    159     def startElement(self, name, attrs):   160    161         # Track indentation for lists.   162    163         if list_tags.has_key(name):   164             self.indent += 1   165    166         # Track element nesting.   167    168         elif self.states.has_key(name):   169             self.states[name] += 1   170    171         # Track cumulative element nesting in order to produce appropriate depth   172         # indicators in the formatted output.   173    174         if name in preformatted_tags or name in formatted_tags:   175             self.level += 1   176             self.max_level = max(self.level, self.max_level)   177    178         Parser.startElement(self, name, attrs)   179    180         # Remember macro information for use within the element.   181    182         if name == "ac:macro":   183             self.macro = self.attributes[-1].get("ac:name")   184    185     def endElement(self, name):   186         Parser.endElement(self, name)   187    188         if list_tags.has_key(name):   189             self.indent -= 1   190         elif self.states.has_key(name):   191             self.states[name] -= 1   192         if name in preformatted_tags or name in formatted_tags:   193             self.level -= 1   194             if not self.level:   195                 self.max_level = 0   196    197     def characters(self, content):   198         if not self.is_preformatted():   199             content = self.normalise(content, self.elements[-1])   200         Parser.characters(self, content)   201    202     def skippedEntity(self, name):   203         ch = htmlentitydefs.name2codepoint.get(name)   204         if ch:   205             self.text[-1].append(unichr(ch))   206    207     # Parser-related methods.   208    209     def handleElement(self, name):   210    211         """   212         Handle the completion of the element with the given 'name'. Any content   213         will either be recorded for later use (by an enclosing element, for   214         example) or emitted in some form.   215         """   216    217         text = "".join(self.text[-1])   218    219         # Handle state.   220    221         if name == "table":   222             self.table_rows = 0   223         elif name == "tr":   224             self.table_columns = 0   225    226         # Find conversions.   227    228         conversion = None   229    230         # Handle list elements.   231    232         if name == "li" and len(self.elements) > 1:   233             list_tag = self.elements[-2]   234             conversion = list_tags.get(list_tag)   235    236         # Remember link target information.   237    238         elif link_target_tags.has_key(name):   239             target_details = []   240    241             # Get target details from the element's attributes.   242    243             for attrname in link_target_tags[name]:   244                 attrvalue = self.attributes[-1].get(attrname)   245                 if attrvalue:   246                     target_details.append(attrvalue)   247                     prefix = link_target_prefixes.get(attrname)   248                     if prefix:   249                         target_details.insert(0, prefix)   250                     if attrname in link_label_attributes and not self.label:   251                         self.label = attrvalue   252    253             # Make a link based on the details.   254    255             self.target = "/".join(target_details)   256             self.target_type = name   257             text = ""   258    259         # For anchor links, just use the raw text and let Moin do the formatting.   260    261         elif name == "ac:link-body":   262             if not self.target_type:   263                 self.target_type = name   264             self.label = text   265             text = ""   266    267         # For conventional links, remember the href attribute as the target.   268    269         elif name == "a":   270             self.target = self.attributes[-1].get("href")   271             self.label = text   272             text = ""   273    274         # Discard macro state.   275    276         elif name == "ac:macro":   277             self.macro = None   278             self.macro_parameters = {}   279    280         # Remember macro information.   281    282         elif name in ("ac:parameter", "ac:default-parameter"):   283             self.macro_parameters[self.attributes[-1].get("ac:name")] = text   284             text = ""   285    286         # Handle single-level tags.   287    288         elif name in single_level_tags and self.states[name] > 1:   289             conversion = "%s"   290    291         # Handle preformatted sections.   292    293         elif name in preformatted_tags or name in formatted_tags:   294    295             # Nest the section appropriately.   296    297             level = 3 + self.max_level - self.level   298             opening = "{" * level   299             closing = "}" * level   300    301             # Macro name information is used to style rich text body regions.   302    303             if name != "table" and self.macro and macro_rich_text_styles.has_key(self.macro):   304                 details = macro_rich_text_styles[self.macro]   305                 title = self.macro_parameters.get("title")   306                 if title:   307                     details = "%s\n\n%s" % (details, title)   308    309                 conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)   310    311             elif name == "table":   312                 conversion = "%s#!table\n%%s\n%s" % (opening, closing)   313    314             else:   315                 conversion = "%s%%s%s" % (opening, closing)   316    317         # Handle the common case and simpler special cases.   318    319         if not conversion:   320             conversion = tags.get(name)   321    322    323    324         # Attempt to convert the text.   325    326         # Links require target information.   327    328         if name in ("ac:link", "ac:image"):   329             prefix = link_target_types.get(self.target_type, "")   330             anchor = self.attributes[-1].get("ac:anchor")   331             text = conversion % (prefix, anchor or self.target, self.label or text or self.target)   332             self.target = self.target_type = self.label = None   333    334         elif name == "a":   335             text = conversion % (self.target, self.label)   336             self.target = self.target_type = self.label = None   337    338         # Handle the common case.   339    340         elif text and conversion:   341             text = conversion % text   342         elif simple_tags.has_key(name):   343             text = simple_tags[name]   344    345         # Postprocess table columns and rows.   346    347         if name in ("th", "td"):   348             if self.table_columns:   349                 text = "\n|| %s" % text   350             self.table_columns += 1   351         elif name == "tr":   352             if self.table_rows:   353                 text = "\n==\n%s" % text   354             self.table_rows += 1   355    356         # Normalise leading whitespace and indent the text if appropriate.   357    358         if name in indented_tags:   359             text = " " * self.indent + text.lstrip()   360    361         # Add the converted text to the end of the parent element's text nodes.   362    363         if len(self.text) > 1:   364             nodes = self.text[-2]   365    366             # Where preceding text exists, add any blank line separators.   367    368             if "".join(nodes):   369                 parent = self.elements[-2]   370    371                 # All top-level elements are separated with blank lines.   372    373                 if parent == "body":   374                     nodes.append("\n")   375    376                 # Block elements always cause a new line to be started.   377    378                 if name in block_tags or self.have_block:   379                     nodes.append("\n")   380    381                 self.have_block = False   382    383             # Without preceding text, save any block node state so that new line   384             # separators can be added at another level.   385    386             elif name in block_tags:   387                 self.have_block = True   388    389             else:   390                 self.have_block = False   391    392             nodes.append(text)   393    394         # Otherwise, emit the text (at the top level of the document).   395    396         else:   397             self.out.write(text)   398    399     def is_preformatted(self):   400         return reduce(operator.or_, [self.states[tag] for tag in preformatted_tags], False)   401    402     # Whitespace normalisation.   403    404     def get_replacement(self, name):   405         if name in ("html", "body", "table", "tbody", "tr") or list_tags.has_key(name):   406             return ""   407         else:   408             return " "   409    410     def normalise(self, text, name):   411         return normalise_regexp.sub(self.get_replacement(name), text)   412    413 def parse(s, out):   414    415     "Parse the content in the string 's', writing a translation to 'out'."   416    417     # NOTE: CDATA sections appear to have erroneous endings.   418    419     s = u"""\   420 <?xml version="1.0"?>   421 <!DOCTYPE html    422      PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"   423      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">   424 <html xmlns="http://www.w3.org/1999/xhtml">   425 <body>   426 %s   427 </body>   428 </html>""" % s.replace("]] >", "]]>")   429    430     f = StringIO(s.encode("utf-8"))   431     try:   432         parser = ConfluenceXMLParser(out)   433         parser.parse(f)   434     finally:   435         f.close()   436    437 if __name__ == "__main__":   438     s = sys.stdin.read()   439     out = codecs.getwriter("utf-8")(sys.stdout)   440     parse(s, out)   441    442 # vim: tabstop=4 expandtab shiftwidth=4