ConfluenceConverter (file xmlparser.py at 793756e9e933)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    

# Conversion templates for elements whose content is wrapped in MoinMoin
# markup. Each value is a format string applied to the element's text; the
# link and image templates take (prefix, target, label).

tags = {
    # XHTML tag               MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s|%s]]",
    "ac:image"              : "{{%s%s|%s}}",
    }

# Merge in the block-level translations provided by the common module
# ('blocktypes' is expected to arrive via "from common import *").

tags.update(blocktypes)

# Elements that are replaced by fixed text regardless of their content.

simple_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# Templates applied to list items according to the enclosing list element.

list_tags = {
    # XHTML list tag          MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Elements whose converted text is indented according to the list depth.

indented_tags = ["li", "p"]

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags    = ["ac:rich-text-body", "table"]

link_target_tags = {
    # Confluence element      Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details  Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    The parser converts Confluence XHTML-based storage markup into MoinMoin
    wiki syntax, writing the result to the 'out' stream given on creation.

    NOTE(review): the 'elements', 'attributes' and 'text' stacks used below are
    not defined in this class; presumably they are maintained by the Parser
    base class from xmlread, with the innermost open element last - confirm
    against that module.
    """

    def __init__(self, out):

        """
        Initialise the parser with the given 'out' stream, an object providing
        a write method that will receive the converted text.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indent = 0
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the nesting state for the opening of the element with the given
        'name' and 'attrs' before delegating to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indent += 1

        # Track element nesting.

        elif name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Delegate the closing of the element with the given 'name' to the base
        class and then undo the nesting state recorded by startElement.
        """

        # The base class is invoked before the counters are decremented, so any
        # processing it performs still sees the element counted as open.

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Outside all nested sections, forget the recorded maximum depth.

            if not self.level:
                self.max_level = 0

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless a
        preformatted element is currently open.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Unknown entity names are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = "".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = "/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text
            text = ""

        # Discard macro state.

        elif name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

        # Remember macro information.

        elif name in ("ac:parameter", "ac:default-parameter"):
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Handle single-level tags: where the same tag is already open in an
        # enclosing element, the markup is not repeated.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately: outer sections get more braces
            # than the sections they contain.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")

            # Fall back to empty strings so that a link without any anchor or
            # resource target never has the text "None" formatted into it.

            target = anchor or self.target or ""
            label = self.label or text or self.target or ""

            text = conversion % (prefix, target, label)
            self.target = self.target_type = self.label = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate the text from any preceding sibling content.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags:
                    nodes.append("\n")
                elif name in list_tags:
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing whitespace runs found directly within the
        element having the given 'name': nothing for purely structural
        elements, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced according to the
        element having the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document before parsing and encoded as
    UTF-8, so 's' should be a Unicode string (or contain only ASCII).
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Decode the input explicitly: interpolating raw bytes into the Unicode
    # template in parse would otherwise trigger an implicit ASCII decode and
    # fail on any non-ASCII content.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4