ConfluenceConverter (file xmlparser.py at 371d25b0f062)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from common import *    30 from xmlread import Parser    31 import re    32 import sys    33 import operator    34 import htmlentitydefs    35 import codecs    36     37 # XML dialect syntax parsing.    38     39 tags = {    40     # XHTML tag               MoinMoin syntax    41     "strong"                : "'''%s'''",    42     "em"                    : "''%s''",    43     "u"                     : "__%s__",    44     "del"                   : "--(%s)--",    45     "sup"                   : "^%s^",    46     "sub"                   : ",,%s,,",    47     "code"                  : "`%s`",    48     "pre"                   : "{{{%s}}}",    49     "table"                 : "{{{#!table\n%s\n}}}",    50     "tbody"                 : "%s",    51     "tr"                    : "%s",    52     "th"                    : "'''%s'''",    53     "td"                    : "%s",    54     "blockquote"            : " %s",    55     "small"                 : "~-%s-~",    56     "big"                   : "~+%s+~",    57     "p"                     : "%s",    58     "ol"                    : "%s",    59     "ul"                    : "%s",    60     "ac:plain-text-body"    : "{{{%s}}}",    61     "ac:link"               : "[[%s%s|%s]]",    62     }    63     64 for tag, translation in blocktypes.items():    65     tags[tag] = translation    66     67 simple_tags = {    68     # XHTML tag               MoinMoin syntax    69     "br"                    : "<<BR>>",    70     }    71     72 list_tags = {    73     # XHTML list tag          MoinMoin list item syntax    74     "ol"                    : "1. %s",    75     "ul"                    : "* %s",    76     }    77     78 indented_tags = ["li", "p"]    79     80 link_target_tags = {    81     # Confluence element      Attribute providing the target    82     "ri:page"               : "ri:content-title",    83     "ri:attachment"         : "ri:filename",    84     "ri:user"               : "ri:username",    85     }    86     87 macro_rich_text_styles = {    88     # Confluence style        MoinMoin admonition style    89     "note"                  : "caution",    90     "warning"               : "warning",    91     "info"                  : "important",    92     "tip"                   : "tip",    93     }    94     95 normalise_regexp_str = r"\s+"    96 normalise_regexp = re.compile(normalise_regexp_str)    97     98 class ConfluenceXMLParser(Parser):    99    100     "Handle content from Confluence 4 page revisions."   101    102     def __init__(self, out):   103         Parser.__init__(self)   104         self.out = out   105    106         # Link target information.   107    108         self.target = None   109         self.target_type = None   110    111         # Macro information.   112    113         self.macro = None   114         self.macro_parameters = {}   115    116         # Indentation and preformatted states.   117    118         self.indent = 0   119         self.states = {}   120         for name in ("pre", "ac:plain-text-body"):   121             self.states[name] = 0   122    123         # Table states.   124    125         self.table_rows = 0   126         self.table_columns = 0   127    128     # ContentHandler-related methods.   129    130     def startElement(self, name, attrs):   131         if list_tags.has_key(name):   132             self.indent += 1   133         elif self.states.has_key(name):   134             self.states[name] += 1   135         Parser.startElement(self, name, attrs)   136    137     def endElement(self, name):   138         Parser.endElement(self, name)   139         if list_tags.has_key(name):   140             self.indent -= 1   141         elif self.states.has_key(name):   142             self.states[name] -= 1   143    144     def characters(self, content):   145         if not self.is_preformatted():   146             content = self.normalise(content, self.elements[-1])   147         Parser.characters(self, content)   148    149     def skippedEntity(self, name):   150         ch = htmlentitydefs.name2codepoint.get(name)   151         if ch:   152             self.text[-1].append(unichr(ch))   153    154     # Parser-related methods.   155    156     def handleElement(self, name):   157         text = "".join(self.text[-1]).strip()   158    159         # Handle state.   160    161         if name == "table":   162             self.table_rows = 0   163         elif name == "tr":   164             self.table_columns = 0   165    166         # Find conversions.   167    168         conversion = None   169    170         # Handle list elements.   171    172         if name == "li" and len(self.elements) > 1:   173             list_tag = self.elements[-2]   174             conversion = list_tags.get(list_tag)   175    176         # Remember link target information.   177    178         elif link_target_tags.has_key(name):   179             self.target = self.attributes[-1].get(link_target_tags[name])   180             self.target_type = name   181             text = ""   182    183         # Remember macro information.   184    185         elif name == "ac:parameter":   186             self.macro_parameters[self.attributes[-1].get("ac:name")] = text   187             text = ""   188    189         elif name == "ac:macro":   190             self.macro = self.attributes[-1].get("ac:name")   191    192         # Handle the common case.   193    194         else:   195             conversion = tags.get(name)   196    197         # Attempt to convert the text.   198    199         # Links require target information.   200         # NOTE: User links should support the intended user namespace prefix.   201    202         if name == "ac:link":   203             if self.target_type == "ri:attachment":   204                 prefix = "attachment:"   205             elif self.target_type == "ri:user":   206                 prefix = ""   207             else:   208                 prefix = "../"   209    210             text = conversion % (prefix, self.target, text or self.target)   211             self.target = self.target_type = None   212    213         # Macro name information is used to style rich text body regions.   214    215         elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):   216             details = macro_rich_text_styles[self.macro]   217             title = self.macro_parameters.get("title")   218             if title:   219                 details = "%s\n\n%s" % (details, title)   220             text = "{{{#!wiki %s\n\n%s}}}" % (details, text)   221             self.macro = None   222             self.macro_parameters = {}   223    224         # Handle the common case.   225    226         elif text and conversion:   227             text = conversion % text   228         elif simple_tags.has_key(name):   229             text = simple_tags[name]   230    231         # Postprocess table columns and rows.   232    233         if name in ("th", "td"):   234             if self.table_columns:   235                 text = "\n|| %s" % text   236             self.table_columns += 1   237         elif name == "tr":   238             if self.table_rows:   239                 text = "\n==\n%s" % text   240             self.table_rows += 1   241    242         # Normalise leading whitespace and indent the text if appropriate.   243    244         if name in indented_tags:   245             text = " " * self.indent + text.lstrip()   246    247         # Add the converted text to the end of the parent element's text nodes.   248    249         if len(self.text) > 1:   250             nodes = self.text[-2]   251             if "".join(self.text[-2]):   252                 parent = self.elements[-2]   253                 if parent == "body":   254                     nodes.append("\n\n")   255                 elif list_tags.has_key(parent):   256                     nodes.append("\n")   257                 elif list_tags.has_key(name) and parent == "li":   258                     nodes.append("\n")   259             nodes.append(text)   260    261         # Otherwise, emit the text.   262    263         else:   264             self.out.write(text)   265    266     def is_preformatted(self):   267         return reduce(operator.or_, self.states.values(), False)   268    269     # Whitespace normalisation.   270    271     def get_replacement(self, name):   272         if name in ("html", "body") or list_tags.has_key(name):   273             return ""   274         else:   275             return " "   276    277     def normalise(self, text, name):   278         return normalise_regexp.sub(self.get_replacement(name), text)   279    280 def parse(s, out):   281    282     "Parse the content in the string 's', writing a translation to 'out'."   283    284     # NOTE: CDATA sections appear to have erroneous endings.   285    286     s = u"""\   287 <?xml version="1.0"?>   288 <!DOCTYPE html    289      PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"   290      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">   291 <html xmlns="http://www.w3.org/1999/xhtml">   292 <body>   293 %s   294 </body>   295 </html>""" % s.replace("]] >", "]]>")   296    297     f = StringIO(s.encode("utf-8"))   298     try:   299         parser = ConfluenceXMLParser(out)   300         parser.parse(f)   301     finally:   302         f.close()   303    304 if __name__ == "__main__":   305     s = sys.stdin.read()   306     out = codecs.getwriter("utf-8")(sys.stdout)   307     parse(s, out)   308    309 # vim: tabstop=4 expandtab shiftwidth=4