ConfluenceConverter (file xmlparser.py at e3262eb82f1d)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    39     40 tags = {    41     # XHTML tag               MoinMoin syntax    42     "strong"                : "'''%s'''",    43     "em"                    : "''%s''",    44     "u"                     : "__%s__",    45     "del"                   : "--(%s)--",    46     "sup"                   : "^%s^",    47     "sub"                   : ",,%s,,",    48     "code"                  : "`%s`",    49     "tbody"                 : "%s",    50     "tr"                    : "%s",    51     "th"                    : "'''%s'''",    52     "td"                    : "%s",    53     "blockquote"            : " %s",    54     "small"                 : "~-%s-~",    55     "big"                   : "~+%s+~",    56     "p"                     : "%s",    57     "ol"                    : "%s",    58     "ul"                    : "%s",    59     "ac:link"               : "[[%s%s|%s]]",    60     "ac:image"              : "{{%s%s|%s}}",    61     "a"                     : "[[%s|%s]]",    62     }    63     64 for tag, translation in blocktypes.items():    65     tags[tag] = translation    66     67 simple_tags = {    68     # XHTML tag               MoinMoin syntax    69     "br"                    : "<<BR>>",    70     }    71     72 list_tags = {    73     # XHTML list tag          MoinMoin list item syntax    74     "ol"                    : "1. %s",    75     "ul"                    : "* %s",    76     }    77     78 preformatted_tags = ["pre", "ac:plain-text-body"]    79 single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]    80 formatted_tags    = ["ac:rich-text-body", "table"]    81     82 indented_tags = ["li", "p"] + preformatted_tags + formatted_tags    83 block_tags = indented_tags + blocktypes.keys() + list_tags.keys()    84 span_override_tags = ["ac:link"]    85     86 link_target_tags = {    87     # Confluence element      Attributes providing the target    88     "ri:page"               : ("ri:space-key", "ri:content-title"),    89     "ri:attachment"         : ("ri:filename",),    90     "ri:user"               : ("ri:username",),    91     }    92     93 link_target_prefixes = {    94     # Attribute with details  Prefix ensuring correct relative link    95     "ri:space-key"          : "..",    96     "ri:content-title"      : "..",    97     }    98     99 link_label_attributes = "ri:content-title", "ac:link-body"   100    101 # NOTE: User links should support the intended user namespace prefix.   102    103 link_target_types = {   104     # Confluence element      MoinMoin link prefix   105     "ri:attachment"         : "attachment:",   106     "ri:user"               : "",   107     "ac:link-body"          : "#",   108     }   109    110 macro_rich_text_styles = {   111     # Confluence style        MoinMoin admonition style   112     "note"                  : "caution",   113     "warning"               : "warning",   114     "info"                  : "important",   115     "tip"                   : "tip",   116     }   117    118 normalise_regexp_str = r"\s+"   119 normalise_regexp = re.compile(normalise_regexp_str)   120    121 class ConfluenceXMLParser(Parser):   122    123     "Handle content from Confluence 4 page revisions."   124    125     def __init__(self, out):   126         Parser.__init__(self)   127         self.out = out   128    129         # Link target and label information.   130    131         self.target = None   132         self.target_type = None   133         self.label = None   134    135         # Macro information.   136    137         self.macro = None   138         self.macro_parameters = {}   139    140         # Indentation and element nesting states.   141    142         self.indent = 0   143         self.states = {}   144         self.max_level = self.level = 0   145    146         for name in preformatted_tags + single_level_tags:   147             self.states[name] = 0   148    149         # Table states.   150    151         self.table_rows = 0   152         self.table_columns = 0   153    154         # Block states.   155    156         self.have_block = False   157    158     # ContentHandler-related methods.   159    160     def startElement(self, name, attrs):   161    162         # Track indentation for lists.   163    164         if list_tags.has_key(name):   165             self.indent += 1   166    167         # Track element nesting.   168    169         elif self.states.has_key(name):   170             self.states[name] += 1   171    172         # Track cumulative element nesting in order to produce appropriate depth   173         # indicators in the formatted output.   174    175         if name in preformatted_tags or name in formatted_tags:   176             self.level += 1   177             self.max_level = max(self.level, self.max_level)   178    179         Parser.startElement(self, name, attrs)   180    181         # Remember macro information for use within the element.   182    183         if name == "ac:macro":   184             self.macro = self.attributes[-1].get("ac:name")   185    186     def endElement(self, name):   187         Parser.endElement(self, name)   188    189         if list_tags.has_key(name):   190             self.indent -= 1   191         elif self.states.has_key(name):   192             self.states[name] -= 1   193         if name in preformatted_tags or name in formatted_tags:   194             self.level -= 1   195             if not self.level:   196                 self.max_level = 0   197    198     def characters(self, content):   199         if not self.is_preformatted():   200             content = self.normalise(content, self.elements[-1])   201         Parser.characters(self, content)   202    203     def skippedEntity(self, name):   204         ch = htmlentitydefs.name2codepoint.get(name)   205         if ch:   206             self.text[-1].append(unichr(ch))   207    208     # Parser-related methods.   209    210     def handleElement(self, name):   211    212         """   213         Handle the completion of the element with the given 'name'. Any content   214         will either be recorded for later use (by an enclosing element, for   215         example) or emitted in some form.   216         """   217    218         text = u"".join(self.text[-1])   219    220         # Handle state.   221    222         if name == "table":   223             self.table_rows = 0   224         elif name == "tr":   225             self.table_columns = 0   226    227         # Find conversions.   228    229         conversion = None   230    231         # Handle list elements.   232    233         if name == "li" and len(self.elements) > 1:   234             list_tag = self.elements[-2]   235             conversion = list_tags.get(list_tag)   236    237         # Remember link target information.   238    239         elif link_target_tags.has_key(name):   240             target_details = []   241    242             # Get target details from the element's attributes.   243    244             for attrname in link_target_tags[name]:   245                 attrvalue = self.attributes[-1].get(attrname)   246                 if attrvalue:   247                     target_details.append(attrvalue)   248                     prefix = link_target_prefixes.get(attrname)   249                     if prefix:   250                         target_details.insert(0, prefix)   251                     if attrname in link_label_attributes and not self.label:   252                         self.label = attrvalue   253    254             # Make a link based on the details.   255    256             self.target = u"/".join(target_details)   257             self.target_type = name   258             text = ""   259    260         # For anchor links, just use the raw text and let Moin do the formatting.   261    262         elif name == "ac:link-body":   263             if not self.target_type:   264                 self.target_type = name   265             self.label = text.strip()   266             text = ""   267    268         # For conventional links, remember the href attribute as the target.   269    270         elif name == "a":   271             self.target = self.attributes[-1].get("href")   272             self.label = text.strip()   273             text = ""   274    275         # Discard macro state.   276    277         elif name == "ac:macro":   278             self.macro = None   279             self.macro_parameters = {}   280    281         # Remember macro information.   282    283         elif name in ("ac:parameter", "ac:default-parameter"):   284             self.macro_parameters[self.attributes[-1].get("ac:name")] = text   285             text = ""   286    287         # Handle single-level tags.   288    289         elif name in single_level_tags and self.states[name] > 1:   290             conversion = "%s"   291    292         # Handle preformatted sections.   293    294         elif name in preformatted_tags or name in formatted_tags:   295    296             # Nest the section appropriately.   297    298             level = 3 + self.max_level - self.level   299             opening = "{" * level   300             closing = "}" * level   301    302             # Macro name information is used to style rich text body regions.   303    304             if name != "table" and self.macro and macro_rich_text_styles.has_key(self.macro):   305                 details = macro_rich_text_styles[self.macro]   306                 title = self.macro_parameters.get("title")   307                 if title:   308                     details = "%s\n\n%s" % (details, title)   309    310                 conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)   311    312             elif name == "table":   313                 conversion = "%s#!table\n%%s\n%s" % (opening, closing)   314    315             else:   316                 # Preformatted sections containing newlines must contain an initial   317                 # newline.   318    319                 if text.find("\n") != -1 and not text.startswith("\n"):   320                     opening += "\n"   321    322                 conversion = "%s%%s%s" % (opening, closing)   323    324         # Handle the common case and simpler special cases.   325    326         if not conversion:   327             conversion = tags.get(name)   328    329    330    331         # Attempt to convert the text.   332    333         # Links require target information.   334    335         if name in ("ac:link", "ac:image"):   336             prefix = link_target_types.get(self.target_type, "")   337             anchor = self.attributes[-1].get("ac:anchor")   338             text = conversion % (prefix, anchor or self.target, self.label or text.strip() or self.target)   339             self.target = self.target_type = self.label = None   340    341         elif name == "a":   342             text = conversion % (self.target, self.label or self.target)   343             self.target = self.target_type = self.label = None   344    345         # Handle the common case.   346    347         elif text and conversion:   348             text = conversion % text   349         elif simple_tags.has_key(name):   350             text = simple_tags[name]   351    352         # Postprocess table columns and rows.   353    354         if name in ("th", "td"):   355             if self.table_columns:   356                 text = "\n|| %s" % text   357             self.table_columns += 1   358         elif name == "tr":   359             if self.table_rows:   360                 text = "\n==\n%s" % text   361             self.table_rows += 1   362    363         # Normalise leading whitespace and indent the text if appropriate.   364    365         if name in indented_tags:   366             text = " " * self.indent + text.lstrip()   367    368         # Add the converted text to the end of the parent element's text nodes.   369    370         if len(self.text) > 1:   371             nodes = self.text[-2]   372             parent = self.elements[-2]   373    374             # Where preceding text exists, add any blank line separators.   375    376             if u"".join(nodes):   377    378                 # All top-level elements are separated with blank lines.   379    380                 if parent == "body":   381                     nodes.append("\n")   382    383                 # Block elements always cause a new line to be started.   384    385                 if name in block_tags or self.have_block and name not in span_override_tags:   386                     nodes.append("\n")   387    388                 self.have_block = False   389    390             # Lists inside lists require separation.   391    392             elif list_tags.has_key(name) and parent == "li":   393                 nodes.append("\n")   394    395             # Without preceding text, save any block node state for non-block   396             # elements so that newline separators can be added at another   397             # level.   398    399             elif name in block_tags and parent not in block_tags:   400                 self.have_block = True   401    402             elif name not in block_tags and self.have_block and name not in span_override_tags:   403                 self.have_block = True   404    405             else:   406                 self.have_block = False   407    408             nodes.append(text)   409    410         # Otherwise, emit the text (at the top level of the document).   411    412         else:   413             self.out.write(text)   414    415     def is_preformatted(self):   416         return reduce(operator.or_, [self.states[tag] for tag in preformatted_tags], False)   417    418     # Whitespace normalisation.   419    420     def get_replacement(self, name):   421         if name in ("html", "body", "table", "tbody", "tr") or list_tags.has_key(name):   422             return ""   423         else:   424             return " "   425    426     def normalise(self, text, name):   427         return normalise_regexp.sub(self.get_replacement(name), text)   428    429 def parse(s, out):   430    431     "Parse the content in the string 's', writing a translation to 'out'."   432    433     # NOTE: CDATA sections appear to have erroneous endings.   434    435     s = u"""\   436 <?xml version="1.0"?>   437 <!DOCTYPE html    438      PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"   439      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">   440 <html xmlns="http://www.w3.org/1999/xhtml">   441 <body>   442 %s   443 </body>   444 </html>""" % s.replace("]] >", "]]>")   445    446     f = StringIO(s.encode("utf-8"))   447     try:   448         parser = ConfluenceXMLParser(out)   449         parser.parse(f)   450     finally:   451         f.close()   452    453 if __name__ == "__main__":   454     s = sys.stdin.read()   455     out = codecs.getwriter("utf-8")(sys.stdout)   456     parse(s, out)   457    458 # vim: tabstop=4 expandtab shiftwidth=4