# ConfluenceConverter (file xmlparser.py at 06641676740f)

#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from MoinMoin import wikiutil
from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

# Format strings used to translate a completed element into MoinMoin markup.
# Most entries take the element's text as their single argument; the link and
# image entries take (prefix, target, anchor, label) and "a" takes
# (target, label), as supplied by ConfluenceXMLParser.handleElement.

tags = {
    # XHTML tag               MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Merge in the block type translations. 'blocktypes' is not defined in this
# file; presumably it is provided by the star import from 'common'.

tags.update(blocktypes)

# Elements replaced by fixed text outside preformatted regions.

simple_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# Elements replaced by fixed text inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "\n",
    }

# Item syntax selected according to the list element enclosing an "li".

list_tags = {
    # XHTML list tag          MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Element classifications consulted by the parser when tracking nesting,
# indentation and line separation.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags    = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element      Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details  Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

# Attributes/elements whose value may serve as the label of a link.

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

# Prefix placed before a link target according to the kind of resource
# element that supplied the target.

link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

# Macros whose rich text body becomes a "#!wiki" region with the given style.

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

# For each macro: the Confluence parameter to read and the MoinMoin argument
# name under which its value is passed on.

macroargs = {
    # Confluence macro        Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

# Format strings filled in from a dictionary holding the macro parameters
# plus "content" (the macro's text) and, where macroargs applies, "args".

macrotypes = {
    # Confluence macro        MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "toc"                   : "<<TableOfContents>>",
    }

# Runs of whitespace are collapsed outside preformatted regions.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    The translation is written to the 'out' object given on construction.
    The 'elements', 'attributes' and 'text' stacks used throughout are not
    defined in this file; presumably they are maintained by the Parser base
    class from the xmlread module.
    """

    def __init__(self, out, is_comment_page=False):

        """
        Initialise the parser to write to 'out' (any object with a 'write'
        method). If 'is_comment_page' is true, an additional relative prefix
        is inserted into page link targets.
        """

        Parser.__init__(self)
        self.out = out
        self.is_comment_page = is_comment_page

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information: parallel stacks of macro names and parameter
        # dictionaries, plus anchors withheld until a heading or link ends.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the start of the
        element with the given 'name' and 'attrs', delegating the general
        handling to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Each heading starts with no held anchors.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Undo the state changes made by startElement for the element with the
        given 'name', delegating the general handling to the base class.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        # Leaving the outermost region resets the recorded maximum depth.

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless
        a preformatted element is open.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Names without a known codepoint are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.

        The work is done in stages: table counters are reset, a conversion
        format is chosen (or link/macro details are remembered), the text is
        converted, table cells/rows and headings are postprocessed, the text
        is indented, and finally it is appended to the parent element's text
        or, at the top level, written to the output.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)
                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                        if self.is_comment_page:
                            target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name in ("ac:link-body", "ac:plain-text-link-body"):
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # A default parameter is stored under the name given by the enclosing
        # element's attributes (the second-to-last entry on the stack).

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: when nested inside the same kind of
        # element, the text is passed through without further markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if "\n" in text and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information. A missing target is treated as an
        # empty string so that the text "None" never appears in the output.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""
            anchor_suffix = ("#%s" % anchor) if anchor else ""
            target = self.target or ""
            label = self.label or text.strip() or target
            text = conversion % (prefix, target, anchor_suffix, label)
            self.target = self.target_type = self.label = None

        elif name == "a":
            target = self.target or ""
            text = conversion % (target, self.label or target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name in ("ac:macro", "ac:structured-macro"):
            conversion = macrotypes.get(self.macros[-1])

            # Produce the converted macro.

            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])
                if argnames:
                    confargname, moinargname = argnames
                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[-1][confargname]))

                # Obtain the Moin macro with parameters substituted.

                text = conversion % parameters

                # Anchors are withheld where macros are not permitted.

                if self.macros[-1] == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

            # Warn about macros that are not converted.

            elif self.macros[-1] not in macro_rich_text_styles:
                print >>sys.stderr, "No conversion possible for macro", self.macros[-1]
                print >>sys.stderr, "Macro has arguments", self.macro_parameters[-1]
                print >>sys.stderr

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows: every cell after the first in a
        # row, and every row after the first in a table, gets a separator.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            # An already pending block state is kept for non-block elements.

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any open element is a heading or an "a" element, these
        being the places where anchor macros are withheld.
        """

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing whitespace runs found directly inside the
        element with the given 'name': nothing for structural elements
        (document, table and list containers) and a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced according to the
        element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out, is_comment_page=False):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document before being parsed, and is
    encoded as UTF-8 for the parser, so 's' is expected to be a Unicode
    string. The 'is_comment_page' flag is passed on to the parser.
    """

    # The template is spelt out line by line so that the trailing space after
    # "<!DOCTYPE html" is explicit.

    template = (
        u'<?xml version="1.0"?>\n'
        u'<!DOCTYPE html \n'
        u'     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n'
        u'     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
        u'<html xmlns="http://www.w3.org/1999/xhtml">\n'
        u'<body>\n'
        u'%s\n'
        u'</body>\n'
        u'</html>'
        )

    # NOTE: CDATA sections appear to have erroneous endings.

    s = template % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out, is_comment_page)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Convert UTF-8 content from standard input to standard output.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4