ConfluenceConverter (file xmlparser.py at 11e412862d45)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013, 2015 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    
39     40 tags = {    41     # XHTML tag               MoinMoin syntax    42     "strong"                : "'''%s'''",    43     "em"                    : "''%s''",    44     "u"                     : "__%s__",    45     "del"                   : "--(%s)--",    46     "sup"                   : "^%s^",    47     "sub"                   : ",,%s,,",    48     "code"                  : "`%s`",    49     "tbody"                 : "%s",    50     "tr"                    : "%s",    51     "th"                    : "'''%s'''",    52     "td"                    : "%s",    53     "blockquote"            : " %s",    54     "small"                 : "~-%s-~",    55     "big"                   : "~+%s+~",    56     "p"                     : "%s",    57     "ol"                    : "%s",    58     "ul"                    : "%s",    59     "ac:link"               : "[[%s%s%s|%s]]",    60     "ac:image"              : "{{%s%s%s|%s}}",    61     "a"                     : "[[%s|%s]]",    62     }    63     64 for tag, translation in blocktypes.items():    65     tags[tag] = translation    66     67 simple_tags = {    68     # XHTML tag               MoinMoin syntax    69     "br"                    : "<<BR>>",    70     }    71     72 simple_preformatted_tags = {    73     # XHTML tag               MoinMoin syntax    74     "br"                    : "\n",    75     }    76     77 list_tags = {    78     # XHTML list tag          MoinMoin list item syntax    79     "ol"                    : "1. 
%s",    80     "ul"                    : "* %s",    81     }    82     83 preformatted_tags = ["pre", "ac:plain-text-body"]    84 single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]    85 formatted_tags    = ["ac:rich-text-body", "table"]    86     87 indented_tags = ["li", "p"] + preformatted_tags + formatted_tags    88 block_tags = indented_tags + blocktypes.keys() + list_tags.keys()    89 span_override_tags = ["ac:link"]    90     91 link_target_tags = {    92     # Confluence element      Attributes providing the target    93     "ri:page"               : ("ri:space-key", "ri:content-title"),    94     "ri:attachment"         : ("ri:filename",),    95     "ri:user"               : ("ri:username",),    96     }    97     98 link_target_prefixes = {    99     # Attribute with details  Prefix ensuring correct relative link   100     "ri:space-key"          : "..",   101     "ri:content-title"      : "..",   102     }   103    104 link_label_attributes = "ri:content-title", "ac:link-body"   105    106 # NOTE: User links should support the intended user namespace prefix.   

link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

macroargs = {
    # Confluence macro        Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro        MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "toc"                   : "<<TableOfContents>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out, is_comment_page=False):

        """
        Initialise the parser to write converted text to 'out' (any object
        providing a write method). If 'is_comment_page' is set, an additional
        relative link prefix is inserted for each prefixed link target detail.
        """

        Parser.__init__(self)
        self.out = out
        self.is_comment_page = is_comment_page

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Handle the start of the element with the given 'name' and 'attrs',
        updating the indentation, nesting and macro states before and after
        delegating to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held back from an earlier heading do not apply to this one.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.
        # NOTE(review): this relies on the base class having pushed the
        # NOTE(review): element's attributes onto self.attributes - confirm.

        if name == "ac:macro":
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Handle the end of the element with the given 'name', delegating to the
        base class and then unwinding the state recorded in startElement.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Outside all regions, the nesting depth record starts afresh.

            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Handle character 'content', normalising whitespace unless inside a
        preformatted element, before passing it to the base class.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Handle the entity with the given 'name' by adding the character for any
        recognised HTML entity to the current element's text. Unrecognised
        entities are discarded.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)
                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                        if self.is_comment_page:
                            target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name in ("ac:link-body", "ac:plain-text-link-body"):
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are recorded under the name found in the attributes
        # of the enclosing element.

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""

            # Without any target details, use an empty target instead of
            # letting None be formatted into the link as the text "None".

            target = self.target or ""
            label = self.label or text.strip() or target

            if anchor:
                fragment = "#%s" % anchor
            else:
                fragment = ""

            text = conversion % (prefix, target, fragment, label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Anchors without a href attribute have no target: use an empty
            # target instead of the text "None".

            target = self.target or ""
            text = conversion % (target, self.label or target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macros[-1])
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])
                if argnames:
                    confargname, moinargname = argnames
                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[-1][confargname]))
                text = conversion % parameters

                # Anchors cannot be emitted where macros are forbidden, so they
                # are held back until the enclosing heading is completed.

                if self.macros[-1] == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            # Keep any existing block state: this branch only prevents the
            # final branch from resetting it.

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        "Return whether any open element is a heading or a conventional link."

        return any((tag in headings or tag == "a") for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing whitespace runs directly inside the element
        with the given 'name': nothing for structural and list elements, a
        single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out, is_comment_page=False):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    # The wrapped document is encoded and supplied to the parser as a file-like
    # object.

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out, is_comment_page)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4