ConfluenceConverter (file xmlparser.py at 9eb0dcdf488c)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    

# Conversion templates for elements whose converted content is substituted
# into a single MoinMoin construct. Link and image templates take several
# values (prefix, target, anchor, label) rather than just the content.

tags = {
    # XHTML tag               MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Block type translations provided by the common module are merged in,
# replacing any entries above that use the same tag name.

tags.update(blocktypes)

# Elements without content that are replaced by fixed text outside
# preformatted regions...

simple_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# ...and the replacements used for the same elements inside preformatted
# regions.

simple_preformatted_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "\n",
    }

# Templates applied to list items according to the enclosing list element.

list_tags = {
    # XHTML list tag          MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Classifications of elements used when tracking nesting, indentation and
# the separation of blocks.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags    = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + list(blocktypes) + list(list_tags)
span_override_tags = ["ac:link"]

# Resource elements describing link targets, together with the attributes
# that are combined to form each target.

link_target_tags = {
    # Confluence element      Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details  Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

# Attributes whose values may be used as the label of a link.

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

macroargs = {
    # Confluence macro        Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro        MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "toc"                   : "<<TableOfContents>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        """
        Initialise the parser, arranging for converted content to be written
        to 'out' (an object providing a 'write' method).
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information: parallel stacks of macro names and parameter
        # dictionaries, plus anchors held back until their heading is emitted.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro states for the start of the
        element with the given 'name' and 'attrs', delegating the remaining
        work to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors are only held for the heading currently being processed.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.
        # NOTE(review): this relies on the base class having recorded the
        # element's attributes in self.attributes - confirm against xmlread.

        if name == "ac:macro":
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Restore the indentation, nesting and macro states at the end of the
        element with the given 'name'.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Leaving the outermost region resets the recorded depth.

            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless
        the content appears within a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text, ignoring unknown entities.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)
                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information. Parameters appearing outside any macro
        # have nowhere to be stored and are discarded instead of causing an
        # IndexError.

        elif name == "ac:parameter":
            if self.macro_parameters:
                self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:default-parameter":

            # The default parameter is stored under the name of the enclosing
            # element (taken from the attributes one level up).

            if self.macro_parameters and len(self.attributes) > 1:
                self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences of the same tag do not
        # repeat the markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string, so any percent
                # signs (from a macro title, for example) must be escaped.

                details = details.replace("%", "%%")

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")

            # Links without a resource element (same-page anchor links, for
            # example) have no target: use an empty target, not "None".

            target = self.target or ""
            anchor = self.attributes[-1].get("ac:anchor") or ""
            if anchor:
                anchor_suffix = "#%s" % anchor
            else:
                anchor_suffix = ""

            label = self.label or text.strip() or target or anchor
            text = conversion % (prefix, target, anchor_suffix, label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Elements without an href (named anchors, for example) cannot be
            # expressed as links, so only their text is retained.

            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or ""
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            macro = self.macros[-1]
            conversion = macrotypes.get(macro)
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(macro)
                if argnames:
                    confargname, moinargname = argnames
                    argvalue = self.macro_parameters[-1].get(confargname)
                    if argvalue is not None:
                        parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, argvalue))

                # Where a parameter needed by the macro is missing, the macro
                # cannot be expressed and its content is retained unchanged.

                try:
                    converted = conversion % parameters
                except KeyError:
                    converted = None

                if converted is not None:
                    text = converted

                    # Anchors cannot appear where macros are forbidden and
                    # are held for insertion before the enclosing heading.

                    if macro == "anchor" and self.forbids_macros():
                        self.held_anchors.append(text)
                        text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            # Retain existing block state across non-block elements.

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether the current position is within a preformatted element."

        for tag in preformatted_tags:
            if self.states[tag]:
                return True
        return False

    def forbids_macros(self):

        """
        Return whether any currently open element is a heading or a link, these
        being places where macros are not emitted directly.
        """

        for tag in self.elements:
            if tag in headings or tag == "a":
                return True
        return False

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace found directly within the
        element with the given 'name': nothing for purely structural elements,
        a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # The content is wrapped in an XHTML document so that it can be parsed as
    # a complete XML document.

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4