ConfluenceConverter (file xmlparser.py at bffc66ce38a1)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    39     40 tags = {    41     # XHTML tag               MoinMoin syntax    42     "strong"                : "'''%s'''",    43     "em"                    : "''%s''",    44     "u"                     : "__%s__",    45     "del"                   : "--(%s)--",    46     "sup"                   : "^%s^",    47     "sub"                   : ",,%s,,",    48     "code"                  : "`%s`",    49     "tbody"                 : "%s",    50     "tr"                    : "%s",    51     "th"                    : "'''%s'''",    52     "td"                    : "%s",    53     "blockquote"            : " %s",    54     "small"                 : "~-%s-~",    55     "big"                   : "~+%s+~",    56     "p"                     : "%s",    57     "ol"                    : "%s",    58     "ul"                    : "%s",    59     "ac:link"               : "[[%s%s%s|%s]]",    60     "ac:image"              : "{{%s%s%s|%s}}",    61     "a"                     : "[[%s|%s]]",    62     }    63     64 for tag, translation in blocktypes.items():    65     tags[tag] = translation    66     67 simple_tags = {    68     # XHTML tag               MoinMoin syntax    69     "br"                    : "<<BR>>",    70     }    71     72 simple_preformatted_tags = {    73     # XHTML tag               MoinMoin syntax    74     "br"                    : "\n",    75     }    76     77 list_tags = {    78     # XHTML list tag          MoinMoin list item syntax    79     "ol"                    : "1. %s",    80     "ul"                    : "* %s",    81     }    82     83 preformatted_tags = ["pre", "ac:plain-text-body"]    84 single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]    85 formatted_tags    = ["ac:rich-text-body", "table"]    86     87 indented_tags = ["li", "p"] + preformatted_tags + formatted_tags    88 block_tags = indented_tags + blocktypes.keys() + list_tags.keys()    89 span_override_tags = ["ac:link"]    90     91 link_target_tags = {    92     # Confluence element      Attributes providing the target    93     "ri:page"               : ("ri:space-key", "ri:content-title"),    94     "ri:attachment"         : ("ri:filename",),    95     "ri:user"               : ("ri:username",),    96     }    97     98 link_target_prefixes = {    99     # Attribute with details  Prefix ensuring correct relative link   100     "ri:space-key"          : "..",   101     "ri:content-title"      : "..",   102     }   103    104 link_label_attributes = "ri:content-title", "ac:link-body"   105    106 # NOTE: User links should support the intended user namespace prefix.   107    108 link_target_types = {   109     # Confluence element      MoinMoin link prefix   110     "ri:attachment"         : "attachment:",   111     "ri:user"               : "",   112     }   113    114 macro_rich_text_styles = {   115     # Confluence style        MoinMoin admonition style   116     "note"                  : "caution",   117     "warning"               : "warning",   118     "info"                  : "important",   119     "tip"                   : "tip",   120     "excerpt"               : "",   121     }   122    123 macroargs = {   124     # Confluence macro        Confluence and MoinMoin macro arguments   125     "color"                 : ("color", "col"),   126     }   127    128 macrotypes = {   129     # Confluence macro        MoinMoin syntax   130     "anchor"                : "<<Anchor(%(anchor)s)>>",   131     "color"                 : "<<Color2(%(content)s, %(args)s)>>",   132     }   133    134 normalise_regexp_str = r"\s+"   135 normalise_regexp = re.compile(normalise_regexp_str)   136    137 class ConfluenceXMLParser(Parser):   138    139     "Handle content from Confluence 4 page revisions."   140    141     def __init__(self, out):   142         Parser.__init__(self)   143         self.out = out   144    145         # Link target and label information.   146    147         self.target = None   148         self.target_type = None   149         self.label = None   150    151         # Macro information.   152    153         self.macro = None   154         self.macro_parameters = {}   155         self.held_anchors = []   156    157         # Indentation and element nesting states.   158    159         self.indents = [0]   160         self.states = {}   161         self.max_level = self.level = 0   162    163         for name in preformatted_tags + single_level_tags:   164             self.states[name] = 0   165    166         # Table states.   167    168         self.table_rows = 0   169         self.table_columns = 0   170    171         # Block states.   172    173         self.have_block = False   174    175     # ContentHandler-related methods.   176    177     def startElement(self, name, attrs):   178    179         # Track indentation for lists.   180    181         if list_tags.has_key(name):   182             self.indents.append(self.indents[-1] + 1)   183    184         # Track element nesting.   185    186         if self.states.has_key(name):   187             self.states[name] += 1   188    189         # Track cumulative element nesting in order to produce appropriate depth   190         # indicators in the formatted output.   191    192         if name in preformatted_tags or name in formatted_tags:   193             self.level += 1   194             self.max_level = max(self.level, self.max_level)   195    196             # Reset indentation within regions.   197    198             self.indents.append(0)   199    200         if name in headings:   201             self.held_anchors = []   202    203         Parser.startElement(self, name, attrs)   204    205         # Remember macro information for use within the element.   206    207         if name == "ac:macro":   208             self.macro = self.attributes[-1].get("ac:name")   209    210     def endElement(self, name):   211    212         # Reset the indent for any preformatted/formatted region so that it may   213         # itself be indented.   214    215         if name in preformatted_tags or name in formatted_tags:   216             self.indents.pop()   217    218         Parser.endElement(self, name)   219    220         if list_tags.has_key(name):   221             self.indents.pop()   222    223         if self.states.has_key(name):   224             self.states[name] -= 1   225    226         if name in preformatted_tags or name in formatted_tags:   227             self.level -= 1   228             if not self.level:   229                 self.max_level = 0   230    231         # Discard macro state.   232    233         if name == "ac:macro":   234             self.macro = None   235             self.macro_parameters = {}   236    237     def characters(self, content):   238         if not self.is_preformatted():   239             content = self.normalise(content, self.elements[-1])   240         Parser.characters(self, content)   241    242     def skippedEntity(self, name):   243         ch = htmlentitydefs.name2codepoint.get(name)   244         if ch:   245             self.text[-1].append(unichr(ch))   246    247     # Parser-related methods.   248    249     def handleElement(self, name):   250    251         """   252         Handle the completion of the element with the given 'name'. Any content   253         will either be recorded for later use (by an enclosing element, for   254         example) or emitted in some form.   255         """   256    257         text = u"".join(self.text[-1])   258    259         # Handle state.   260    261         if name == "table":   262             self.table_rows = 0   263         elif name == "tr":   264             self.table_columns = 0   265    266         # Find conversions.   267    268         conversion = None   269    270         # Handle list elements.   271    272         if name == "li" and len(self.elements) > 1:   273             list_tag = self.elements[-2]   274             conversion = list_tags.get(list_tag)   275    276         # Remember link target information.   277    278         elif link_target_tags.has_key(name):   279             target_details = []   280    281             # Get target details from the element's attributes.   282    283             for attrname in link_target_tags[name]:   284                 attrvalue = self.attributes[-1].get(attrname)   285                 if attrvalue:   286    287                     # Obtain a link label.   288    289                     if attrname in link_label_attributes and not self.label:   290                         self.label = attrvalue   291    292                     # Validate any page title.   293    294                     if attrname == "ri:content-title":   295                         attrvalue = get_page_title(attrvalue)   296                     target_details.append(attrvalue)   297    298                     # Insert any prefix required for the link.   299    300                     prefix = link_target_prefixes.get(attrname)   301                     if prefix:   302                         target_details.insert(0, prefix)   303    304             # Make a link based on the details.   305    306             self.target = u"/".join(target_details)   307             self.target_type = name   308             text = ""   309    310         # For anchor links, just use the raw text and let Moin do the formatting.   311    312         elif name == "ac:link-body":   313             self.label = text.strip()   314             text = ""   315    316         # For conventional links, remember the href attribute as the target.   317    318         elif name == "a":   319             self.target = self.attributes[-1].get("href")   320             self.label = text.strip()   321             text = ""   322    323         # Remember macro information.   324    325         elif name == "ac:parameter":   326             self.macro_parameters[self.attributes[-1].get("ac:name")] = text   327             text = ""   328    329         elif name == "ac:default-parameter":   330             self.macro_parameters[self.attributes[-2].get("ac:name")] = text   331             text = ""   332    333         # Handle single-level tags.   334    335         elif name in single_level_tags and self.states[name] > 1:   336             conversion = "%s"   337    338         # Handle preformatted sections.   339    340         elif name in preformatted_tags or name in formatted_tags:   341    342             # Nest the section appropriately.   343    344             level = 3 + self.max_level - self.level   345             opening = "{" * level   346             closing = "}" * level   347    348             # Macro name information is used to style rich text body regions.   349    350             if name != "table" and self.macro and macro_rich_text_styles.has_key(self.macro):   351                 details = macro_rich_text_styles[self.macro]   352                 title = self.macro_parameters.get("title")   353                 if title:   354                     details = "%s\n\n%s" % (details, title)   355    356                 conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)   357    358             elif name == "table":   359                 conversion = "%s#!table\n%%s\n%s" % (opening, closing)   360    361             else:   362                 # Preformatted sections containing newlines must contain an initial   363                 # newline.   364    365                 if text.find("\n") != -1 and not text.startswith("\n"):   366                     opening += "\n"   367    368                 conversion = "%s%%s%s" % (opening, closing)   369    370         # Handle the common case and simpler special cases.   371    372         if not conversion:   373             conversion = tags.get(name)   374    375    376    377         # Attempt to convert the text.   378    379         # Links require target information.   380    381         if name in ("ac:link", "ac:image"):   382             prefix = link_target_types.get(self.target_type, "")   383             anchor = self.attributes[-1].get("ac:anchor") or ""   384             label = self.label or text.strip() or self.target   385             text = conversion % (prefix, self.target, anchor and ("#%s" % anchor) or "", label)   386             self.target = self.target_type = self.label = None   387    388         elif name == "a":   389             text = conversion % (self.target, self.label or self.target)   390             self.target = self.target_type = self.label = None   391    392         # Macros require various kinds of information.   393         # Some macros affect the formatting of their contents, whereas other   394         # simpler macros are handled here.   395    396         elif name == "ac:macro":   397             conversion = macrotypes.get(self.macro)   398             if conversion:   399                 parameters = {"content" : text}   400                 parameters.update(self.macro_parameters)   401                 argnames = macroargs.get(self.macro)   402                 if argnames:   403                     confargname, moinargname = argnames   404                     parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[confargname]))   405                 text = conversion % parameters   406                 if self.macro == "anchor" and self.forbids_macros():   407                     self.held_anchors.append(text)   408                     text = ""   409    410         # Handle the common cases for parameterised and unparameterised   411         # substitutions.   412    413         elif text and conversion:   414             text = conversion % text   415         elif simple_tags.has_key(name) and not self.is_preformatted():   416             text = simple_tags[name]   417         elif simple_preformatted_tags.has_key(name) and self.is_preformatted():   418             text = simple_preformatted_tags[name]   419    420    421    422         # Postprocess table columns and rows.   423    424         if name in ("th", "td"):   425             if self.table_columns:   426                 text = "\n|| %s" % text   427             self.table_columns += 1   428         elif name == "tr":   429             if self.table_rows:   430                 text = "\n==\n%s" % text   431             self.table_rows += 1   432    433         # Postprocess held anchor tags in headings.   434    435         elif name in headings and self.held_anchors:   436             text = "%s\n%s" % ("".join(self.held_anchors), text)   437    438    439    440         # Normalise leading whitespace and indent the text if appropriate.   441    442         if name in indented_tags:   443             text = " " * self.indents[-1] + text.lstrip()   444    445         # Add the converted text to the end of the parent element's text nodes.   446    447         if len(self.text) > 1:   448             nodes = self.text[-2]   449             parent = self.elements[-2]   450    451             # Where preceding text exists, add any blank line separators.   452    453             if u"".join(nodes):   454    455                 # All top-level elements are separated with blank lines.   456    457                 if parent == "body":   458                     nodes.append("\n")   459    460                 # Block elements always cause a new line to be started.   461    462                 if name in block_tags or self.have_block and name not in span_override_tags:   463                     nodes.append("\n")   464    465                 self.have_block = False   466    467             # Lists inside lists require separation.   468    469             elif list_tags.has_key(name) and parent == "li":   470                 nodes.append("\n")   471    472             # Without preceding text, save any block node state for non-block   473             # elements so that newline separators can be added at another   474             # level.   475    476             elif name in block_tags and parent not in block_tags:   477                 self.have_block = True   478    479             elif name not in block_tags and self.have_block and name not in span_override_tags:   480                 self.have_block = True   481    482             else:   483                 self.have_block = False   484    485             nodes.append(text)   486    487         # Otherwise, emit the text (at the top level of the document).   488    489         else:   490             self.out.write(text)   491    492     def is_preformatted(self):   493         return reduce(operator.or_, [self.states[tag] for tag in preformatted_tags], False)   494    495     def forbids_macros(self):   496         return reduce(operator.or_, [(tag in headings or tag == "a") for tag in self.elements], False)   497    498     # Whitespace normalisation.   499    500     def get_replacement(self, name):   501         if name in ("html", "body", "table", "tbody", "tr") or list_tags.has_key(name):   502             return ""   503         else:   504             return " "   505    506     def normalise(self, text, name):   507         return normalise_regexp.sub(self.get_replacement(name), text)   508    509 def parse(s, out):   510    511     "Parse the content in the string 's', writing a translation to 'out'."   512    513     # NOTE: CDATA sections appear to have erroneous endings.   514    515     s = u"""\   516 <?xml version="1.0"?>   517 <!DOCTYPE html    518      PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"   519      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">   520 <html xmlns="http://www.w3.org/1999/xhtml">   521 <body>   522 %s   523 </body>   524 </html>""" % s.replace("]] >", "]]>")   525    526     f = StringIO(s.encode("utf-8"))   527     try:   528         parser = ConfluenceXMLParser(out)   529         parser.parse(f)   530     finally:   531         f.close()   532    533 if __name__ == "__main__":   534     s = codecs.getreader("utf-8")(sys.stdin).read()   535     out = codecs.getwriter("utf-8")(sys.stdout)   536     parse(s, out)   537    538 # vim: tabstop=4 expandtab shiftwidth=4