# HG changeset patch # User Paul Boddie # Date 1355868379 -3600 # Node ID 547de21df3d4c2a106c46982e43dc65d0b08af25 # Parent e0920cd5997032df3e9f24ce48cb2310f7d95ffe Attempted to normalise whitespace and to ensure that the different block elements retain their separation and relationships in the generated markup. Added support for admonitions through the recognition of certain Confluence macro invocations. Added support for "skipped entities" such as &mdash; and &ndash; found in the Confluence XHTML markup. diff -r e0920cd59970 -r 547de21df3d4 parser.py --- a/parser.py Mon Dec 17 02:06:24 2012 +0100 +++ b/parser.py Tue Dec 18 23:06:19 2012 +0100 @@ -39,6 +39,7 @@ from xmlread import Parser import re import sys +import operator URL_SCHEMES = ("http", "https", "ftp", "mailto") @@ -383,6 +384,7 @@ # XML dialect syntax parsing. tags = { + # XHTML tag MoinMoin syntax "strong" : "'''%s'''", "em" : "''%s''", "u" : "__%s__", @@ -394,25 +396,48 @@ "blockquote" : " %s", "small" : "~-%s-~", "big" : "~+%s+~", - "p" : "%s\n\n", + "p" : "\n%s\n", + "ol" : "\n%s", + "ul" : "\n%s", "ac:plain-text-body" : "{{{%s}}}", "ac:link" : "[[%s%s|%s]]", } - -tags.update(blocktypes) + +for tag, translation in blocktypes.items(): + tags[tag] = "\n%s\n" % translation + +simple_tags = { + # XHTML tag MoinMoin syntax + "br" : "<<BR>>", + } list_tags = { - "ol" : " 1. %s\n", - "ul" : " * %s\n", + # XHTML list tag MoinMoin list item syntax + "ol" : "1. %s\n", + "ul" : "* %s\n", } +indented_tags = ["li", "p"] + link_target_tags = { + # Confluence element Attribute providing the target "ri:page" : "ri:content-title", "ri:attachment" : "ri:filename", } -normalise_regexp_str = r"\n\n+" -normalise_regexp = re.compile(normalise_regexp_str, re.DOTALL) +macro_rich_text_styles = { + # Confluence style MoinMoin admonition style + "note" : "caution", + "warning" : "warning", + "info" : "important", + "tip" : "tip", + } + +normalise_regexp_str = r"\s+" +normalise_regexp = re.compile(normalise_regexp_str) + +normalise_end_regexp_str = r"\s\s+$" +normalise_end_regexp = re.compile(normalise_end_regexp_str) class ConfluenceXMLParser(Parser): @@ -427,8 +452,50 @@ self.target = None self.target_type = None + # Macro information. + + self.macro = None + self.macro_parameters = {} + + # Indentation and preformatted states. + + self.indent = 0 + self.states = {} + for name in ("pre", "ac:plain-text-body"): + self.states[name] = 0 + + # ContentHandler-related methods. + + def startElement(self, name, attrs): + if list_tags.has_key(name): + self.indent += 1 + elif self.states.has_key(name): + self.states[name] += 1 + Parser.startElement(self, name, attrs) + + def endElement(self, name): + Parser.endElement(self, name) + if list_tags.has_key(name): + self.indent -= 1 + elif self.states.has_key(name): + self.states[name] -= 1 + + def characters(self, content): + if not self.is_preformatted(): + content = self.normalise(content, self.elements[-1]) + Parser.characters(self, content) + + def skippedEntity(self, name): + if name == "mdash": + self.text[-1].append(u"\u2014") + elif name == "ndash": + self.text[-1].append(u"\u2013") + + # Parser-related methods. + def handleElement(self, name): text = "".join(self.text[-1]) + conversion = None # Handle list elements.
@@ -443,6 +510,15 @@ self.target_type = name text = "" + # Remember macro information. + + elif name == "ac:parameter": + self.macro_parameters[self.attributes[-1].get("ac:name")] = text + text = "" + + elif name == "ac:macro": + self.macro = self.attributes[-1].get("ac:name") + # Handle the common case. else: @@ -450,6 +526,8 @@ # Attempt to convert the text. + # Links require target information. + if name == "ac:link": if self.target_type == "ri:attachment": prefix = "attachment:" @@ -457,21 +535,66 @@ prefix = "../" text = conversion % (prefix, self.target, text or self.target) + self.target = self.target_type = None + + # Macro name information is used to style rich text body regions. + + elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro): + details = macro_rich_text_styles[self.macro] + title = self.macro_parameters.get("title") + if title: + details = "%s\n\n%s" % (details, title) + text = "{{{#!wiki %s\n\n%s}}}" % (details, text) + self.macro = None + self.macro_parameters = {} # Handle the common case. elif text and conversion: text = conversion % text + elif simple_tags.has_key(name): + text = simple_tags[name] + + # Normalise leading whitespace and indent the text if appropriate. + + if name in indented_tags: + text = " " * self.indent + text.lstrip() # Add the converted text to the end of the parent element's text nodes. if len(self.text) > 1: + preceding = "".join(self.text[-2]) + + if not self.is_preformatted(): + preceding = self.normalise_end(preceding, self.elements[-2]) + + self.text[-2] = [preceding] self.text[-2].append(text) - # Otherwise, emit the text with normalised newlines. + # Otherwise, emit the text. 
else: - self.out.write(normalise_regexp.sub("\n\n", text)) + self.out.write(text) + + def is_preformatted(self): + return reduce(operator.or_, self.states.values(), False) + + def get_replacement(self, name, end=False): + if list_tags.has_key(name): + if end: + return "\n" + else: + return "" + elif name == "body": + return "\n\n" + else: + return " " + + def normalise(self, text, name): + return normalise_regexp.sub(self.get_replacement(name), text) + + def normalise_end(self, text, name): + return normalise_end_regexp.sub(self.get_replacement(name, True), text) def xmlparse(s, out):