# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1355868379 -3600
# Node ID 547de21df3d4c2a106c46982e43dc65d0b08af25
# Parent  e0920cd5997032df3e9f24ce48cb2310f7d95ffe
Attempted to normalise whitespace and to ensure that the different block
elements retain their separation and relationships in the generated markup.
Added support for admonitions through the recognition of certain Confluence
macro invocations.
Added support for "skipped entities" such as &mdash; and &ndash; found in the
Confluence XHTML markup.

diff -r e0920cd59970 -r 547de21df3d4 parser.py
--- a/parser.py	Mon Dec 17 02:06:24 2012 +0100
+++ b/parser.py	Tue Dec 18 23:06:19 2012 +0100
@@ -39,6 +39,7 @@
 from xmlread import Parser
 import re
 import sys
+import operator
 
 URL_SCHEMES = ("http", "https", "ftp", "mailto")
 
@@ -383,6 +384,7 @@
 # XML dialect syntax parsing.
 
 tags = {
+    # XHTML tag               MoinMoin syntax
     "strong"                : "'''%s'''",
     "em"                    : "''%s''",
     "u"                     : "__%s__",
@@ -394,25 +396,48 @@
     "blockquote"            : " %s",
     "small"                 : "~-%s-~",
     "big"                   : "~+%s+~",
-    "p"                     : "%s\n\n",
+    "p"                     : "\n%s\n",
+    "ol"                    : "\n%s",
+    "ul"                    : "\n%s",
     "ac:plain-text-body"    : "{{{%s}}}",
     "ac:link"               : "[[%s%s|%s]]",
     }
-    
-tags.update(blocktypes)
+
+for tag, translation in blocktypes.items():
+    tags[tag] = "\n%s\n" % translation
+
+simple_tags = {
+    # XHTML tag               MoinMoin syntax
+    "br"                    : "<<BR>>",
+    }
 
 list_tags = {
-    "ol"                    : " 1. %s\n",
-    "ul"                    : " * %s\n",
+    # XHTML list tag          MoinMoin list item syntax
+    "ol"                    : "1. %s\n",
+    "ul"                    : "* %s\n",
     }
 
+indented_tags = ["li", "p"]
+
 link_target_tags = {
+    # Confluence element      Attribute providing the target
     "ri:page"               : "ri:content-title",
     "ri:attachment"         : "ri:filename",
     }
 
-normalise_regexp_str = r"\n\n+"
-normalise_regexp = re.compile(normalise_regexp_str, re.DOTALL)
+macro_rich_text_styles = {
+    # Confluence style        MoinMoin admonition style
+    "note"                  : "caution",
+    "warning"               : "warning",
+    "info"                  : "important",
+    "tip"                   : "tip",
+    }
+
+normalise_regexp_str = r"\s+"
+normalise_regexp = re.compile(normalise_regexp_str)
+
+normalise_end_regexp_str = r"\s\s+$"
+normalise_end_regexp = re.compile(normalise_end_regexp_str)
 
 class ConfluenceXMLParser(Parser):
 
@@ -427,8 +452,50 @@
         self.target = None
         self.target_type = None
 
+        # Macro information.
+
+        self.macro = None
+        self.macro_parameters = {}
+
+        # Indentation and preformatted states.
+
+        self.indent = 0
+        self.states = {}
+        for name in ("pre", "ac:plain-text-body"):
+            self.states[name] = 0
+
+    # ContentHandler-related methods.
+
+    def startElement(self, name, attrs):
+        if list_tags.has_key(name):
+            self.indent += 1
+        elif self.states.has_key(name):
+            self.states[name] += 1
+        Parser.startElement(self, name, attrs)
+
+    def endElement(self, name):
+        Parser.endElement(self, name)
+        if list_tags.has_key(name):
+            self.indent -= 1
+        elif self.states.has_key(name):
+            self.states[name] -= 1
+
+    def characters(self, content):
+        if not self.is_preformatted():
+            content = self.normalise(content, self.elements[-1])
+        Parser.characters(self, content)
+
+    def skippedEntity(self, name):
+        if name == "mdash":
+            self.text[-1].append(u"\u2014")
+        elif name == "ndash":
+            self.text[-1].append(u"\u2013")
+
+    # Parser-related methods.
+
     def handleElement(self, name):
         text = "".join(self.text[-1])
+        conversion = None
 
         # Handle list elements.
 
@@ -443,6 +510,15 @@
             self.target_type = name
             text = ""
 
+        # Remember macro information.
+
+        elif name == "ac:parameter":
+            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
+            text = ""
+
+        elif name == "ac:macro":
+            self.macro = self.attributes[-1].get("ac:name")
+
         # Handle the common case.
 
         else:
@@ -450,6 +526,8 @@
 
         # Attempt to convert the text.
 
+        # Links require target information.
+
         if name == "ac:link":
             if self.target_type == "ri:attachment":
                 prefix = "attachment:"
@@ -457,21 +535,66 @@
                 prefix = "../"
 
             text = conversion % (prefix, self.target, text or self.target)
+            self.target = self.target_type = None
+
+        # Macro name information is used to style rich text body regions.
+
+        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
+            details = macro_rich_text_styles[self.macro]
+            title = self.macro_parameters.get("title")
+            if title:
+                details = "%s\n\n%s" % (details, title)
+            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
+            self.macro = None
+            self.macro_parameters = {}
 
         # Handle the common case.
 
         elif text and conversion:
             text = conversion % text
+        elif simple_tags.has_key(name):
+            text = simple_tags[name]
+
+        # Normalise leading whitespace and indent the text if appropriate.
+
+        if name in indented_tags:
+            text = " " * self.indent + text.lstrip()
 
         # Add the converted text to the end of the parent element's text nodes.
 
         if len(self.text) > 1:
+            preceding = "".join(self.text[-2])
+
+            if not self.is_preformatted():
+                preceding = self.normalise_end(preceding, self.elements[-2])
+
+            self.text[-2] = [preceding]
             self.text[-2].append(text)
 
-        # Otherwise, emit the text with normalised newlines.
+        # Otherwise, emit the text.
 
         else:
-            self.out.write(normalise_regexp.sub("\n\n", text))
+            self.out.write(text)
+
+    def is_preformatted(self):
+        return reduce(operator.or_, self.states.values(), False)
+
+    def get_replacement(self, name, end=False):
+        if list_tags.has_key(name):
+            if end:
+                return "\n"
+            else:
+                return ""
+        elif name == "body":
+            return "\n\n"
+        else:
+            return " "
+
+    def normalise(self, text, name):
+        return normalise_regexp.sub(self.get_replacement(name), text)
+
+    def normalise_end(self, text, name):
+        return normalise_end_regexp.sub(self.get_replacement(name, True), text)
 
 def xmlparse(s, out):