# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1355706384 -3600
# Node ID e0920cd5997032df3e9f24ce48cb2310f7d95ffe
# Parent  4df6e1afb172cbe6205e50b95329b2e8c0f5bf19
Added initial support for parsing and converting Confluence 4 XHTML content.

diff -r 4df6e1afb172 -r e0920cd59970 convert.py
--- a/convert.py	Fri Dec 14 01:51:01 2012 +0100
+++ b/convert.py	Mon Dec 17 02:06:24 2012 +0100
@@ -28,6 +28,7 @@
 import codecs
 import xmlread
 import parser
+import sys
 
 MAX_TITLE_LENGTH = 120
 
@@ -46,7 +47,14 @@
         "Handle objects according to type."
 
         objecttype = attributes[-1]["class"]
+
+        # Any identifier is stored as the object's textual content.
+
         identifier = text.strip()
+
+        # The content is a dictionary mapping names to properties and
+        # collections.
+
         content = self.content
 
         pages_dir = join(self.space, "pages")
@@ -126,12 +134,21 @@
             if not body:
                 body = "## Empty page."
 
+            # NOTE: Very simple technique employed for guessing the format.
+
             if no_translate:
                 fn = write
+            elif body.startswith("<"):
+                fn = xmltranslate
             else:
                 fn = translate
 
-            fn(join(versions_dir, content["content"]), body)
+            try:
+                fn(join(versions_dir, content["content"]), body)
+            except:
+                print >>sys.stderr, "Error parsing..."
+                print >>sys.stderr, body
+                raise
 
         self.content = {}
 
@@ -201,19 +218,24 @@
     finally:
         f.close()
 
-def translate(filename, body):
+def translate(filename, body, fn=None):
 
     """
     Write to the file with the given 'filename' a translation of the given
     'body'.
     """
 
+    # Employ the given parsing function 'fn', or parser.parse by default.
+    fn = fn or parser.parse
     out = codecs.open(filename, "w", encoding="utf-8")
     try:
-        parser.parse(body, out)
+        fn(body, out)
     finally:
         out.close()
 
+def xmltranslate(filename, body):  # as translate, but using parser.xmlparse
+    translate(filename, body, parser.xmlparse)
+
 def sort_manifest(filename, pagetitle, output=None):
 
     """
@@ -256,8 +278,6 @@
         append(output, s)
 
 if __name__ == "__main__":
-    import sys
-
     try:
         filename = sys.argv[1]
         is_zipfile = splitext(filename)[-1] == extsep + "zip"
diff -r 4df6e1afb172 -r e0920cd59970 parser.py
--- a/parser.py	Fri Dec 14 01:51:01 2012 +0100
+++ b/parser.py	Mon Dec 17 02:06:24 2012 +0100
@@ -31,7 +31,14 @@
  3. Each block is then parsed.
 """
 
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
+
+from xmlread import Parser
 import re
+import sys
 
 URL_SCHEMES = ("http", "https", "ftp", "mailto")
 
@@ -373,6 +380,123 @@
     "warning" : "wiki warning",
     }
 
+# XML dialect syntax parsing: formats for element text, keyed by element name.
+# The "ac:link" format instead takes a link prefix, target and label.
+tags = {
+    "strong"                : "'''%s'''",
+    "em"                    : "''%s''",
+    "u"                     : "__%s__",
+    "del"                   : "--(%s)--",
+    "sup"                   : "^%s^",
+    "sub"                   : ",,%s,,",
+    "code"                  : "`%s`",
+    "pre"                   : "{{{%s}}}",
+    "blockquote"            : " %s",
+    "small"                 : "~-%s-~",
+    "big"                   : "~+%s+~",
+    "p"                     : "%s\n\n",
+    "ac:plain-text-body"    : "{{{%s}}}",
+    "ac:link"               : "[[%s%s|%s]]",
+    }
+    
+tags.update(blocktypes)
+# Formats for "li" element text, chosen using the name of the parent element.
+list_tags = {
+    "ol"                    : " 1. %s\n",
+    "ul"                    : " * %s\n",
+    }
+# Link target element names, each with the attribute providing the target.
+link_target_tags = {
+    "ri:page"               : "ri:content-title",
+    "ri:attachment"         : "ri:filename",
+    }
+# Runs of two or more newlines, replaced by a single blank line upon output.
+normalise_regexp_str = r"\n\n+"
+normalise_regexp = re.compile(normalise_regexp_str, re.DOTALL)
+
+class ConfluenceXMLParser(Parser):
+
+    "Convert Confluence 4 page revision content, writing the result to 'out'."
+
+    def __init__(self, out):
+        Parser.__init__(self)
+        self.out = out
+
+        # Link target details, kept until replaced by a later target element.
+
+        self.target = None
+        self.target_type = None
+
+    def handleElement(self, name):
+        text = "".join(self.text[-1])
+
+        # Format list items using self.elements[-2], presumably the parent list.
+
+        if name == "li" and len(self.elements) > 1:
+            list_tag = self.elements[-2]
+            conversion = list_tags.get(list_tag)
+
+        # Record link targets, clearing the text so no conversion is applied.
+
+        elif link_target_tags.has_key(name):
+            self.target = self.attributes[-1].get(link_target_tags[name])
+            self.target_type = name
+            text = ""
+
+        # Otherwise, look up any format defined for the element itself.
+
+        else:
+            conversion = tags.get(name)
+
+        # Build links from a prefix, the recorded target and any link text.
+
+        if name == "ac:link":
+            if self.target_type == "ri:attachment":
+                prefix = "attachment:"
+            else:
+                prefix = "../"
+
+            text = conversion % (prefix, self.target, text or self.target)
+
+        # For other elements, apply any format found to non-empty text.
+
+        elif text and conversion:
+            text = conversion % text
+
+        # Add the converted text to the end of the parent element's text nodes.
+
+        if len(self.text) > 1:
+            self.text[-2].append(text)
+
+        # Otherwise, emit the text with normalised newlines.
+
+        else:
+            self.out.write(normalise_regexp.sub("\n\n", text))
+
+def xmlparse(s, out):
+
+    "Parse the content in the string 's', writing a translation to 'out'."
+
+    # NOTE: CDATA sections appear to have erroneous endings, "]] >", which are
+    # corrected to "]]>" as the content is wrapped in an XHTML document.
+    s = u"""\
+<?xml version="1.0"?>
+<!DOCTYPE html 
+     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<body>
+%s
+</body>
+</html>""" % s.replace("]] >", "]]>")
+
+    f = StringIO(s.encode("utf-8"))
+    try:
+        parser = ConfluenceXMLParser(out)
+        parser.parse(f)
+    finally:
+        f.close()
+
 # General parsing.
 
 def parse(s, out):
@@ -430,8 +554,6 @@
             print >>out
 
 if __name__ == "__main__":
-    import sys
-
     s = sys.stdin.read()
     parse(s, sys.stdout)