# HG changeset patch # User Paul Boddie # Date 1355706384 -3600 # Node ID e0920cd5997032df3e9f24ce48cb2310f7d95ffe # Parent 4df6e1afb172cbe6205e50b95329b2e8c0f5bf19 Added initial support for parsing and converting Confluence 4 XHTML content. diff -r 4df6e1afb172 -r e0920cd59970 convert.py --- a/convert.py Fri Dec 14 01:51:01 2012 +0100 +++ b/convert.py Mon Dec 17 02:06:24 2012 +0100 @@ -28,6 +28,7 @@ import codecs import xmlread import parser +import sys MAX_TITLE_LENGTH = 120 @@ -46,7 +47,14 @@ "Handle objects according to type." objecttype = attributes[-1]["class"] + + # Any identifier is stored as the object's textual content. + identifier = text.strip() + + # The content is a dictionary mapping names to properties and + # collections. + content = self.content pages_dir = join(self.space, "pages") @@ -126,12 +134,21 @@ if not body: body = "## Empty page." + # NOTE: Very simple technique employed for guessing the format. + if no_translate: fn = write + elif body.startswith("<"): + fn = xmltranslate else: fn = translate - fn(join(versions_dir, content["content"]), body) + try: + fn(join(versions_dir, content["content"]), body) + except: + print >>sys.stderr, "Error parsing..." + print >>sys.stderr, body + raise self.content = {} @@ -201,19 +218,24 @@ finally: f.close() -def translate(filename, body): +def translate(filename, body, fn=None): """ Write to the file with the given 'filename' a translation of the given 'body'. """ + fn = fn or parser.parse + out = codecs.open(filename, "w", encoding="utf-8") try: - parser.parse(body, out) + fn(body, out) finally: out.close() +def xmltranslate(filename, body): + translate(filename, body, parser.xmlparse) + def sort_manifest(filename, pagetitle, output=None): """ @@ -256,8 +278,6 @@ append(output, s) if __name__ == "__main__": - import sys - try: filename = sys.argv[1] is_zipfile = splitext(filename)[-1] == extsep + "zip" diff -r 4df6e1afb172 -r e0920cd59970 parser.py --- a/parser.py Fri Dec 14 01:51:01 2012 +0100 +++ b/parser.py Mon Dec 17 02:06:24 2012 +0100 @@ -31,7 +31,14 @@ 3. Each block is then parsed. """ +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +from xmlread import Parser import re +import sys URL_SCHEMES = ("http", "https", "ftp", "mailto") @@ -373,6 +380,123 @@ "warning" : "wiki warning", } +# XML dialect syntax parsing. + +tags = { + "strong" : "'''%s'''", + "em" : "''%s''", + "u" : "__%s__", + "del" : "--(%s)--", + "sup" : "^%s^", + "sub" : ",,%s,,", + "code" : "`%s`", + "pre" : "{{{%s}}}", + "blockquote" : " %s", + "small" : "~-%s-~", + "big" : "~+%s+~", + "p" : "%s\n\n", + "ac:plain-text-body" : "{{{%s}}}", + "ac:link" : "[[%s%s|%s]]", + } + +tags.update(blocktypes) + +list_tags = { + "ol" : " 1. %s\n", + "ul" : " * %s\n", + } + +link_target_tags = { + "ri:page" : "ri:content-title", + "ri:attachment" : "ri:filename", + } + +normalise_regexp_str = r"\n\n+" +normalise_regexp = re.compile(normalise_regexp_str, re.DOTALL) + +class ConfluenceXMLParser(Parser): + + "Handle content from Confluence 4 page revisions." + + def __init__(self, out): + Parser.__init__(self) + self.out = out + + # Link target information. + + self.target = None + self.target_type = None + + def handleElement(self, name): + text = "".join(self.text[-1]) + + # Handle list elements. + + if name == "li" and len(self.elements) > 1: + list_tag = self.elements[-2] + conversion = list_tags.get(list_tag) + + # Remember link target information. + + elif link_target_tags.has_key(name): + self.target = self.attributes[-1].get(link_target_tags[name]) + self.target_type = name + text = "" + + # Handle the common case. + + else: + conversion = tags.get(name) + + # Attempt to convert the text. + + if name == "ac:link": + if self.target_type == "ri:attachment": + prefix = "attachment:" + else: + prefix = "../" + + text = conversion % (prefix, self.target, text or self.target) + + # Handle the common case. + + elif text and conversion: + text = conversion % text + + # Add the converted text to the end of the parent element's text nodes. + + if len(self.text) > 1: + self.text[-2].append(text) + + # Otherwise, emit the text with normalised newlines. + + else: + self.out.write(normalise_regexp.sub("\n\n", text)) + +def xmlparse(s, out): + + "Parse the content in the string 's', writing a translation to 'out'." + + # NOTE: CDATA sections appear to have erroneous endings. + + s = u"""\ + + + + +%s + +""" % s.replace("]] >", "]]>") + + f = StringIO(s.encode("utf-8")) + try: + parser = ConfluenceXMLParser(out) + parser.parse(f) + finally: + f.close() + # General parsing. def parse(s, out): @@ -430,8 +554,6 @@ print >>out if __name__ == "__main__": - import sys - s = sys.stdin.read() parse(s, sys.stdout)