1.1 --- a/convert.py Fri Dec 14 01:51:01 2012 +0100
1.2 +++ b/convert.py Mon Dec 17 02:06:24 2012 +0100
1.3 @@ -28,6 +28,7 @@
1.4 import codecs
1.5 import xmlread
1.6 import parser
1.7 +import sys
1.8
1.9 MAX_TITLE_LENGTH = 120
1.10
1.11 @@ -46,7 +47,14 @@
1.12 "Handle objects according to type."
1.13
1.14 objecttype = attributes[-1]["class"]
1.15 +
1.16 + # Any identifier is stored as the object's textual content.
1.17 +
1.18 identifier = text.strip()
1.19 +
1.20 + # The content is a dictionary mapping names to properties and
1.21 + # collections.
1.22 +
1.23 content = self.content
1.24
1.25 pages_dir = join(self.space, "pages")
1.26 @@ -126,12 +134,21 @@
1.27 if not body:
1.28 body = "## Empty page."
1.29
1.30 + # NOTE: Very simple technique employed for guessing the format.
1.31 +
1.32 if no_translate:
1.33 fn = write
1.34 + elif body.startswith("<"):
1.35 + fn = xmltranslate
1.36 else:
1.37 fn = translate
1.38
1.39 - fn(join(versions_dir, content["content"]), body)
1.40 + try:
1.41 + fn(join(versions_dir, content["content"]), body)
1.42 + except:
1.43 + print >>sys.stderr, "Error parsing..."
1.44 + print >>sys.stderr, body
1.45 + raise
1.46
1.47 self.content = {}
1.48
1.49 @@ -201,19 +218,24 @@
1.50 finally:
1.51 f.close()
1.52
1.53 -def translate(filename, body):
1.54 +def translate(filename, body, fn=None):
1.55
1.56 """
1.57 Write to the file with the given 'filename' a translation of the given
1.58 'body'.
1.59 """
1.60
1.61 + fn = fn or parser.parse  # use the general parser unless a parse function 'fn' is given
1.62 +
1.63 out = codecs.open(filename, "w", encoding="utf-8")
1.64 try:
1.65 - parser.parse(body, out)
1.66 + fn(body, out)
1.67 finally:
1.68 out.close()
1.69
1.70 +def xmltranslate(filename, body):  # as translate, but parsing 'body' with parser.xmlparse
1.71 +    translate(filename, body, fn=parser.xmlparse)
1.72 +
1.73 def sort_manifest(filename, pagetitle, output=None):
1.74
1.75 """
1.76 @@ -256,8 +278,6 @@
1.77 append(output, s)
1.78
1.79 if __name__ == "__main__":
1.80 - import sys
1.81 -
1.82 try:
1.83 filename = sys.argv[1]
1.84 is_zipfile = splitext(filename)[-1] == extsep + "zip"
2.1 --- a/parser.py Fri Dec 14 01:51:01 2012 +0100
2.2 +++ b/parser.py Mon Dec 17 02:06:24 2012 +0100
2.3 @@ -31,7 +31,14 @@
2.4 3. Each block is then parsed.
2.5 """
2.6
2.7 +try:
2.8 + from cStringIO import StringIO
2.9 +except ImportError:
2.10 + from StringIO import StringIO
2.11 +
2.12 +from xmlread import Parser
2.13 import re
2.14 +import sys
2.15
2.16 URL_SCHEMES = ("http", "https", "ftp", "mailto")
2.17
2.18 @@ -373,6 +380,123 @@
2.19 "warning" : "wiki warning",
2.20 }
2.21
2.22 +# XML dialect syntax parsing.
2.23 +# Element names mapped to the format strings applied to their text.
2.24 +tags = {
2.25 + "strong" : "'''%s'''",
2.26 + "em" : "''%s''",
2.27 + "u" : "__%s__",
2.28 + "del" : "--(%s)--",
2.29 + "sup" : "^%s^",
2.30 + "sub" : ",,%s,,",
2.31 + "code" : "`%s`",
2.32 + "pre" : "{{{%s}}}",
2.33 + "blockquote" : " %s",
2.34 + "small" : "~-%s-~",
2.35 + "big" : "~+%s+~",
2.36 + "p" : "%s\n\n",
2.37 + "ac:plain-text-body" : "{{{%s}}}",
2.38 + "ac:link" : "[[%s%s|%s]]",
2.39 + }
2.40 +
2.41 +tags.update(blocktypes)
2.42 +# Format strings for "li" text, selected by the name of the list element holding the item.
2.43 +list_tags = {
2.44 + "ol" : " 1. %s\n",
2.45 + "ul" : " * %s\n",
2.46 + }
2.47 +# Resource element names mapped to the attribute that provides the link target.
2.48 +link_target_tags = {
2.49 + "ri:page" : "ri:content-title",
2.50 + "ri:attachment" : "ri:filename",
2.51 + }
2.52 +# Matches runs of two or more newlines; top-level output reduces each run to "\n\n".
2.53 +normalise_regexp_str = r"\n\n+"
2.54 +normalise_regexp = re.compile(normalise_regexp_str, re.DOTALL)
2.55 +
2.56 +class ConfluenceXMLParser(Parser):
2.57 +
2.58 +    "Handle content from Confluence 4 page revisions, writing to 'out'."
2.59 +
2.60 +    def __init__(self, out):
2.61 +        Parser.__init__(self)
2.62 +        self.out = out
2.63 +
2.64 +        # Link target information, recorded from resource elements and
2.65 +        # consumed when the next "ac:link" element is handled.
2.66 +        self.target = None
2.67 +        self.target_type = None
2.68 +
2.69 +    def handleElement(self, name):
2.70 +        "Translate the text collected for element 'name' and pass it on."
2.71 +        text = "".join(self.text[-1])
2.72 +        conversion = None
2.73 +
2.74 +        # List items take their format from the list element holding them.
2.75 +
2.76 +        if name == "li" and len(self.elements) > 1:
2.77 +            conversion = list_tags.get(self.elements[-2])
2.78 +
2.79 +        # Remember link target information.
2.80 +
2.81 +        elif name in link_target_tags:
2.82 +            self.target = self.attributes[-1].get(link_target_tags[name])
2.83 +            self.target_type = name
2.84 +            text = ""
2.85 +
2.86 +        # Handle the common case.
2.87 +
2.88 +        else:
2.89 +            conversion = tags.get(name)
2.90 +
2.91 +        # Write a link only when a target was recorded; otherwise keep the label.
2.92 +
2.93 +        if name == "ac:link":
2.94 +            if self.target is not None:
2.95 +                if self.target_type == "ri:attachment":
2.96 +                    prefix = "attachment:"
2.97 +                else:
2.98 +                    prefix = "../"
2.99 +                text = conversion % (prefix, self.target, text or self.target)
2.100 +
2.101 +            # Forget the target so that a later link cannot reuse it.
2.102 +            self.target = self.target_type = None
2.103 +
2.104 +        elif text and conversion:
2.105 +            text = conversion % text
2.106 +
2.107 +        # Add the converted text to the end of the parent element's text
2.108 +        # nodes or, at the top level, emit it with normalised newlines.
2.109 +
2.110 +        if len(self.text) > 1:
2.111 +            self.text[-2].append(text)
2.112 +        else:
2.113 +            self.out.write(normalise_regexp.sub("\n\n", text))
2.114 +
2.115 +def xmlparse(s, out):
2.116 +
2.117 +    "Translate the XML content of the string 's', writing the result to 'out'."
2.118 +
2.119 +    # NOTE: CDATA section endings appear as "]] >" and are repaired first.
2.120 +
2.121 +    repaired = s.replace("]] >", "]]>")
2.122 +    document = u"""\
2.123 +<?xml version="1.0"?>
2.124 +<!DOCTYPE html
2.125 + PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2.126 + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
2.127 +<html xmlns="http://www.w3.org/1999/xhtml">
2.128 +<body>
2.129 +%s
2.130 +</body>
2.131 +</html>""" % repaired
2.132 +
2.133 +    stream = StringIO(document.encode("utf-8"))
2.134 +    try:
2.135 +        ConfluenceXMLParser(out).parse(stream)
2.136 +    finally:
2.137 +        stream.close()
2.138 +
2.139 # General parsing.
2.140
2.141 def parse(s, out):
2.142 @@ -430,8 +554,6 @@
2.143 print >>out
2.144
2.145 if __name__ == "__main__":
2.146 - import sys
2.147 -
2.148 s = sys.stdin.read()
2.149 parse(s, sys.stdout)
2.150