1.1 --- a/parser.py Fri Dec 14 01:51:01 2012 +0100
1.2 +++ b/parser.py Mon Dec 17 02:06:24 2012 +0100
1.3 @@ -31,7 +31,14 @@
1.4 3. Each block is then parsed.
1.5 """
1.6
1.7 +try:
1.8 + from cStringIO import StringIO
1.9 +except ImportError:
1.10 + from StringIO import StringIO
1.11 +
1.12 +from xmlread import Parser
1.13 import re
1.14 +import sys
1.15
1.16 URL_SCHEMES = ("http", "https", "ftp", "mailto")
1.17
1.18 @@ -373,6 +380,123 @@
1.19 "warning" : "wiki warning",
1.20 }
1.21
# XML dialect syntax parsing.

# Conversion templates keyed by element name. Each template is filled in with
# the text collected for the element; "ac:link" takes three values (a target
# prefix, the target itself and the link label).

tags = {
    "strong": "'''%s'''",
    "em": "''%s''",
    "u": "__%s__",
    "del": "--(%s)--",
    "sup": "^%s^",
    "sub": ",,%s,,",
    "code": "`%s`",
    "pre": "{{{%s}}}",
    "blockquote": " %s",
    "small": "~-%s-~",
    "big": "~+%s+~",
    "p": "%s\n\n",
    "ac:plain-text-body": "{{{%s}}}",
    "ac:link": "[[%s%s|%s]]",
}

# Merge in the block type definitions, which replace any entries above that
# share the same key.

tags.update(blocktypes)

# Templates applied to "li" elements, selected using the name of the enclosing
# list element.

list_tags = {
    "ol": " 1. %s\n",
    "ul": " * %s\n",
}

# Elements describing link targets, mapped to the attribute providing the
# target.

link_target_tags = {
    "ri:page": "ri:content-title",
    "ri:attachment": "ri:filename",
}

# Runs of two or more newlines, collapsed when emitting top-level text.

normalise_regexp_str = r"\n\n+"
normalise_regexp = re.compile(normalise_regexp_str, re.DOTALL)
1.55 +
class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        "Initialise the parser, writing converted text to the stream 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target information, recorded when a link target element is
        # handled and consumed when the enclosing "ac:link" element is handled.

        self.target = None
        self.target_type = None

    def handleElement(self, name):

        """
        Convert the text collected for the element with the given 'name' and
        pass it on to the parent element's text or, where no parent text
        collection exists, write it to the output stream.

        NOTE(review): 'self.text', 'self.elements' and 'self.attributes' are
        used as stacks with the current element's details last; they are
        presumably maintained by the Parser base class - confirm against
        xmlread.
        """

        text = "".join(self.text[-1])

        # Always bind the conversion so that no branch leaves it undefined.

        conversion = None

        # Handle list elements, using the enclosing element to choose the
        # kind of list item.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information. Such elements contribute no text
        # of their own.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        if name == "ac:link":
            target = self.target
            target_type = self.target_type

            # Forget the target so that a later link without a recognised
            # target element cannot reuse this one.

            self.target = None
            self.target_type = None

            # Only produce link markup where a target was found; otherwise,
            # the label text is retained as it is instead of producing a link
            # to "None".

            if target is not None and conversion:
                if target_type == "ri:attachment":
                    prefix = "attachment:"
                else:
                    prefix = "../"

                text = conversion % (prefix, target, text or target)

        # Handle the common case.

        elif text and conversion:
            text = conversion % text

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            self.text[-2].append(text)

        # Otherwise, emit the text with normalised newlines.

        else:
            self.out.write(normalise_regexp.sub("\n\n", text))
1.114 +
def xmlparse(s, out):

    """
    Translate the content in the string 's', writing the result to 'out'.

    The content is placed inside the body of an XHTML document, encoded as
    UTF-8 and then handled by a ConfluenceXMLParser writing to 'out'. Since 's'
    is interpolated into a unicode template, it is expected to be text that
    can be combined with unicode.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    repaired = s.replace("]] >", "]]>")

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
 PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % repaired

    # Give the parser a file-like object and always release it afterwards.

    stream = StringIO(document.encode("utf-8"))
    try:
        ConfluenceXMLParser(out).parse(stream)
    finally:
        stream.close()
1.138 +
1.139 # General parsing.
1.140
1.141 def parse(s, out):
1.142 @@ -430,8 +554,6 @@
1.143 print >>out
1.144
if __name__ == "__main__":

    # Translate the content read from standard input, writing the result to
    # standard output. This relies on the module-level import of sys.

    parse(sys.stdin.read(), sys.stdout)
1.150