1.1 --- a/parser.py Mon Dec 17 02:06:24 2012 +0100
1.2 +++ b/parser.py Tue Dec 18 23:06:19 2012 +0100
1.3 @@ -39,6 +39,7 @@
1.4 from xmlread import Parser
1.5 import re
1.6 import sys
1.7 +import operator
1.8
1.9 URL_SCHEMES = ("http", "https", "ftp", "mailto")
1.10
1.11 @@ -383,6 +384,7 @@
1.12 # XML dialect syntax parsing.
1.13
1.14 tags = {
1.15 + # XHTML tag MoinMoin syntax
1.16 "strong" : "'''%s'''",
1.17 "em" : "''%s''",
1.18 "u" : "__%s__",
1.19 @@ -394,25 +396,48 @@
1.20 "blockquote" : " %s",
1.21 "small" : "~-%s-~",
1.22 "big" : "~+%s+~",
1.23 - "p" : "%s\n\n",
1.24 + "p" : "\n%s\n",
1.25 + "ol" : "\n%s",
1.26 + "ul" : "\n%s",
1.27 "ac:plain-text-body" : "{{{%s}}}",
1.28 "ac:link" : "[[%s%s|%s]]",
1.29 }
1.30 -
1.31 -tags.update(blocktypes)
1.32 +
1.33 +for tag, translation in blocktypes.items():
1.34 + tags[tag] = "\n%s\n" % translation
1.35 +
1.36 +simple_tags = {
1.37 + # XHTML tag MoinMoin syntax
1.38 + "br" : "<<BR>>",
1.39 + }
1.40
1.41 list_tags = {
1.42 - "ol" : " 1. %s\n",
1.43 - "ul" : " * %s\n",
1.44 + # XHTML list tag MoinMoin list item syntax
1.45 + "ol" : "1. %s\n",
1.46 + "ul" : "* %s\n",
1.47 }
1.48
1.49 +indented_tags = ["li", "p"]
1.50 +
1.51 link_target_tags = {
1.52 + # Confluence element Attribute providing the target
1.53 "ri:page" : "ri:content-title",
1.54 "ri:attachment" : "ri:filename",
1.55 }
1.56
1.57 -normalise_regexp_str = r"\n\n+"
1.58 -normalise_regexp = re.compile(normalise_regexp_str, re.DOTALL)
1.59 +macro_rich_text_styles = {
1.60 + # Confluence style MoinMoin admonition style
1.61 + "note" : "caution",
1.62 + "warning" : "warning",
1.63 + "info" : "important",
1.64 + "tip" : "tip",
1.65 + }
1.66 +
1.67 +normalise_regexp_str = r"\s+"
1.68 +normalise_regexp = re.compile(normalise_regexp_str)
1.69 +
1.70 +normalise_end_regexp_str = r"\s\s+$"
1.71 +normalise_end_regexp = re.compile(normalise_end_regexp_str)
1.72
1.73 class ConfluenceXMLParser(Parser):
1.74
1.75 @@ -427,8 +452,50 @@
1.76 self.target = None
1.77 self.target_type = None
1.78
1.79 + # Macro information.
1.80 +
1.81 + self.macro = None
1.82 + self.macro_parameters = {}
1.83 +
1.84 + # Indentation and preformatted states.
1.85 +
1.86 + self.indent = 0
1.87 + self.states = {}
1.88 + for name in ("pre", "ac:plain-text-body"):
1.89 + self.states[name] = 0
1.90 +
1.91 + # ContentHandler-related methods.
1.92 +
1.93 + def startElement(self, name, attrs):
1.94 + if list_tags.has_key(name):
1.95 + self.indent += 1
1.96 + elif self.states.has_key(name):
1.97 + self.states[name] += 1
1.98 + Parser.startElement(self, name, attrs)
1.99 +
1.100 + def endElement(self, name):
1.101 + Parser.endElement(self, name)
1.102 + if list_tags.has_key(name):
1.103 + self.indent -= 1
1.104 + elif self.states.has_key(name):
1.105 + self.states[name] -= 1
1.106 +
1.107 + def characters(self, content):
1.108 + if not self.is_preformatted():
1.109 + content = self.normalise(content, self.elements[-1])
1.110 + Parser.characters(self, content)
1.111 +
1.112 + def skippedEntity(self, name):
1.113 + if name == "mdash":
1.114 + self.text[-1].append(u"\u2014")
1.115 + elif name == "ndash":
1.116 + self.text[-1].append(u"\u2013")
1.117 +
1.118 + # Parser-related methods.
1.119 +
1.120 def handleElement(self, name):
1.121 text = "".join(self.text[-1])
1.122 + conversion = None
1.123
1.124 # Handle list elements.
1.125
1.126 @@ -443,6 +510,15 @@
1.127 self.target_type = name
1.128 text = ""
1.129
1.130 + # Remember macro information.
1.131 +
1.132 + elif name == "ac:parameter":
1.133 + self.macro_parameters[self.attributes[-1].get("ac:name")] = text
1.134 + text = ""
1.135 +
1.136 + elif name == "ac:macro":
1.137 + self.macro = self.attributes[-1].get("ac:name")
1.138 +
1.139 # Handle the common case.
1.140
1.141 else:
1.142 @@ -450,6 +526,8 @@
1.143
1.144 # Attempt to convert the text.
1.145
1.146 + # Links require target information.
1.147 +
1.148 if name == "ac:link":
1.149 if self.target_type == "ri:attachment":
1.150 prefix = "attachment:"
1.151 @@ -457,21 +535,66 @@
1.152 prefix = "../"
1.153
1.154 text = conversion % (prefix, self.target, text or self.target)
1.155 + self.target = self.target_type = None
1.156 +
1.157 + # Macro name information is used to style rich text body regions.
1.158 +
1.159 + elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
1.160 + details = macro_rich_text_styles[self.macro]
1.161 + title = self.macro_parameters.get("title")
1.162 + if title:
1.163 + details = "%s\n\n%s" % (details, title)
1.164 + text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
1.165 + self.macro = None
1.166 + self.macro_parameters = {}
1.167
1.168 # Handle the common case.
1.169
1.170 elif text and conversion:
1.171 text = conversion % text
1.172 + elif simple_tags.has_key(name):
1.173 + text = simple_tags[name]
1.174 +
1.175 + # Normalise leading whitespace and indent the text if appropriate.
1.176 +
1.177 + if name in indented_tags:
1.178 + text = " " * self.indent + text.lstrip()
1.179
1.180 # Add the converted text to the end of the parent element's text nodes.
1.181
1.182 if len(self.text) > 1:
1.183 + preceding = "".join(self.text[-2])
1.184 +
1.185 + if not self.is_preformatted():
1.186 + preceding = self.normalise_end(preceding, self.elements[-2])
1.187 +
1.188 + self.text[-2] = [preceding]
1.189 self.text[-2].append(text)
1.190
1.191 - # Otherwise, emit the text with normalised newlines.
1.192 + # Otherwise, emit the text.
1.193
1.194 else:
1.195 - self.out.write(normalise_regexp.sub("\n\n", text))
1.196 + self.out.write(text)
1.197 +
1.198 + def is_preformatted(self):
1.199 + return reduce(operator.or_, self.states.values(), False)
1.200 +
1.201 + def get_replacement(self, name, end=False):
1.202 + if list_tags.has_key(name):
1.203 + if end:
1.204 + return "\n"
1.205 + else:
1.206 + return ""
1.207 + elif name == "body":
1.208 + return "\n\n"
1.209 + else:
1.210 + return " "
1.211 +
1.212 + def normalise(self, text, name):
1.213 + return normalise_regexp.sub(self.get_replacement(name), text)
1.214 +
1.215 + def normalise_end(self, text, name):
1.216 + return normalise_end_regexp.sub(self.get_replacement(name, True), text)
1.217
1.218 def xmlparse(s, out):
1.219