#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

# NOTE(review): this file was recovered from a whitespace-collapsed copy, so
# NOTE(review): runs of spaces inside string literals (for example the
# NOTE(review): "blockquote" translation and the list indentation unit) may
# NOTE(review): have been longer originally - confirm against the upstream
# NOTE(review): source.

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "pre"                   : "{{{%s}}}",
    "table"                 : "{{{#!table\n%s\n}}}",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:plain-text-body"    : "{{{%s}}}",
    "ac:link"               : "[[%s%s|%s]]",
    }

# Merge in the block type translations; 'blocktypes' is not defined in this
# file and is presumably provided by the star import from 'common'.

tags.update(blocktypes)

simple_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "<<BR>>",
    }

list_tags = {
    # XHTML list tag        MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element    Attribute providing the target
    "ri:page"               : "ri:content-title",
    "ri:attachment"         : "ri:filename",
    "ri:user"               : "ri:username",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        """
        Initialise the parser so that translated text is written to 'out',
        which must provide a 'write' method.
        """

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Parameters of enclosing macros, saved while a nested macro is open
        # so that each macro only ever sees its own parameters.

        self._enclosing_macro_parameters = []

        # Indentation and preformatted states.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Track list nesting, preformatted regions and macro scopes for the
        element 'name' before delegating to the base class.
        """

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1
        elif name == "ac:macro":

            # Start a fresh parameter scope: parameters of previous or
            # enclosing macros must not be visible to this macro.

            self._enclosing_macro_parameters.append(self.macro_parameters)
            self.macro_parameters = {}

        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        """
        Delegate to the base class and then undo the nesting state recorded
        by 'startElement' for the element 'name'.
        """

        Parser.endElement(self, name)
        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

    def characters(self, content):

        """
        Pass 'content' to the base class, collapsing whitespace first unless
        the current position is inside a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity 'name' to the current text.
        Entities that are not known to htmlentitydefs are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Translate the completed element 'name' to MoinMoin syntax, adding the
        result to the text of the parent element or, where there is no parent
        text, writing it to the output stream.

        The 'text', 'elements' and 'attributes' stacks used here are not
        defined in this class and are presumably maintained by the Parser
        base class.
        """

        text = "".join(self.text[-1]).strip()

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.

        if name == "ac:link":

            # Without a recorded target (for example, a link carrying only an
            # anchor) only the label can be kept; formatting None would
            # produce a bogus "[[../None|...]]" link.

            if self.target is not None:
                if self.target_type == "ri:attachment":
                    prefix = "attachment:"
                elif self.target_type == "ri:user":
                    prefix = ""
                else:
                    prefix = "../"

                text = conversion % (prefix, self.target, text or self.target)

            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and self.macro in macro_rich_text_styles:
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Leave the macro's parameter scope whether or not the macro was
        # styled, restoring the parameters of any enclosing macro.

        if name == "ac:macro":
            self.macro = None
            if self._enclosing_macro_parameters:
                self.macro_parameters = self._enclosing_macro_parameters.pop()
            else:
                self.macro_parameters = {}

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate this element from any preceding sibling text.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags:
                    nodes.append("\n")
                elif name in list_tags and parent == "li":
                    nodes.append("\n")

            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states.values())

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing whitespace runs found directly within the
        element 'name': nothing for structural elements, otherwise a space.
        """

        if name in ("html", "body") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each whitespace run replaced as appropriate for
        the enclosing element 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The string 's' should be a Unicode object (or a plain string containing
    only ASCII) since it is inserted into a Unicode document template.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    # The whitespace between the lines of the DOCTYPE declaration is not
    # significant to an XML parser.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
    PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Decode the input explicitly: interpolating undecoded bytes into the
    # Unicode template in 'parse' would raise UnicodeDecodeError for any
    # non-ASCII content.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4