#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from MoinMoin import wikiutil
from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

tags = {
    # XHTML tag     MoinMoin syntax
    "strong"        : "'''%s'''",
    "em"            : "''%s''",
    "u"             : "__%s__",
    "del"           : "--(%s)--",
    "sup"           : "^%s^",
    "sub"           : ",,%s,,",
    "code"          : "`%s`",
    "tbody"         : "%s",
    "tr"            : "%s",
    "th"            : "'''%s'''",
    "td"            : "%s",
    "blockquote"    : " %s",
    "small"         : "~-%s-~",
    "big"           : "~+%s+~",
    "p"             : "%s",
    "ol"            : "%s",
    "ul"            : "%s",
    "ac:link"       : "[[%s%s|%s]]",
    "ac:image"      : "{{%s%s|%s}}",
    }

# Merge in the block-level translations provided by the common module
# (blocktypes arrives via the wildcard import above).

for tag, translation in blocktypes.items():
    tags[tag] = translation

simple_tags = {
    # XHTML tag     MoinMoin syntax
    "br"            : "<<BR>>",
    }

list_tags = {
    # XHTML list tag    MoinMoin list item syntax
    "ol"                : "1. %s",
    "ul"                : "* %s",
    }

indented_tags = ["li", "p"]

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        """
        Initialise the parser with the given 'out' stream, to which the
        translated text of the outermost element is eventually written.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indent = 0
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation and nesting state for the opening of the
        element with the given 'name' and 'attrs', then defer to the base
        parser.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indent += 1

        # Track element nesting.

        elif name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.
        # NOTE(review): this relies on Parser.startElement having pushed the
        # NOTE(review): element's attributes onto self.attributes - confirm
        # NOTE(review): against xmlread.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Let the base parser complete the element with the given 'name', then
        unwind the indentation and nesting state recorded by startElement.
        """

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Leaving the outermost section resets the depth tracking.

            if not self.level:
                self.max_level = 0

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless
        the content appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = "".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = "/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text
            text = ""

        # Discard macro state.

        elif name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

        # Remember macro information.

        elif name in ("ac:parameter", "ac:default-parameter"):
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Handle single-level tags.
        # Nested occurrences of the same inline tag pass their text through
        # unchanged, leaving the outermost occurrence to apply the markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")

            # Fall back to empty strings so that missing target or label
            # details never appear as the literal text "None" in the output.

            target = anchor or self.target or ""
            label = self.label or text or self.target or anchor or ""
            text = conversion % (prefix, target, label)
            self.target = self.target_type = self.label = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows.
        # Separators are only inserted before the second and later cells/rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate this element from any text the parent already holds.

            if "".join(self.text[-2]):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags:
                    nodes.append("\n")
                elif name in list_tags:
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return a true value if any preformatted element is currently open."

        return reduce(operator.or_, [self.states[tag] for tag in preformatted_tags], False)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing runs of whitespace found directly within
        the element with the given 'name': nothing for purely structural
        elements and lists, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    A byte string is decoded as UTF-8 before use. Mixing it directly into the
    Unicode document template would otherwise decode it implicitly as ASCII
    and fail with UnicodeDecodeError for any non-ASCII content.
    """

    if isinstance(s, str):
        s = s.decode("utf-8")

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Read and write UTF-8 so that non-ASCII page content survives the trip.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4