#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from MoinMoin import wikiutil
from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

tags = {
    # XHTML tag     MoinMoin syntax
    "strong"        : "'''%s'''",
    "em"            : "''%s''",
    "u"             : "__%s__",
    "del"           : "--(%s)--",
    "sup"           : "^%s^",
    "sub"           : ",,%s,,",
    "code"          : "`%s`",
    "tbody"         : "%s",
    "tr"            : "%s",
    "th"            : "'''%s'''",
    "td"            : "%s",
    "blockquote"    : " %s",
    "small"         : "~-%s-~",
    "big"           : "~+%s+~",
    "p"             : "%s",
    "ol"            : "%s",
    "ul"            : "%s",
    "ac:link"       : "[[%s%s|%s]]",
    "ac:image"      : "{{%s%s|%s}}",
    "a"             : "[[%s|%s]]",
    }

# Merge in the block-level translations supplied by the common module
# ('blocktypes' arrives via the star import above).

tags.update(blocktypes)

simple_tags = {
    # XHTML tag     MoinMoin syntax
    "br"            : "<<BR>>",
    }

simple_preformatted_tags = {
    # XHTML tag     MoinMoin syntax
    "br"            : "\n",
    }

list_tags = {
    # XHTML list tag    MoinMoin list item syntax
    "ol"                : "1. %s",
    "ul"                : "* %s",
    }

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        """
        Initialise the parser with 'out', an object whose 'write' method
        receives the translated text of top-level elements.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indents = [0]
        self.max_level = self.level = 0

        # Nesting counters for the elements whose depth affects the output.

        self.states = dict.fromkeys(preformatted_tags + single_level_tags, 0)

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the opening of the
        element with the given 'name' and 'attrs', delegating the remaining
        work to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.
        # NOTE(review): relies on the base class having pushed this element's
        # NOTE(review): attributes onto self.attributes - confirm in xmlread.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Undo the state changes made by startElement for the element with the
        given 'name', delegating to the base class in between so that the
        element is processed with the appropriate indentation in effect.
        """

        in_region = name in preformatted_tags or name in formatted_tags

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if in_region:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        # Leaving the outermost region resets the recorded maximum depth.

        if in_region:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

    def characters(self, content):

        """
        Pass the character data in 'content' to the base class, normalising
        whitespace first unless a preformatted element is open.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Append the character for the HTML entity with the given 'name' to the
        current text nodes. Entities not known to htmlentitydefs are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the "ac:name" attribute found one
        # level up the attribute stack.

        elif name == "ac:default-parameter":
            self.macro_parameters[self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences are passed through
        # without repeating the markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")
            target = anchor or self.target
            label = self.label or text.strip()

            # Without any target, emit only the label instead of formatting
            # None into the link markup.

            if target:
                text = conversion % (prefix, target, label or target)
            else:
                text = label

            self.target = self.target_type = self.label = None

        elif name == "a":

            # Anchors lacking an href cannot form a link: emit the label alone
            # instead of formatting None into the link markup.

            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or u""

            self.target = self.target_type = self.label = None

        # Macros currently pass their accumulated text through unchanged; the
        # branch prevents the generic conversions below from applying.

        elif name == "ac:macro":
            pass

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing runs of whitespace directly inside the
        element with the given 'name': nothing for document, table and list
        containers, and a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced according to the
        element with the given 'name' (see get_replacement).
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # Wrap the fragment in a complete XHTML document for the parser.

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4