1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki XML/XHTML syntax parsing. 5 6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 """ 23 24 try: 25 from cStringIO import StringIO 26 except ImportError: 27 from StringIO import StringIO 28 29 from MoinMoin import wikiutil 30 from common import * 31 from xmlread import Parser 32 import re 33 import sys 34 import operator 35 import htmlentitydefs 36 import codecs 37 38 # XML dialect syntax parsing. 

# Parameterised translations: each value is a format string receiving the
# converted content of the element (links and images receive several values).

tags = {
    # XHTML tag                 MoinMoin syntax
    "strong"                    : "'''%s'''",
    "em"                        : "''%s''",
    "u"                         : "__%s__",
    "del"                       : "--(%s)--",
    "sup"                       : "^%s^",
    "sub"                       : ",,%s,,",
    "code"                      : "`%s`",
    "tbody"                     : "%s",
    "tr"                        : "%s",
    "th"                        : "'''%s'''",
    "td"                        : "%s",
    "blockquote"                : " %s",
    "small"                     : "~-%s-~",
    "big"                       : "~+%s+~",
    "p"                         : "%s",
    "ol"                        : "%s",
    "ul"                        : "%s",
    "ac:link"                   : "[[%s%s%s|%s]]",
    "ac:image"                  : "{{%s%s%s|%s}}",
    "a"                         : "[[%s|%s]]",
    }

# Merge in the block type translations, which replace any entry above having
# the same tag name. (blocktypes is not defined in this file; presumably it is
# provided by the wildcard import from the common module.)
# A plain update is used instead of a loop so that no loop variables are left
# behind as module-level names.

tags.update(blocktypes)

# Unparameterised translations used outside preformatted regions.

simple_tags = {
    # XHTML tag                 MoinMoin syntax
    "br"                        : "<<BR>>",
    }

# Unparameterised translations used inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag                 MoinMoin syntax
    "br"                        : "\n",
    }

# Translations applied to list items according to the enclosing list element.

list_tags = {
    # XHTML list tag            MoinMoin list item syntax
    "ol"                        : "1. %s",
    "ul"                        : "* %s",
    }

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags

# The key collections are converted to lists explicitly so that the
# concatenation does not depend on keys() returning a list.

block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element        Attributes providing the target
    "ri:page"                   : ("ri:space-key", "ri:content-title"),
    "ri:attachment"             : ("ri:filename",),
    "ri:user"                   : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

# Names whose values may be used as the label of a link.

link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element        MoinMoin link prefix
    "ri:attachment"             : "attachment:",
    "ri:user"                   : "",
    }

macro_rich_text_styles = {
    # Confluence style          MoinMoin admonition style
    "note"                      : "caution",
    "warning"                   : "warning",
    "info"                      : "important",
    "tip"                       : "tip",
    "excerpt"                   : "",
    }

macroargs = {
    # Confluence macro          Confluence and MoinMoin macro arguments
    "color"                     : ("color", "col"),
    }

macrotypes = {
    # Confluence macro          MoinMoin syntax
    "anchor"                    : "<<Anchor(%(anchor)s)>>",
    "color"                     : "<<Color2(%(content)s, %(args)s)>>",
    "toc"                       : "<<TableOfContents>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        """
        Initialise the parser, writing converted top-level text to 'out', which
        must provide a 'write' method.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and region depth states for the opening
        of the element with the given 'name' and 'attrs', delegating to the
        base class and then recording any macro name.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held back from a previous heading are discarded.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.
        # NOTE(review): this relies on the base class having pushed the
        # NOTE(review): attributes of this element - confirm against xmlread.

        if name == "ac:macro":
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Restore the indentation, nesting and region depth states upon the
        closing of the element with the given 'name', delegating to the base
        class in between.
        """

        is_region = name in preformatted_tags or name in formatted_tags

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if is_region:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        # Upon leaving the outermost region, forget the maximum depth reached.

        if is_region:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Pass the given character 'content' to the base class, normalising
        whitespace first unless inside a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Unrecognised entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            conversion = list_tags.get(self.elements[-2])

        # Remember link target information.

        elif name in link_target_tags:
            self._remember_link_target(name)
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name == "ac:link-body":
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name given by the attributes
        # one level up (the macro name where the parent is the macro element).

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences add no further markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:
            conversion = self._get_region_conversion(name, text)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")

            # Without any target element, use an empty target instead of
            # letting None be formatted into the markup.

            target = self.target or ""
            anchor = self.attributes[-1].get("ac:anchor")
            if anchor:
                fragment = "#%s" % anchor
            else:
                fragment = ""
            label = self.label or text.strip() or target
            text = conversion % (prefix, target, fragment, label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # An element without any href attribute is not a link: emit only
            # its label instead of a link to "None".

            if self.target is None:
                text = self.label or ""
            else:
                text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            macro = self.macros[-1]
            conversion = macrotypes.get(macro)
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(macro)
                if argnames:
                    confargname, moinargname = argnames
                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[-1][confargname]))
                text = conversion % parameters

                # Anchors are held back where macros are not permitted.

                if macro == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        self._add_to_parent(name, text)

    def _remember_link_target(self, name):

        """
        Record the link target, target type and (if not already set) the label
        using the attributes of the current element with the given 'name'.
        """

        target_details = []

        # Get target details from the element's attributes.

        for attrname in link_target_tags[name]:
            attrvalue = self.attributes[-1].get(attrname)
            if not attrvalue:
                continue

            # Obtain a link label.

            if attrname in link_label_attributes and not self.label:
                self.label = attrvalue

            # Validate any page title.

            if attrname == "ri:content-title":
                attrvalue = get_page_title(attrvalue)

            target_details.append(attrvalue)

            # Insert any prefix required for the link.

            prefix = link_target_prefixes.get(attrname)
            if prefix:
                target_details.insert(0, prefix)

        # Make a link based on the details.

        self.target = u"/".join(target_details)
        self.target_type = name

    def _get_region_conversion(self, name, text):

        """
        Return a format string wrapping the content of the preformatted or
        formatted region element with the given 'name' and content 'text'.
        """

        # Nest the section appropriately.

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level

        # Macro name information is used to style rich text body regions.

        if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
            details = macro_rich_text_styles[self.macros[-1]]
            title = self.macro_parameters[-1].get("title")
            if title:
                details = "%s\n\n%s" % (details, title)

            # The details become part of a format string, so any percent sign
            # (in a title, for instance) must be escaped to survive the later
            # substitution of the content.

            details = details.replace("%", "%%")
            return "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

        if name == "table":
            return "%s#!table\n%%s\n%s" % (opening, closing)

        # Preformatted sections containing newlines must contain an initial
        # newline.

        if "\n" in text and not text.startswith("\n"):
            opening += "\n"

        return "%s%%s%s" % (opening, closing)

    def _add_to_parent(self, name, text):

        """
        Add the converted 'text' of the element with the given 'name' to the
        end of the parent element's text nodes, inserting newline separators
        where required, or write it out if there is no parent text.
        """

        # Emit the text at the top level of the document.

        if len(self.text) <= 1:
            self.out.write(text)
            return

        nodes = self.text[-2]
        parent = self.elements[-2]
        is_block = name in block_tags
        overrides_span = name in span_override_tags

        # Where preceding text exists, add any blank line separators.

        if u"".join(nodes):

            # All top-level elements are separated with blank lines.

            if parent == "body":
                nodes.append("\n")

            # Block elements always cause a new line to be started.

            if is_block or self.have_block and not overrides_span:
                nodes.append("\n")

            self.have_block = False

        # Lists inside lists require separation.

        elif name in list_tags and parent == "li":
            nodes.append("\n")

        # Without preceding text, save any block node state for non-block
        # elements so that newline separators can be added at another
        # level.

        elif is_block and parent not in block_tags:
            self.have_block = True

        elif not is_block and self.have_block and not overrides_span:
            self.have_block = True

        else:
            self.have_block = False

        nodes.append(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        "Return whether any open element is a heading or an 'a' element."

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing whitespace runs found directly within the
        element with the given 'name': nothing for structural and list
        elements, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4