1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki XML/XHTML syntax parsing. 5 6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 """ 23 24 try: 25 from cStringIO import StringIO 26 except ImportError: 27 from StringIO import StringIO 28 29 from MoinMoin import wikiutil 30 from common import * 31 from xmlread import Parser 32 import re 33 import sys 34 import operator 35 import htmlentitydefs 36 import codecs 37 38 # XML dialect syntax parsing. 
# Translation tables mapping XHTML/Confluence elements to MoinMoin markup.
# Each "%s" placeholder receives the already-converted element content.

tags = {
    # XHTML tag     MoinMoin syntax
    "strong"        : "'''%s'''",
    "em"            : "''%s''",
    "u"             : "__%s__",
    "del"           : "--(%s)--",
    "sup"           : "^%s^",
    "sub"           : ",,%s,,",
    "code"          : "`%s`",
    "tbody"         : "%s",
    "tr"            : "%s",
    "th"            : "'''%s'''",
    "td"            : "%s",
    "blockquote"    : " %s",
    "small"         : "~-%s-~",
    "big"           : "~+%s+~",
    "p"             : "%s",
    "ol"            : "%s",
    "ul"            : "%s",
    # Links and images take four values: prefix, target, anchor and label.
    "ac:link"       : "[[%s%s%s|%s]]",
    "ac:image"      : "{{%s%s%s|%s}}",
    # Conventional links take two values: target and label.
    "a"             : "[[%s|%s]]",
    }

# Merge in the block-level translations. 'blocktypes' is not defined in this
# module (presumably it arrives through the star import from 'common').
# Using update avoids leaving loop variables behind in the module namespace.

tags.update(blocktypes)

simple_tags = {
    # XHTML tag     MoinMoin syntax
    "br"            : "<<BR>>",
    }

simple_preformatted_tags = {
    # XHTML tag     MoinMoin syntax
    "br"            : "\n",
    }

list_tags = {
    # XHTML list tag    MoinMoin list item syntax
    "ol"                : "1. %s",
    "ul"                : "* %s",
    }

# Element classifications used by the parser when tracking nesting state and
# deciding where line breaks and indentation are needed.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags

# The keys are converted to lists explicitly so that the concatenation does
# not depend on keys() returning a list.

block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

# Attributes and elements that may supply a link label.

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.
link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

macroargs = {
    # Confluence macro      Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro      MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "toc"                   : "<<TableOfContents>>",
    }

# Runs of whitespace are collapsed outside preformatted regions.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        """
        Initialise the parser, writing converted top-level content to 'out'
        (an object providing a 'write' method).
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information: stacks of macro names and parameter dictionaries,
        # plus anchors held back while inside elements forbidding macros.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the opening of the
        element with the given 'name' and 'attrs'.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held for a previous heading do not apply to a new one.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Restore the indentation, nesting and macro state upon the closing of
        the element with the given 'name'.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        # Leaving the outermost region resets the recorded maximum depth.

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless
        inside a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.
            # NOTE(review): the nesting of the append and the prefix insertion
            # under the value test is reconstructed; confirm against history.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)
                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name of the enclosing
        # element (the second-to-last set of attributes).

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences of the same tag do not
        # repeat the markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately: outer regions get more braces
            # than the regions they contain.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string applied to the
                # text below, so any percent signs in a title must be escaped.

                details = details.replace("%", "%%")
                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if "\n" in text and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")

            # Links without a resource (anchor-only links, for example) have
            # no target: use an empty one instead of formatting None.

            target = self.target or u""
            anchor = self.attributes[-1].get("ac:anchor") or ""
            fragment = ("#%s" % anchor) if anchor else ""
            label = self.label or text.strip() or target or anchor
            text = conversion % (prefix, target, fragment, label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Without a href there is nothing to link to: keep only the label.

            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or ""
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macros[-1])
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])
                if argnames:
                    confargname, moinargname = argnames
                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[-1][confargname]))
                text = conversion % parameters

                # Anchors cannot appear where macros are forbidden: hold them
                # for insertion before the enclosing heading.

                if self.macros[-1] == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows: every cell or row after the first
        # is preceded by a separator.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.
            # NOTE(review): the reset of have_block is reconstructed as
            # applying whenever preceding text exists; confirm against history.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            # Keep the state when propagating through non-block elements.

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether the current position is inside any preformatted element."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        "Return whether any open element is a heading or a conventional link."

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the replacement for whitespace runs directly inside the element
        with the given 'name': nothing for purely structural elements, a single
        space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with whitespace runs replaced as appropriate for the
        element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # The fragment is wrapped in a complete XHTML document for parsing.
    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4