1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki XML/XHTML syntax parsing. 5 6 Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 """ 23 24 try: 25 from cStringIO import StringIO 26 except ImportError: 27 from StringIO import StringIO 28 29 from MoinMoin import wikiutil 30 from common import * 31 from xmlread import Parser 32 import re 33 import sys 34 import operator 35 import htmlentitydefs 36 import codecs 37 38 # XML dialect syntax parsing. 
# Conversion templates applied by ConfluenceXMLParser.handleElement when no
# more specific conversion has been chosen for an element. Each value is a
# format string: most take the element's converted text, the link and image
# templates take (prefix, target, anchor, label), and "a" takes (target, label).

tags = {
    # XHTML tag                 MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Merge in the block type translations provided by the common module. Using
# update avoids leaving stray loop variables in the module namespace.

tags.update(blocktypes)

# Replacement text for empty elements outside preformatted regions.

simple_tags = {
    # XHTML tag                 MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# Replacement text for empty elements inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag                 MoinMoin syntax
    "br"                    : "\n",
    }

# Templates applied to "li" elements according to the enclosing list element.

list_tags = {
    # XHTML list tag            MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Elements whose content is emitted verbatim (no whitespace normalisation).

preformatted_tags = ["pre", "ac:plain-text-body"]

# Inline elements whose nested occurrences are emitted without repeating the
# surrounding markup.

single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]

# Elements emitted as nested {{{...}}} regions.

formatted_tags = ["ac:rich-text-body", "table"]

# Elements whose text is left-stripped and indented to the current list depth.

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags

# Elements that start a new line in their parent's text.

block_tags = indented_tags + blocktypes.keys() + list_tags.keys()

# Non-block elements that never receive the pending block separator.

span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element        Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

# Path components inserted at the front of a link target for each attribute
# that is present (inserted twice when converting a comment page).

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

# Names whose values may serve as a link label when none has been set yet.

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.
# Prefixes placed before the link target according to the kind of resource
# element that supplied the target.

link_target_types = {
    # Confluence element        MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

# Macros whose rich text body is emitted as a "#!wiki" region with the given
# style.

macro_rich_text_styles = {
    # Confluence style          MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

# For each macro, the Confluence parameter name and the MoinMoin argument name
# used to build the "args" value of the macro template.

macroargs = {
    # Confluence macro          Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

# Templates for macros that are converted directly. They are filled from a
# dictionary holding the macro parameters plus "content" and, where macroargs
# applies, "args".

macrotypes = {
    # Confluence macro          MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "toc"                   : "<<TableOfContents>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out, is_comment_page=False):

        """
        Initialise the parser to write converted text to 'out'. If
        'is_comment_page' is set, prefixed link targets receive their prefix
        twice.
        """

        Parser.__init__(self)
        self.out = out
        self.is_comment_page = is_comment_page

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the element with
        the given 'name' and 'attrs', delegating the actual element handling to
        the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held back for a heading only apply to that heading.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Undo the state changes made by startElement for the element with the
        given 'name', delegating the actual element handling to the base class.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        # Forget the maximum depth once the outermost region has been closed.

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Record the given text 'content', collapsing whitespace unless a
        preformatted element is open.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Unknown entity names are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.

        The converted text is appended to the parent element's text nodes or,
        for the outermost element, written to the output stream.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)

                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                        if self.is_comment_page:
                            target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name in ("ac:link-body", "ac:plain-text-link-body"):
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name given by the enclosing
        # element (presumably the macro itself).

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)



        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")

            # Links without a resource or link body (anchor-only links, for
            # example) have no target: use an empty one instead of formatting
            # None into the output.

            target = self.target or ""
            anchor = self.attributes[-1].get("ac:anchor") or ""
            label = self.label or text.strip() or target
            text = conversion % (prefix, target, ("#%s" % anchor) if anchor else "", label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Without a usable href there is nothing to link to, so only the
            # label text is emitted.

            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or ""

            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name in ("ac:macro", "ac:structured-macro"):
            conversion = macrotypes.get(self.macros[-1])

            # Produce the converted macro.

            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])

                # A macro lacking a parameter needed by its template cannot be
                # converted: report it and keep its content rather than
                # aborting the whole conversion.

                try:
                    if argnames:
                        confargname, moinargname = argnames
                        parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[-1][confargname]))

                    # Obtain the Moin macro with parameters substituted.

                    converted = conversion % parameters

                except KeyError:
                    print >>sys.stderr, "Missing parameter for macro", self.macros[-1]
                    print >>sys.stderr, "Macro has arguments", self.macro_parameters[-1]
                    print >>sys.stderr

                else:
                    text = converted

                    # Anchors that cannot appear here are emitted later,
                    # before the enclosing heading.

                    if self.macros[-1] == "anchor" and self.forbids_macros():
                        self.held_anchors.append(text)
                        text = ""

            # Warn about macros that are not converted.

            elif self.macros[-1] not in macro_rich_text_styles:
                print >>sys.stderr, "No conversion possible for macro", self.macros[-1]
                print >>sys.stderr, "Macro has arguments", self.macro_parameters[-1]
                print >>sys.stderr

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]



        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)



        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or self.have_block and name not in span_override_tags:
                    nodes.append("\n")

                    self.have_block = False

                # Lists inside lists require separation.
                # NOTE: With list tags included in block_tags, the preceding
                # NOTE: test already covers lists, so this branch is not reached.

                elif name in list_tags and parent == "li":
                    nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            # Keep any pending block state across non-block elements.

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any open element is a heading or an "a" element. Anchor
        macros encountered in such a context are held back by handleElement.
        """

        return any((tag in headings or tag == "a") for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace found directly within the
        element with the given 'name': nothing for document, table and list
        containers, and a single space elsewhere.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out, is_comment_page=False):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document before being parsed as UTF-8
    encoded XML. If 'is_comment_page' is set, it is passed on to the parser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out, is_comment_page)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Convert UTF-8 content from standard input to UTF-8 on standard output.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4