1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki XML/XHTML syntax parsing. 5 6 Copyright (C) 2012, 2013, 2015 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 """ 23 24 try: 25 from cStringIO import StringIO 26 except ImportError: 27 from StringIO import StringIO 28 29 from MoinMoin import wikiutil 30 from common import * 31 from xmlread import Parser 32 import re 33 import sys 34 import operator 35 import htmlentitydefs 36 import codecs 37 38 # XML dialect syntax parsing. 
# Templates used to rewrite XHTML/Confluence elements as MoinMoin markup.
# Each "%s" slot is filled with the already converted element content; the
# link and image templates take several values (see their use in the parser).

tags = {
    # Element name          MoinMoin template
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Merge in the block type templates, letting them override any entry above.
# NOTE: 'blocktypes' is not defined in this file; presumably it is provided by
# NOTE: the star import from the common module.

tags.update(blocktypes)

# Elements replaced by fixed text outside preformatted regions.

simple_tags = {
    # Element name          MoinMoin text
    "br"                    : "<<BR>>",
    }

# Elements replaced by fixed text inside preformatted regions.

simple_preformatted_tags = {
    # Element name          MoinMoin text
    "br"                    : "\n",
    }

# Templates applied to list items, selected by the enclosing list element.

list_tags = {
    # List element          List item template
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Groups of element names consulted by the parser.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

# Attributes, in order, from which a link target is assembled for each
# resource element.

link_target_tags = {
    # Resource element      Attributes giving the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

# Path components inserted in front of a target for particular attributes.

link_target_prefixes = {
    # Attribute             Prefix
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

# Names whose values may serve as a link label.

link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.
link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

macroargs = {
    # Confluence macro      Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro      MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "toc"                   : "<<TableOfContents>>",
    }

# Runs of whitespace are collapsed when normalising non-preformatted text.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    The converted MoinMoin markup is written to the 'out' object given on
    construction. The 'text', 'elements' and 'attributes' stacks used by the
    methods below are not set up in this class; presumably they are maintained
    by the Parser base class (not visible here).
    """

    def __init__(self, out, is_comment_page=False):

        """
        Initialise the parser to write converted text to 'out'. Where
        'is_comment_page' is set, page link targets receive an additional
        prefix component (see handleElement).
        """

        Parser.__init__(self)
        self.out = out
        self.is_comment_page = is_comment_page

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Record nesting, indentation and macro state for the element with the
        given 'name' and 'attrs' before and after delegating to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held back for a previous heading do not apply to this one.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Undo the state changes made by startElement for the element with the
        given 'name', delegating to the base class in between.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Leaving the outermost region resets the recorded maximum depth.

            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Pass 'content' to the base class, normalising whitespace first unless
        the parser is inside a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text nodes. Entities without a known codepoint are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.

        The converted text is appended to the parent element's text nodes or,
        where there is no parent, written to the output object.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)

                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)

                        # Comment pages receive the prefix a second time.

                        if self.is_comment_page:
                            target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name in ("ac:link-body", "ac:plain-text-link-body"):
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name held by the attributes
        # one level up the stack.

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences add no further markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately: outer regions get more braces
            # than the regions they contain.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")

            # Without any target details, use an empty target instead of
            # formatting None into the markup as the text "None".

            target = self.target or ""

            anchor = self.attributes[-1].get("ac:anchor")
            fragment = ""
            if anchor:
                fragment = "#%s" % anchor

            label = self.label or text.strip() or target
            text = conversion % (prefix, target, fragment, label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # An absent href yields an empty target, not the text "None".

            target = self.target or ""
            text = conversion % (target, self.label or target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macros[-1])
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])
                if argnames:
                    confargname, moinargname = argnames
                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[-1][confargname]))
                text = conversion % parameters

                # Anchors inside headings or links are held back and emitted
                # ahead of the heading (see below).

                if self.macros[-1] == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or self.have_block and name not in span_override_tags:
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        "Return whether any element on the element stack is a heading or a link."

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing whitespace runs inside the element with the
        given 'name': nothing for structural and list elements, a single space
        otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        "Return 'text' with whitespace runs replaced as befits element 'name'."

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out, is_comment_page=False):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document, encoded as UTF-8 and given to
    a ConfluenceXMLParser created with 'out' and 'is_comment_page'.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out, is_comment_page)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Convert UTF-8 content from standard input, writing UTF-8 to standard
    # output.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4