1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki XML/XHTML syntax parsing. 5 6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 """ 23 24 try: 25 from cStringIO import StringIO 26 except ImportError: 27 from StringIO import StringIO 28 29 from MoinMoin import wikiutil 30 from common import * 31 from xmlread import Parser 32 import re 33 import sys 34 import operator 35 import htmlentitydefs 36 import codecs 37 38 # XML dialect syntax parsing. 
# Conversion tables used when translating elements to MoinMoin markup.
# Each template is filled in using the "%" operator with the converted
# content of the element concerned.

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Add the block type translations provided by the common module. Using update
# avoids leaving stray loop variables in the module namespace.

tags.update(blocktypes)

# Elements replaced by fixed text outside preformatted regions.

simple_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# Elements replaced by fixed text inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "\n",
    }

# Templates applied to list items according to the enclosing list element.

list_tags = {
    # XHTML list tag        MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Element classifications.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

# The key collections are converted to lists explicitly so that the
# concatenation does not depend on keys() returning a list.

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

# Attributes and elements whose values may be used as link labels.

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.
link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

macroargs = {
    # Confluence macro      Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro      MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    }

# Whitespace runs are collapsed by the parser's normalise method.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions, writing a MoinMoin
    translation of the parsed document to the 'out' stream supplied on
    initialisation.
    """

    def __init__(self, out):

        "Initialise the parser with the given 'out' stream."

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information. The stack holds the details of enclosing macros
        # while a nested macro is being processed.

        self.macro = None
        self.macro_parameters = {}
        self._macro_stack = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro states for the element with
        the given 'name' and 'attrs' being opened.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held for a previous heading do not apply to a new heading.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element, preserving
        # the details of any enclosing macro so that they can be restored
        # when this macro ends.

        if name == "ac:macro":
            self._macro_stack.append((self.macro, self.macro_parameters))
            self.macro = self.attributes[-1].get("ac:name")
            self.macro_parameters = {}

    def endElement(self, name):

        """
        Complete the element with the given 'name', undoing the state changes
        made when the element was opened.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state, reinstating that of any enclosing macro.

        if name == "ac:macro":
            if self._macro_stack:
                self.macro, self.macro_parameters = self._macro_stack.pop()
            else:
                self.macro = None
                self.macro_parameters = {}

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless
        the content appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Record the character for the entity with the given 'name' where the
        entity is a known HTML entity. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)

                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name of the macro itself,
        # this being found in the attributes of the parent element.

        elif name == "ac:default-parameter":
            self.macro_parameters[self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags. Nested occurrences add no further markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.
            # NOTE: Any non-table region inside a styled macro is treated in
            # NOTE: this way, not only the rich text body itself.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)



        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""

            # Links to anchors in the same page have no resource target: an
            # empty target must be used instead of formatting None.

            target = self.target or ""
            label = self.label or text.strip() or target or anchor

            if target or anchor:
                fragment = ""
                if anchor:
                    fragment = "#%s" % anchor
                text = conversion % (prefix, target, fragment, label)

            # Without any destination, only the label can be shown.

            else:
                text = label

            self.target = self.target_type = self.label = None

        elif name == "a":

            # Elements without an href attribute are not links and are shown
            # as plain text.

            target = self.target or ""
            label = self.label or target

            if target:
                text = conversion % (target, label)
            else:
                text = label

            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macro)
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters)
                argnames = macroargs.get(self.macro)
                if argnames:
                    confargname, moinargname = argnames
                    if confargname in self.macro_parameters:
                        parameters["args"] = quote_macro_argument(
                            "%s=%s" % (moinargname, self.macro_parameters[confargname]))

                try:
                    converted = conversion % parameters

                # Where a parameter needed by the MoinMoin macro was not
                # supplied, the macro cannot be produced and the content is
                # retained as it is.

                except KeyError:
                    pass

                else:
                    text = converted

                    # Anchors cannot appear inside headings or links and are
                    # held until the enclosing heading has been completed.

                    if self.macro == "anchor" and self.forbids_macros():
                        self.held_anchors.append(text)
                        text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]



        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)



        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or self.have_block and name not in span_override_tags:
                    nodes.append("\n")

                    self.have_block = False

                # Lists inside lists require separation.

                elif name in list_tags and parent == "li":
                    nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether the current position is inside a preformatted element."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any currently open element is a heading or a link, these
        being unable to contain macros.
        """

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing whitespace runs found directly inside the
        element with the given 'name': nothing for purely structural elements,
        a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return the given 'text' with whitespace runs replaced as appropriate
        for the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # The page content is a fragment and is wrapped in a complete XHTML
    # document before being parsed.

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4