1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki XML/XHTML syntax parsing. 5 6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 """ 23 24 try: 25 from cStringIO import StringIO 26 except ImportError: 27 from StringIO import StringIO 28 29 from MoinMoin import wikiutil 30 from common import * 31 from xmlread import Parser 32 import re 33 import sys 34 import operator 35 import htmlentitydefs 36 import codecs 37 38 # XML dialect syntax parsing. 

# Parameterised translations: each template receives the converted content of
# the element (links and images receive prefix, target, anchor and label).

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Block type translations provided elsewhere override any entry of the same
# name defined above.

tags.update(blocktypes)

# Elements without content that are replaced by fixed text outside
# preformatted regions.

simple_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# The same elements as they should appear inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "\n",
    }

# Templates applied to list items according to the enclosing list element.

list_tags = {
    # XHTML list tag        MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Classifications of elements used when tracking nesting and layout.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

# Attributes consulted, in order, when building a link target.

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

# Attributes and elements whose values may serve as a link label.

link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

macroargs = {
    # Confluence macro      Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro      MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    The parser translates elements into MoinMoin markup using the module-level
    translation tables, accumulating the converted text of each element in its
    parent's text nodes and writing the result for the outermost element to
    the output stream.

    NOTE: The element, attribute and text stacks ('elements', 'attributes' and
    NOTE: 'text') used throughout are assumed to be maintained by the Parser
    NOTE: base class, which is not defined in this module.
    """

    def __init__(self, out):

        """
        Initialise the parser with the given 'out' stream, an object providing
        a 'write' method to which the converted text is eventually written.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Record the opening of the element with the given 'name' and 'attrs',
        updating the indentation, nesting and macro states before and after
        delegating to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held back for a previous heading do not apply to a new one.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Record the closing of the element with the given 'name', undoing the
        state changes made when the element was opened.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        # Once the outermost region is closed, the recorded maximum depth is
        # discarded so that later regions start afresh.

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless
        the content appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Entities without a known definition are
        ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)

                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name of the enclosing
        # element (the second-to-last entry in the attribute stack).

        elif name == "ac:default-parameter":
            self.macro_parameters[self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences of the same tag are
        # passed through without repeating the markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string applied to the
                # content below, so literal percent signs (from a title, for
                # example) must be escaped to avoid formatting errors.

                details = details.replace("%", "%%")

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)



        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""

            # Links without a resource element (those referring only to an
            # anchor, for example) have no target: use an empty target rather
            # than formatting None as text.

            target = self.target or ""
            label = self.label or text.strip() or target or anchor
            text = conversion % (prefix, target, anchor and ("#%s" % anchor) or "", label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Anchors without an href attribute cannot be made into links, so
            # only their label text is retained.

            label = self.label or self.target or ""
            if self.target:
                text = conversion % (self.target, label)
            else:
                text = label
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macro)
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters)
                argnames = macroargs.get(self.macro)
                if argnames:
                    confargname, moinargname = argnames
                    if confargname in self.macro_parameters:
                        parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[confargname]))

                # Where a parameter needed by the macro template is missing,
                # retain the unconverted content instead of failing.

                try:
                    converted = conversion % parameters
                except KeyError:
                    converted = None

                if converted is not None:
                    text = converted
                    if self.macro == "anchor" and self.forbids_macros():
                        self.held_anchors.append(text)
                        text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]



        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)



        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or self.have_block and name not in span_override_tags:
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any currently open element is a heading or a
        conventional link, these being contexts where macros are held back.
        """

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace found directly within the
        element with the given 'name': nothing for structural elements, a
        single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return the given 'text' with each run of whitespace replaced according
        to the enclosing element having the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document before parsing, and the text
    is encoded as UTF-8 for the parser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4