#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from MoinMoin import wikiutil
from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

tags = {
    # XHTML tag                 MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s|%s]]",
    "ac:image"              : "{{%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Block-level translations (provided by the star import from 'common') are
# merged into the general tag table, overriding any same-named entries above.

for tag, translation in blocktypes.items():
    tags[tag] = translation

simple_tags = {
    # XHTML tag                 MoinMoin syntax
    "br"                    : "<<BR>>",
    }

simple_preformatted_tags = {
    # XHTML tag                 MoinMoin syntax
    "br"                    : "\n",
    }

list_tags = {
    # XHTML list tag            MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags

# NOTE: The key collections are converted to lists explicitly so that the
# NOTE: concatenation does not rely on keys() returning a list.

block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element        Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element        MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style          MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

macroargs = {
    # Confluence macro          Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro          MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        """
        Initialise the parser, arranging for translated output to be written to
        'out' (any object providing a 'write' method).
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Record the start of the element with the given 'name' and 'attrs',
        updating the indentation, nesting and macro state before and after
        delegating to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.
        # NOTE: self.attributes is presumably the attribute stack maintained by
        # NOTE: the Parser base class, the last entry describing this element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Record the end of the element with the given 'name', undoing the state
        changes made by 'startElement' around the base class processing.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        # Leaving the outermost region resets the recorded maximum depth.

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

    def characters(self, content):

        """
        Record the character data in 'content', normalising whitespace unless
        the data appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Unrecognised entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.

        Links and images without any usable target are emitted as their label
        text alone, and simple macros lacking a required parameter have their
        content passed through without conversion.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)

                    # Each detail may require a leading component to make the
                    # resulting link correct relative to the current page.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name found in the attributes
        # of the enclosing element (the second-to-last attribute stack entry).

        elif name == "ac:default-parameter":
            self.macro_parameters[self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences of the same tag are not
        # marked up again.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)



        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")
            destination = anchor or self.target
            label = self.label or text.strip()

            # Without any destination, a link cannot be made: emit the label
            # alone instead of formatting a missing value as "None".

            if destination:
                text = conversion % (prefix, destination, label or self.target or destination)
            else:
                text = label

            self.target = self.target_type = self.label = None

        elif name == "a":

            # Elements without a href attribute (named anchors, for example)
            # are emitted as their plain label text.

            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or ""

            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro" and not self.forbids_macros():
            conversion = macrotypes.get(self.macro)
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters)
                argnames = macroargs.get(self.macro)
                if argnames:
                    confargname, moinargname = argnames
                    argvalue = self.macro_parameters.get(confargname)
                    if argvalue is not None:
                        parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, argvalue))

                # A macro lacking a parameter needed by its translation is left
                # unconverted, retaining its content, instead of aborting the
                # conversion of the whole page.

                try:
                    text = conversion % parameters
                except KeyError:
                    pass

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]



        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1



        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                    self.have_block = False

                # Lists inside lists require separation.

                elif name in list_tags and parent == "li":
                    nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any currently open element is a heading or a
        conventional link, within which macros are not converted.
        """

        return any((tag in headings or tag == "a") for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing each run of whitespace found directly
        within the element with the given 'name': nothing for purely structural
        elements and lists, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    # The content is wrapped in a complete XHTML document before parsing.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4