#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then parsed.
"""

# Prefer the faster C implementation of StringIO where available
# (Python 2 only; both fall back gracefully).

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs

# Schemes treated as external URLs; anything else before a colon in a link
# target is interpreted as a wiki namespace prefix and rewritten.

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

# Matches a Confluence section such as {code:language=java}...{code}:
# an opening {type} or {type:options} tag (not preceded by another brace),
# lazily followed by the section body, then the matching closing {type} tag.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text). Plain (non-section) text is returned with a type of
    None; section text is returned with the (sectiontype, options) details
    produced by get_section_details.
    """

    last = 0
    regions = []
    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Keep the plain text preceding the section, then the section itself.

        regions.append((None, s[last:start]))
        regions.append(get_section_details(s[start:end]))
        last = end

    # Plain text after the final section (or the whole of 's' if no section
    # was found).

    regions.append((None, s[last:]))
    return regions

# Section inspection.

# Re-parses a single extracted section, separating the type, the optional
# colon-delimited options and the enclosed body text.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text), where type is
    itself a (sectiontype, options) tuple. Where 's' does not look like a
    section after all, (None, s) is returned.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

# A list: consecutive lines introduced by the same kind of marker (*, # or -).

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"

# A table: consecutive lines of |cell| or ||heading|| rows.

table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"

# A heading (h1. to h6.) or blockquote (bq.) line.

blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where type is "list", "table", a heading or
    blockquote type (such as "h1" or "bq"), or None for the plain text between
    such elements.
    """

    last = 0
    blocks = []
    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one of the named groups can have matched; inspect them in turn
        # to classify the element.

        matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")
        blocks.append((None, s[last:start]))

        # Headings and blockquotes provide their text in the "text" group;
        # lists and tables use the whole matched region.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end
    blocks.append((None, s[last:]))
    return blocks

# Block extraction.
# One or more blank (or whitespace-only) lines acting as a block separator.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split 's' on runs of blank lines and return the resulting blocks,
    discarding any pieces that contain only whitespace.
    """

    pieces = block_regexp.split(s)
    return [piece for piece in pieces if piece.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return (type, text) blocks from the given string 's': recognised block
    elements (headings, lists, tables) are kept intact, while the plain
    regions between them are further divided into basic blocks.
    """

    result = []

    for kind, chunk in get_block_elements(s):

        # Plain regions may contain several paragraphs: split them up.

        if kind is None:
            for basic in get_basic_blocks(chunk):
                result.append((None, basic))

        # Headings, lists and tables pass through unchanged.

        else:
            result.append((kind, chunk))

    return result

# List item inspection.

# A single list item: its marker (possibly nested, e.g. "**") and its text.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(m.group("marker"), m.group("text"))
            for m in listitem_regexp.finditer(text)]

# Table row inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
# A table cell separator: | for data cells, || for heading cells.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content of interest: monospace spans, links and images.

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

# Inside tables, cell separators must also be recognised so that inline
# markup containing "|" does not prematurely end a cell.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string
    employing MoinMoin syntax. Monospace, link and image constructs are
    handled; any other match is returned unchanged.
    """

    # Monospace: {{text}} becomes {{{text}}}.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links: [target], [label|target] or [label|target|title].

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.
        # NOTE(review): the leading "^" is kept in the target - confirm
        # whether Moin expects it to be stripped.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Emit the tidiest link first: with neither label nor title, a plain
        # target suffices. (Previously the label was defaulted to the target
        # before this test, making this branch unreachable and producing the
        # redundant [[target|target]] form.)

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Make the link tidier by making a label if none was given.

        if not label:
            label = target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images: !target! or !target|options!.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Anything else passes through untranslated.

    else:
        return match.group()

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.
    Each line is scanned for cell separators and inline markup; markup
    containing "|" (such as links) is kept within a single cell. Lines
    without any separator are ignored.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0
        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            # A separator starts a new column; the first one seen determines
            # the row's cell type (data or heading).

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Inline markup is appended verbatim to the current column.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Drop the first and last entries: the text outside the leading and
        # trailing separators of a well-formed row.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated: inside "code" and "noformat" sections, inline
    markup is left untranslated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        # Handle unformatted sections.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.
# Confluence heading/blockquote types mapped to Moin markup templates.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# Confluence list item markers mapped to Moin markers: "#" produces an
# ordered list, "*" and "-" both produce bullet points.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation. The
    nesting depth is conveyed by indentation equal to the marker's length,
    and the final marker character determines the Moin list style.
    """

    return " " * len(marker) + markers[marker[-1]]

# Cell separators: Moin uses "||" for all cells, so heading cells ("||" in
# Confluence) are distinguished by bold cell content instead (see cellextra).

cellseps = {
    "|" : "||",
    "||" : "||",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text', wrapping heading cells in
    bold markup.
    """

    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]

# Confluence section types mapped to Moin processing instruction names; an
# empty string means a plain {{{...}}} block with no "#!" line.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# XML dialect syntax parsing.

tags = {
    # XHTML tag           MoinMoin syntax
    "strong"              : "'''%s'''",
    "em"                  : "''%s''",
    "u"                   : "__%s__",
    "del"                 : "--(%s)--",
    "sup"                 : "^%s^",
    "sub"                 : ",,%s,,",
    "code"                : "`%s`",
    "pre"                 : "{{{%s}}}",
    "blockquote"          : " %s",
    "small"               : "~-%s-~",
    "big"                 : "~+%s+~",
    "p"                   : "%s",
    "ol"                  : "%s",
    "ul"                  : "%s",
    "ac:plain-text-body"  : "{{{%s}}}",
    # Takes (prefix, target, label) - see handleElement.
    "ac:link"             : "[[%s%s|%s]]",
    }

# Heading and blockquote conversions also apply to the XML dialect.

for tag, translation in blocktypes.items():
    tags[tag] = translation

simple_tags = {
    # XHTML tag           MoinMoin syntax
    "br"                  : "<<BR>>",
    }

list_tags = {
    # XHTML list tag      MoinMoin list item syntax
    "ol"                  : "1. %s",
    "ul"                  : "* %s",
    }

# Elements whose text is indented according to the current list depth.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element  Attribute providing the target
    "ri:page"             : "ri:content-title",
    "ri:attachment"       : "ri:filename",
    "ri:user"             : "ri:username",
    }

macro_rich_text_styles = {
    # Confluence style    MoinMoin admonition style
    "note"                : "caution",
    "warning"             : "warning",
    "info"                : "important",
    "tip"                 : "tip",
    }

# Collapses runs of whitespace during text normalisation.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions, translating the XHTML
    dialect to Moin syntax written to the given output stream.
    """

    def __init__(self, out):

        "Initialise the parser to write translated output to 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and preformatted states.

        # indent tracks the current list nesting depth; states counts open
        # preformatted elements (non-zero means normalisation is suppressed).

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        "Track list depth and preformatted state on entering 'name'."

        if list_tags.has_key(name):
            self.indent += 1
        elif self.states.has_key(name):
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        "Track list depth and preformatted state on leaving 'name'."

        Parser.endElement(self, name)
        if list_tags.has_key(name):
            self.indent -= 1
        elif self.states.has_key(name):
            self.states[name] -= 1

    def characters(self, content):

        "Record 'content', normalising whitespace outside preformatted text."

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        "Replace a skipped HTML entity 'name' with its Unicode character."

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the completed element 'name' and its accumulated text to Moin
        syntax, appending the result to the parent element's text nodes or,
        at the top level, writing it to the output stream.
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        # List items are converted using the template of their enclosing
        # list element (ol or ul).

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            elif self.target_type == "ri:user":
                prefix = ""
            else:
                prefix = "../"

            # Fall back to the target itself when no label text was given.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Where the parent already has text, insert a suitable separator
            # first: paragraph breaks in the body, line breaks in lists.

            if "".join(self.text[-2]):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif list_tags.has_key(parent):
                    nodes.append("\n")
                elif list_tags.has_key(name) and parent == "li":
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return reduce(operator.or_, self.states.values(), False)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the whitespace replacement for text within element 'name':
        structural elements discard whitespace entirely, others collapse it
        to a single space.
        """

        if name in ("html", "body") or list_tags.has_key(name):
            return ""
        else:
            return " "

    def normalise(self, text, name):

        "Normalise whitespace in 'text' according to the enclosing 'name'."

        return normalise_regexp.sub(self.get_replacement(name), text)

def xmlparse(s, out):

    """
    Parse the XML dialect content in the string 's', writing a translation to
    'out'. The content is wrapped in an XHTML document so that a single,
    well-formed document is presented to the parser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
    PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

# General parsing.

def parse(s, out):

    "Parse the wiki syntax content in the string 's', writing a translation to 'out'."

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

            print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                print >>out, "{{{",
            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out

if __name__ == "__main__":
    s = sys.stdin.read()
    if "--xml" in sys.argv:
        xmlparse(s, sys.stdout)
    else:
        parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4