#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple
    of the form (type, text), where a type of None indicates plain wiki text
    outside any {section}...{section} construct.
    """

    regions = []
    pos = 0

    # Alternate between plain text preceding each section and the section
    # itself, always keeping the two kinds of region in document order.

    for match in sections_regexp.finditer(s):
        begin, finish = match.span()
        regions.append((None, s[pos:begin]))
        regions.append(get_section_details(s[begin:finish]))
        pos = finish

    # Any trailing text after the final section is also a region.

    regions.append((None, s[pos:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    match = section_regexp.match(s)

    # Sections which cannot be parsed are returned as untyped text.

    if not match:
        return None, s

    return (match.group("sectiontype"), match.group("options")), match.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where the type is "list", "table", a heading or
    blockquote type ("h1".."h6", "bq"), or None for unclassified text.
    """

    blocks = []
    pos = 0

    for match in blockelement_regexp.finditer(s):
        begin, finish = match.span()

        # Classify the match by whichever named group participated.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        # Keep the unclassified text preceding the match, then the match
        # itself (headings/blockquotes supply their own "text" group).

        blocks.append((None, s[pos:begin]))
        blocks.append((matchtype, match.group("text") or s[begin:finish]))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank
    lines and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            for block in get_basic_blocks(blocktext):
                blocks.append((None, block))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    items = []

    for match in listitem_regexp.finditer(text):
        items.append((match.group("marker"), match.group("text")))

    return items

# Table row inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def _translate_link(linktext):

    """
    Translate Confluence link text of the form "[label|]target[|title]",
    returning a MoinMoin link.
    """

    parts = linktext.split("|")

    # NOTE: Proper detection of external links required.
    # NOTE: More than three |-separated parts will raise ValueError here,
    # NOTE: as in the original implementation.

    if len(parts) == 1:
        label, target, title = None, parts[0], None
    elif len(parts) == 2:
        (label, target), title = parts, None
    else:
        label, target, title = parts

    target = target.strip()

    # Look for namespace links and rewrite them.

    if target.find(":") != -1:
        prefix = ""
        space, rest = target.split(":", 1)
        if space not in URL_SCHEMES:
            target = "%s/%s" % (space, rest)

    # Detect anchors.

    elif target.startswith("#"):
        prefix = ""

    # Detect attachments.

    elif target.startswith("^"):
        prefix = "attachment:"

    # Link to other pages within a space.

    else:
        prefix = "../"

    # Emit the plain form before defaulting the label: previously the label
    # was defaulted to the target first, which made this branch unreachable.

    if not label and not title:
        return "[[%s%s]]" % (prefix, target)

    # Make the link tidier by using the target if no label was given.

    if not label:
        label = target

    if not title:
        return "[[%s%s|%s]]" % (prefix, target, label)
    else:
        return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

def _translate_image(imagetext):

    """
    Translate Confluence image text of the form "filename[|options]",
    returning a MoinMoin transclusion.
    """

    parts = imagetext.split("|")

    # NOTE: Proper detection of external links required.

    if parts[0].startswith("http"):
        prefix = ""
    else:
        prefix = "attachment:"

    # NOTE: Proper options conversion required.

    if len(parts) == 1:
        return "{{%s%s}}" % (prefix, parts[0])
    else:
        return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

def translate_content_match(match):

    "Translate the content described by the given 'match', returning a string."

    # Monospaced text: {{text}} becomes {{{text}}}.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links: [label|target|title] in its various forms.

    elif match.group("linktext"):
        return _translate_link(match.group("linktext"))

    # Images and attachments: !imagetext!.

    elif match.group("imagetext"):
        return _translate_image(match.group("imagetext"))

    # Anything else (an empty group, for example) passes through unchanged.

    else:
        return match.group()

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        # Find cell separators, but treat monospace/link/image content as
        # opaque so that "|" characters inside it do not split cells.

        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            if match.group("celltype"):

                # The first separator on a row decides the row type.

                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Only lines containing separators are table rows; the sentinel
        # columns before the first and after the last separator are dropped.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        # Handle unformatted sections: content is passed through verbatim.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.
# Translation helpers.

# Confluence heading/blockquote type -> MoinMoin template.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# Confluence list marker character -> MoinMoin list marker.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # The nesting depth is given by the marker length; the final character
    # selects the Moin marker itself.

    return " " * len(marker) + markers[marker[-1]]

# Confluence cell separator -> MoinMoin cell separator.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Extra emphasis applied to cell contents ("||" introduces header cells).

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    extra = cellextra[cellsep]
    return extra + translate_content(text) + extra

# Confluence section type -> MoinMoin processing instruction suffix.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# XML dialect syntax parsing.

tags = {
    # XHTML tag              MoinMoin syntax
    "strong" : "'''%s'''",
    "em" : "''%s''",
    "u" : "__%s__",
    "del" : "--(%s)--",
    "sup" : "^%s^",
    "sub" : ",,%s,,",
    "code" : "`%s`",
    "pre" : "{{{%s}}}",
    "blockquote" : " %s",
    "small" : "~-%s-~",
    "big" : "~+%s+~",
    "p" : "\n%s\n",
    "ol" : "\n%s",
    "ul" : "\n%s",
    "ac:plain-text-body" : "{{{%s}}}",
    "ac:link" : "[[%s%s|%s]]",
    }

# Headings and blockquotes used as XHTML tags become their own paragraphs.

tags.update([(tag, "\n%s\n" % translation) for tag, translation in blocktypes.items()])

simple_tags = {
    # XHTML tag              MoinMoin syntax
    "br" : "<<BR>>",
    }

list_tags = {
    # XHTML list tag         MoinMoin list item syntax
    "ol" : "1. %s\n",
    "ul" : "* %s\n",
    }

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element     Attribute providing the target
    "ri:page" : "ri:content-title",
    "ri:attachment" : "ri:filename",
    "ri:user" : "ri:username",
    }

macro_rich_text_styles = {
    # Confluence style       MoinMoin admonition style
    "note" : "caution",
    "warning" : "warning",
    "info" : "important",
    "tip" : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

normalise_end_regexp_str = r"\s\s+$"
normalise_end_regexp = re.compile(normalise_end_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):
        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation depth and preformatted-region nesting counters.

        self.indent = 0
        self.states = {"pre" : 0, "ac:plain-text-body" : 0}

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        # Track list nesting and entry into preformatted regions before
        # delegating to the underlying parser.

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        # Delegate first, then unwind the nesting counters symmetrically.

        Parser.endElement(self, name)
        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

    def characters(self, content):

        # Whitespace is normalised except inside preformatted regions.

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        # Convert named XHTML entities to their character equivalents.

        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint:
            self.text[-1].append(unichr(codepoint))

    # Parser-related methods.
    def handleElement(self, name):

        """
        Convert the element 'name', whose collected child text sits on top of
        the text stack, into MoinMoin markup, appending the result to the
        parent element's text nodes (or writing it out at the top level).
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            elif self.target_type == "ri:user":
                prefix = ""
            else:
                prefix = "../"

            # The "ac:link" template takes prefix, target and label; the
            # target doubles as the label when no link text was collected.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            preceding = "".join(self.text[-2])

            # Collapse trailing whitespace on the parent's accumulated text
            # before appending, unless inside a preformatted region.

            if not self.is_preformatted():
                preceding = self.normalise_end(preceding, self.elements[-2])

            self.text[-2] = [preceding]
            self.text[-2].append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        # True if any preformatted-region counter is non-zero.
        # NOTE: reduce is a builtin in Python 2; operator is imported at the
        # NOTE: top of the module.

        return reduce(operator.or_, self.states.values(), False)

    def get_replacement(self, name, end=False):

        """
        Return the whitespace replacement appropriate for element 'name':
        nothing inside lists (a trailing newline with 'end' set), a paragraph
        break directly under the body, and a single space otherwise.
        """

        if list_tags.has_key(name):
            if end:
                return "\n"
            else:
                return ""
        elif name == "body":
            return "\n\n"
        else:
            return " "

    def normalise(self, text, name):

        "Collapse whitespace runs in 'text' according to element 'name'."

        return normalise_regexp.sub(self.get_replacement(name), text)

    def normalise_end(self, text, name):

        "Collapse trailing whitespace in 'text' according to element 'name'."

        return normalise_end_regexp.sub(self.get_replacement(name, True), text)

def xmlparse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    # Wrap the fragment in an XHTML document so that a standard XML parser
    # can process it; the "]] >" repair addresses the broken CDATA endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

# General parsing.

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

            # Blank line between regions.

            print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                print >>out, "{{{",

            # NOTE: The trailing commas rely on Python 2 print "softspace"
            # NOTE: behaviour: no newline is emitted, and a separating space
            # NOTE: may be written by the following print statement.

            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out

if __name__ == "__main__":
    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4