#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then parsed.
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from xmlread import Parser
import re
import sys
import operator

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).
    """

    regions = []
    pos = 0

    for m in sections_regexp.finditer(s):
        begin, finish = m.span()

        # Collect the plain text preceding the section, then the section
        # itself with its type and options determined.

        regions.append((None, s[pos:begin]))
        regions.append(get_section_details(s[begin:finish]))
        pos = finish

    # Collect any plain text following the final section.

    regions.append((None, s[pos:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    m = section_regexp.match(s)

    # Text not recognised as a section is passed through untyped.

    if m is None:
        return None, s

    header = (m.group("sectiontype"), m.group("options"))
    return header, m.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.
    """

    blocks = []
    pos = 0

    for m in blockelement_regexp.finditer(s):
        begin, finish = m.span()

        # Classify each match according to the group that participated.

        if m.group("listtype"):
            matchtype = "list"
        elif m.group("celltype"):
            matchtype = "table"
        else:
            matchtype = m.group("type")

        # Collect the text preceding the element, then the element itself.
        # Headings and blockquotes provide their own text; lists and tables
        # contribute the whole matched region.

        blocks.append((None, s[pos:begin]))
        blocks.append((matchtype, m.group("text") or s[begin:finish]))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    pieces = block_regexp.split(s)
    return [piece for piece in pieces if piece.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Anonymous regions are split further on blank lines.

        if blocktype is None:
            blocks += [(None, subblock) for subblock in get_basic_blocks(blocktext)]

        # Heading, list and table blocks are kept as they are.

        else:
            blocks.append((blocktype, blocktext))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(m.group("marker"), m.group("text"))
            for m in listitem_regexp.finditer(text)]

# Table row inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content: monospace spans, links and images.

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

# Table content additionally recognises cell separators.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def translate_content_match(match):

    "Translate the content described by the given 'match', returning a string."

    # Monospace spans: {{text}} becomes {{{text}}}.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links: [label|target|title] with label and title being optional.

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            # Tolerate surplus fields beyond the title instead of raising
            # ValueError on links containing more than two "|" characters.
            label, target, title = parts[0], parts[1], parts[2]

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Emit the tidy single-target form when neither label nor title was
        # given. (Previously the label was defaulted to the target before
        # this test, making the branch unreachable and producing redundant
        # [[target|target]] links.)

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Otherwise make sure a label is present, defaulting to the target.

        if not label:
            label = target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images: !target! or !target|options!.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Anything else is passed through unchanged.

    else:
        return match.group()

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        # Split each line on cell separators, leaving other inline content
        # (monospace, links, images) intact within the cells.

        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            if match.group("celltype"):

                # The first separator seen determines the row type.

                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Only lines containing separators yield rows; the regions before the
        # first separator and after the last one are discarded.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        # Handle unformatted sections.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.
# Templates for Confluence heading and blockquote block types in MoinMoin
# markup.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# List item markers and their MoinMoin equivalents.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # The marker length supplies the Moin indentation; the final marker
    # character selects the Moin item marker.

    return " " * len(marker) + markers[marker[-1]]

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Extra decoration applied around heading cell text ("||" rows).

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]

# Section types mapped to MoinMoin region format specifications. Empty
# values denote plain {{{...}}} regions.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# XML dialect syntax parsing.

tags = {
    # XHTML tag            MoinMoin syntax
    "strong"             : "'''%s'''",
    "em"                 : "''%s''",
    "u"                  : "__%s__",
    "del"                : "--(%s)--",
    "sup"                : "^%s^",
    "sub"                : ",,%s,,",
    "code"               : "`%s`",
    "pre"                : "{{{%s}}}",
    "blockquote"         : " %s",
    "small"              : "~-%s-~",
    "big"                : "~+%s+~",
    "p"                  : "\n%s\n",
    "ol"                 : "\n%s",
    "ul"                 : "\n%s",
    "ac:plain-text-body" : "{{{%s}}}",
    "ac:link"            : "[[%s%s|%s]]",
    }

# Reuse the wiki-syntax heading/blockquote templates for the equivalent
# XHTML tags, surrounded by blank lines.

for tag, translation in blocktypes.items():
    tags[tag] = "\n%s\n" % translation

simple_tags = {
    # XHTML tag  MoinMoin syntax
    "br"       : "<<BR>>",
    }

list_tags = {
    # XHTML list tag  MoinMoin list item syntax
    "ol"            : "1. %s\n",
    "ul"            : "* %s\n",
    }

# Elements whose text is indented according to the current list depth.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element  Attribute providing the target
    "ri:page"          : "ri:content-title",
    "ri:attachment"    : "ri:filename",
    }

macro_rich_text_styles = {
    # Confluence style  MoinMoin admonition style
    "note"           : "caution",
    "warning"        : "warning",
    "info"           : "important",
    "tip"            : "tip",
    }

# Whitespace normalisation within text nodes and at their ends.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

normalise_end_regexp_str = r"\s\s+$"
normalise_end_regexp = re.compile(normalise_end_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        # 'out' is the stream to which translated text is written.

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and preformatted states.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        # Track list nesting depth and entry into preformatted regions
        # before delegating to the base class.

        if list_tags.has_key(name):
            self.indent += 1
        elif self.states.has_key(name):
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        # Delegate first so the element is converted before the depth and
        # preformatted counters are unwound.

        Parser.endElement(self, name)
        if list_tags.has_key(name):
            self.indent -= 1
        elif self.states.has_key(name):
            self.states[name] -= 1

    def characters(self, content):

        # Collapse whitespace runs except inside preformatted regions.

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        # Substitute dash entities; other skipped entities are ignored here.

        if name == "mdash":
            self.text[-1].append(u"\u2014")
        elif name == "ndash":
            self.text[-1].append(u"\u2013")

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the completed element 'name', translating its collected text
        and appending the result to the parent element's text nodes (or
        writing it to the output stream at the top level).
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:

            # The enclosing list tag selects the item template.

            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            else:
                prefix = "../"

            # Fall back to the target itself when no link text was given.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            preceding = "".join(self.text[-2])

            # Trim trailing whitespace runs from the preceding text outside
            # preformatted regions before appending the new text.

            if not self.is_preformatted():
                preceding = self.normalise_end(preceding, self.elements[-2])

            self.text[-2] = [preceding]
            self.text[-2].append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        # True when any preformatted element is currently open.

        return reduce(operator.or_, self.states.values(), False)

    def get_replacement(self, name, end=False):

        # Return the whitespace replacement used when normalising text
        # within the element 'name'; 'end' selects the text-end behaviour.

        if list_tags.has_key(name):
            if end:
                return "\n"
            else:
                return ""
        elif name == "body":
            return "\n\n"
        else:
            return " "

    def normalise(self, text, name):

        # Collapse whitespace runs throughout 'text'.

        return normalise_regexp.sub(self.get_replacement(name), text)

    def normalise_end(self, text, name):

        # Collapse a whitespace run at the end of 'text' only.

        return normalise_end_regexp.sub(self.get_replacement(name, True), text)

def xmlparse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    # Wrap the fragment in an XHTML document so that it parses as a single
    # well-formed tree.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
    PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

# General parsing.

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

            # Separate consecutive blocks with a blank line.

            print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                print >>out, "{{{",
            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out

if __name__ == "__main__":
    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4