#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013, 2015 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then split into regions.
"""

from common import *
import re
import sys
import codecs
import operator

# Section extraction.

# The alternatives match, in order:
# - a section or macro marker such as {code} or {code:language=python}, but
#   not a {{monospace}} opening (hence the (?<!{) look-behind assertion);
# - the start of a table row at the beginning of a line;
# - the end of a table row at the end of a line;
# - a complete list item line.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}" \
                      r"|" \
                      r"^(?P<rowstart>[|]{1,2})" \
                      r"|" \
                      r"(?P<rowend>[|]{1,2}(\n|$))" \
                      r"|" \
                      r"^(?P<listitem>\s*[*#-]+\s+.*?([^|](\n|$)|(?=[|](\n|$))))"

sections_regexp = re.compile(sections_regexp_str, re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple
    of the form (type, text), as produced by get_section_details: type is
    either None (plain text) or a (sectiontype, options) tuple.
    """

    last = 0            # end of the previous match in 's'
    regions = [""]      # each region is accumulated as a string
    depth = 0           # nesting level of active section/table regions
    had_row = False     # whether the previous match was a table row
    had_item = False    # whether the previous match was a list item

    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Classify the match. Only markers carrying options, or row starts,
        # can open a region while another region is already active.

        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")
        is_item = match.group("listitem")

        # The start of a region is either indicated by a marker with options or
        # by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as a
            # "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # A new row may either continue a table region or start a new
                # table region.

                elif is_row:

                    # Adjacent rows (no intervening text) are merged into the
                    # preceding table region.

                    if had_row and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()
                    else:
                        regions.append(s[start:end])

                # A list item may either continue a list region or start a new
                # list region.

                elif is_item:

                    # If continuing a list, merge the list regions and start a
                    # new potentially separate region.

                    if had_item and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions[-1] = ""

                    # If not continuing a list, make a region for a new list and
                    # start a new potentially separate region.

                    else:
                        regions.append(s[start:end])
                        regions.append("")

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section or is_row:
                depth += 1

        # The end of a region is indicated by a marker with no options or the
        # end of a row.

        else:
            # Where no region is active, the text since the last match plus the
            # marker are added to the current "null" region.
            # NOTE(review): depth is always non-zero when this branch is
            # reached (see the condition above), so this case appears
            # unreachable — confirm before relying on it.

            if not depth:

                # Add to the string portion of the "null" region.

                regions[-1] += s[last:end]

            # Where a region is active, the end marker and preceding text is
            # either incorporated into the current region if more than one
            # region is active, or the preceding text is incorporated into the
            # current region and the details of the region are then obtained.

            else:
                if depth > 1 or (not is_section and not is_row):
                    regions[-1] += s[last:end]

                # Terminate the active region, interpreting its contents.

                else:
                    regions[-1] += s[last:end]
                    regions.append("")

            if is_section or is_row:
                depth -= 1

        had_row = is_row
        had_item = is_item
        last = end

    # Add any remaining text; where a region is still active, this
    # terminates it.

    regions[-1] += s[last:]

    return [get_section_details(s) for s in regions if s]

def is_section_marker(sectiontype):

    # Note: sectiontypes is defined later in this module; the lookup works
    # because it only occurs at call time. (Python 2 dict.has_key is used
    # throughout this module.)

    return sectiontypes.has_key(sectiontype) or sectiontype == "color"

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text): for text
    wrapped in matching section markers, type is a (sectiontype, options)
    tuple and text is the enclosed content; otherwise type is None and 's' is
    returned unchanged as text.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.
# Regular expressions matching whole lists, whole tables, and single-line
# heading/blockquote elements.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "|".join("(%s)" % pattern for pattern in (
        list_regexp_str,
        table_regexp_str,
        blocktext_regexp_str,
        )),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples. The type is "list", "table", a heading or
    blockquote identifier such as "h1" or "bq", or None for the plain text
    found between the recognised elements.
    """

    elements = []
    pos = 0

    for match in blockelement_regexp.finditer(s):
        begin, finish = match.span()

        # Classify the match according to which alternative participated.

        if match.group("listtype"):
            elementtype = "list"
        elif match.group("celltype"):
            elementtype = "table"
        else:
            elementtype = match.group("type")

        # Record the preceding plain text, then the element itself. Headings
        # and blockquotes supply their text via the "text" group; lists and
        # tables supply the whole match.

        elements.append((None, s[pos:begin]))
        elements.append((elementtype, match.group("text") or s[begin:finish]))
        pos = finish

    elements.append((None, s[pos:]))
    return elements

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split the given string 's' on runs of blank lines, returning the list of
    non-blank blocks.
    """

    return [block for block in block_regexp.split(s) if block.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return (type, text) blocks from the given string 's', splitting the plain
    text found between headings, lists and tables into basic blocks.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Plain text regions are split further into basic blocks.

        if blocktype is None:
            for basic in get_basic_blocks(blocktext):
                blocks.append((None, basic))

        # Heading, list and table blocks are kept as they are.

        else:
            blocks.append((blocktype, blocktext))

    return blocks

# List item inspection.
listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples, one for each list item line in
    the given list 'text'.
    """

    return [(item.group("marker"), item.group("text"))
            for item in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"(?<!\\)[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?)(?::(?P<options>.*?))?}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# All inline content patterns combined into a single alternation.

content_regexp_str = "|".join("(%s)" % pattern for pattern in (
    monospace_regexp_str,
    link_regexp_str,
    image_regexp_str,
    macro_regexp_str,
    italic_regexp_str,
    bold_regexp_str,
    del_regexp_str,
    underline_regexp_str,
    sub_regexp_str,
    ))

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Table cells may contain any of the inline content plus cell separators.

table_content_regexp_str = "%s|(%s)" % (content_regexp_str, cellsep_regexp_str)

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for line in text.split("|\n"):

        # An empty line indicates the end of the table text.

        if not line:
            break

        # Restore the trailing separator removed by the split.

        line += "|"

        sep = None
        cells = [""]
        pos = 0

        for match in table_content_regexp.finditer(line):
            begin, finish = match.span()
            cells[-1] += line[pos:begin]

            # A separator finishes the current cell; the first separator seen
            # determines the row type (plain or header).

            marker = match.group("celltype")
            if marker:
                if sep is None:
                    sep = marker
                cells.append("")

            # Other inline content is kept within the current cell.

            else:
                cells[-1] += match.group()

            pos = finish

        cells[-1] += line[pos:]

        # Discard the (empty) text before the first and after the last
        # separator.

        if sep:
            rows.append((sep, cells[1:-1]))

    return rows

# Notation conversion.

# Confluence escape/notation -> Moin equivalent in normal text.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    (r"\[", "<<Verbatim([)>>"),
    (r"\]", "<<Verbatim(])>>"),
    (r"\*", "*"),
    ]

# Confluence escape/notation -> Moin equivalent in preformatted text.

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

# Confluence list marker character -> Moin list marker.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Confluence cell separator -> Moin cell separator.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Extra decoration applied to cell content: header cells ("||") are made bold.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Confluence section type -> Moin parser/format declaration.

sectiontypes = {
    "code" : "",
    "excerpt" : "#!wiki",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

# Section types whose content is not further formatted. None covers anonymous
# sections (see enter_section's default argument).

preformatted_sectiontypes = (None, "noformat")

# Macro name -> name of the leading argument in the corresponding Moin macro.

macroargs = {
    "color" : "col",
    }

# Macro name -> Moin macro template.

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    "toc" : "<<TableOfContents>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self, is_comment_page=False):

        """
        Initialise the parser. If 'is_comment_page' is given as a true value,
        page links are prefixed with an extra "../" to compensate for the
        location of comment pages in the page hierarchy.
        """

        self.is_comment_page = is_comment_page

        # Section nesting state: the current level and the deepest level seen
        # so far, used by nest_section to size Moin section brackets.

        self.max_level = self.level = 0

        # Macro suppression state: macros are not legal Moin syntax inside
        # headings or inside other macros. Anchors encountered in headings are
        # held and emitted before the heading instead.

        self.in_heading = False
        self.held_anchors = []
        self.macro = None

        # Stack of active section types (None for anonymous sections).

        self.sections = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # Indent by the marker length, translating only the final character.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        # Header cells gain bold decoration via cellextra.

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        # Monospace text.

        if match.group("monotext"):
            # NOTE(review): the enter/leave pair updates max_level (and resets
            # it at the top level) without leaving an active section — confirm
            # this side-effect on section bracket sizing is intended.
            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        # Links of the form [label|target|title] with optional parts.

        elif match.group("linktext"):
            parts = match.group("linktext").split("|")

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them.

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    rest = get_page_title(rest)
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"
                if self.is_comment_page:
                    prefix += "../"

            # Make the link tidier by providing a label if none was given.

            if not label:
                label = target

            target = get_page_title(target)

            # NOTE(review): label is defaulted to target above, so the first
            # branch here only fires for empty labels — confirm that this is
            # the intended behaviour.

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        # Images of the form !target|options!.

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        # Inline macros of the form {name} or {name:options}.

        elif match.group("macro"):
            macro_name = match.group("macro")
            if macrotypes.has_key(macro_name):
                argname = macroargs.get(macro_name)
                # NOTE(review): macrotypes["color"] also references a
                # "content" key, which would raise KeyError here — confirm
                # that {color} is always captured as a section instead.
                result = macrotypes[macro_name] % {
                    "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + (match.group("options") or ""))
                    }
                if not self.forbids_macros():
                    return result

                # In a heading, hold anchors for emission before the heading.

                if macro_name == "anchor":
                    self.held_anchors.append(result)
                return ""
            # NOTE(review): a macro name absent from macrotypes falls through
            # and returns None implicitly, which would break the join in
            # translate_content — confirm such input cannot occur here.

        # Simple formatting, translated recursively.

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        # Anything else is passed through as plain text.

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. Where
        'preformatted' is set, the reduced preformatted mapping is used.
        """

        for before, after in preformatted and preformatted_notation_mapping or notation_mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text', translating inline content
        matches and converting the plain text notation around them.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections: matches are passed through intact.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        # True if any active section is a preformatted one. (reduce is a
        # builtin in Python 2; this is equivalent to an any(...) test.)

        return reduce(operator.or_, [x in preformatted_sectiontypes for x in self.sections], False)

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        # Entering a heading: suppress macros, collect anchors for emission
        # before the heading. ('headings' and 'blocktypes' come from common.)

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktypes.has_key(blocktype):
            text = self.parse_text(blocktext)
            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section("table")

            table_parts = []
            first = True

            # Rows after the first are separated by a Moin row separator.

            for cellsep, columns in get_table_rows(blocktext):
                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of output parts.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections; noformat content is only
        # subject to inline content translation.

        if sectiontype == "noformat":
            section_content = self.translate_content(text.strip("\n"))
        else:
            section_content = self.parse_text(text.strip())

        # Nest the section appropriately.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        # Record the section type and track the deepest nesting level.

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        # Reset the depth record upon leaving the outermost section.

        self.level -= 1
        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        # Produce Moin section brackets, with the outermost section getting
        # the most braces so that nested sections stay distinguishable.

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        """
        Parse the content in the string 's', returning the translation. The
        'top' flag indicates the top-level invocation, where the early-return
        base case must not apply.
        """

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for type, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if type is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = type

                # Direct translations of sections.

                if sectiontypes.has_key(sectiontype):
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif macrotypes.has_key(sectiontype):

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype
                        argname = macroargs.get(sectiontype)
                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        # Macros are forbidden inside headings and inside other macros.

        return self.in_heading or self.macro

def parse(s, out, is_comment_page=False):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser(is_comment_page)
    out.write(parser.parse_text(s, top=True))

if __name__ == "__main__":
    # Read UTF-8 from standard input and write UTF-8 to standard output
    # (Python 2 idiom).
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4