#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then split into regions.
"""

from common import *
import re
import sys
import codecs
import operator

# Section extraction.

# Alternatives, in order: a {type} or {type:options} marker, the separator
# opening a table row, the separator closing a table row, and a complete list
# item line.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}" \
    r"|" \
    r"^(?P<rowstart>[|]{1,2})" \
    r"|" \
    r"(?P<rowend>[|]{1,2}(\n|$))" \
    r"|" \
    r"^(?P<listitem>\s*[*#-]+\s+.*?([^|](\n|$)|(?=[|](\n|$))))"

sections_regexp = re.compile(sections_regexp_str, re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text), where the type is None for plain ("null") regions and
    a (sectiontype, options) tuple for regions recognised as sections by
    get_section_details. Empty regions are omitted.
    """

    last = 0            # end of the previous match
    regions = [""]      # the last element is the region being accumulated
    depth = 0           # number of currently open sections/rows
    had_row = False     # whether the previous match was a row marker
    had_item = False    # whether the previous match was a list item

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")
        is_item = match.group("listitem")

        # The start of a region is either indicated by a marker with options or
        # by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as a
            # "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # A new row may either continue a table region or start a new
                # table region.

                elif is_row:
                    if had_row and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()
                    else:
                        regions.append(s[start:end])

                # A list item may either continue a list region or start a new
                # list region.

                elif is_item:

                    # If continuing a list, merge the list regions and start a
                    # new potentially separate region.

                    if had_item and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions[-1] = ""

                    # If not continuing a list, make a region for a new list and
                    # start a new potentially separate region.

                    else:
                        regions.append(s[start:end])
                        regions.append("")

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section or is_row:
                depth += 1

        # The end of a region is indicated by a marker with no options or the
        # end of a row. This branch is only reached while a region is active,
        # since markers found outside any region are handled above.

        else:

            # The end marker and preceding text always belong to the current
            # region.

            regions[-1] += s[last:end]

            if is_section or is_row:

                # Closing the outermost region terminates it: subsequent text
                # is collected in a fresh "null" region.

                if depth == 1:
                    regions.append("")

                depth -= 1

        had_row = is_row
        had_item = is_item
        last = end

    # Where a region is still active, terminate it.

    regions[-1] += s[last:]

    return [get_section_details(region) for region in regions if region]

def is_section_marker(sectiontype):

    """
    Return whether 'sectiontype' names a marker that opens or closes a region:
    any key of 'sectiontypes', or the "color" macro.
    """

    # Membership testing replaces dict.has_key, which Python 3 removed.

    return sectiontype in sectiontypes or sectiontype == "color"

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text). For text
    starting with an opening marker and containing a matching closing marker,
    the type is a (sectiontype, options) tuple (options being None when absent)
    and the text is the enclosed content. Otherwise, the type is None and the
    text is 's' unchanged.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

# Each alternative is wrapped in its own group: lists, then tables, then
# headings/blockquotes.

blockelement_regexp = re.compile(
    "|".join(
        "(%s)" % pattern
        for pattern in (list_regexp_str, table_regexp_str, blocktext_regexp_str)
        ),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Split the string 's' into (type, text) tuples. Lists have the type "list",
    tables the type "table", and headings and blockquotes the matched prefix
    (such as "h1" or "bq") with only the text after the prefix retained. The
    text between and around such elements is given the type None.
    """

    elements = []
    position = 0

    for found in blockelement_regexp.finditer(s):
        begin, finish = found.span()

        # Classify the element by the named group that took part in the match.

        if found.group("listtype"):
            kind = "list"
        elif found.group("celltype"):
            kind = "table"
        else:
            kind = found.group("type")

        # Only headings and blockquotes define "text"; other elements retain
        # their complete source text.

        content = found.group("text") or s[begin:finish]

        elements.append((None, s[position:begin]))
        elements.append((kind, content))
        position = finish

    elements.append((None, s[position:]))
    return elements

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split the string 's' on runs of blank lines, returning the pieces that
    contain something other than whitespace.
    """

    pieces = []
    for piece in block_regexp.split(s):
        if piece.strip():
            pieces.append(piece)
    return pieces

# Block inspection.

def get_blocks(s):

    """
    Return (type, text) blocks from the string 's'. Headings, lists and tables
    are passed through as found, whereas the remaining text is divided into
    untyped blocks at blank lines.
    """

    result = []

    for kind, content in get_block_elements(s):

        # Untyped text may hold several paragraphs needing separation.

        if kind is None:
            result.extend([(None, piece) for piece in get_basic_blocks(content)])

        # Heading, list and table blocks are kept intact.

        else:
            result.append((kind, content))

    return result

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples, one for each line of the given list
    'text' consisting of a list marker followed by item text.
    """

    return [found.group("marker", "text") for found in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str      = r"(?<!\\)[[](?P<linktext>.*?)]"
image_regexp_str     = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str     = r"{(?P<macro>.*?)(?::(?P<options>.*?))?}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str    = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str      = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str       = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str       = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# The order of the alternatives decides which notation wins where several could
# match at the same position.

content_regexp_str = "|".join(
    "(%s)" % pattern
    for pattern in (
        monospace_regexp_str,
        link_regexp_str,
        image_regexp_str,
        macro_regexp_str,
        italic_regexp_str,
        bold_regexp_str,
        del_regexp_str,
        underline_regexp_str,
        sub_regexp_str,
        )
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Cell separators are tried only after inline content so that separators
# appearing inside links, images and so on do not split cells.

table_content_regexp_str = "%s|(%s)" % (content_regexp_str, cellsep_regexp_str)

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text', where
    the cellsep is the first separator found in a row and the columns are the
    untranslated cell texts. Processing stops at the first empty row.
    """

    rows = []

    for line in text.split("|\n"):
        if not line:
            break

        # Restore the final separator removed by the split.

        line += "|"

        separator = None
        cells = [""]
        position = 0

        for found in table_content_regexp.finditer(line):
            begin, finish = found.span()
            cells[-1] += line[position:begin]

            celltype = found.group("celltype")

            # Inline content is kept verbatim within the current cell.

            if not celltype:
                cells[-1] += found.group()

            # A separator starts a new cell, with the first one seen defining
            # the row's separator.

            else:
                if separator is None:
                    separator = celltype
                cells.append("")

            position = finish

        cells[-1] += line[position:]

        # Discard the text before the first and after the last separator.

        if separator:
            rows.append((separator, cells[1:-1]))

    return rows

# Notation conversion.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    (r"\[", "<<Verbatim([)>>"),
    (r"\]", "<<Verbatim(])>>"),
    (r"\*", "*"),
    ]

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.
384 385 markers = { 386 "*" : "*", 387 "#" : "1.", 388 "-" : "*", 389 } 390 391 cellseps = { 392 "|" : "\n|| ", 393 "||" : "\n|| ", 394 } 395 396 cellextra = { 397 "|" : "", 398 "||" : "'''", 399 } 400 401 sectiontypes = { 402 "code" : "", 403 "excerpt" : "#!wiki", 404 "noformat" : "", 405 "quote" : "", 406 "info" : "#!wiki important", 407 "note" : "#!wiki caution", 408 "tip" : "#!wiki tip", 409 "warning" : "#!wiki warning", 410 } 411 412 preformatted_sectiontypes = (None, "noformat") 413 414 macroargs = { 415 "color" : "col", 416 } 417 418 macrotypes = { 419 "anchor" : "<<Anchor(%(args)s)>>", 420 "color" : "<<Color2(%(content)s, %(args)s)>>", 421 "toc" : "<<TableOfContents>>", 422 } 423 424 class ConfluenceParser: 425 426 "A parser for Confluence markup." 427 428 def __init__(self): 429 self.max_level = self.level = 0 430 self.in_heading = False 431 self.held_anchors = [] 432 self.macro = None 433 self.sections = [] 434 435 def translate_marker(self, marker): 436 437 "Translate the given 'marker' to a suitable Moin representation." 438 439 return " " * len(marker) + markers[marker[-1]] 440 441 def translate_cellsep(self, cellsep): 442 443 "Translate the given 'cellsep' to a suitable Moin representation." 444 445 return cellseps[cellsep] 446 447 def translate_cell(self, cellsep, text): 448 449 "Using 'cellsep', translate the cell 'text'." 450 451 return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep] 452 453 def translate_content_match(self, match): 454 455 "Translate the content described by the given 'match', returning a string." 456 457 if match.group("monotext"): 458 self.enter_section(); self.leave_section() 459 return "{{{%s}}}" % match.group("monotext") 460 461 elif match.group("linktext"): 462 parts = match.group("linktext").split("|") 463 464 # NOTE: Proper detection of external links required. 
465 466 if len(parts) == 1: 467 label, target, title = None, parts[0], None 468 elif len(parts) == 2: 469 (label, target), title = parts, None 470 else: 471 label, target, title = parts 472 473 target = target.strip() 474 475 # Look for namespace links and rewrite them. 476 477 if target.find(":") != -1: 478 prefix = "" 479 space, rest = target.split(":", 1) 480 if space not in URL_SCHEMES: 481 rest = get_page_title(rest) 482 target = "%s/%s" % (space, rest) 483 484 # Detect anchors. 485 486 elif target.startswith("#"): 487 prefix = "" 488 489 # Detect attachments. 490 491 elif target.startswith("^"): 492 prefix = "attachment:" 493 494 # Link to other pages within a space. 495 496 else: 497 prefix = "../" 498 499 # Make the link tidier by making a target if none was given. 500 501 if not label: 502 label = target 503 504 target = get_page_title(target) 505 506 if not label and not title: 507 return "[[%s%s]]" % (prefix, target) 508 elif not title: 509 return "[[%s%s|%s]]" % (prefix, target, label) 510 else: 511 return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title) 512 513 elif match.group("imagetext"): 514 parts = match.group("imagetext").split("|") 515 516 # NOTE: Proper detection of external links required. 517 518 if parts[0].startswith("http"): 519 prefix = "" 520 else: 521 prefix = "attachment:" 522 523 # NOTE: Proper options conversion required. 
524 525 if len(parts) == 1: 526 return "{{%s%s}}" % (prefix, parts[0]) 527 else: 528 return "{{%s%s|%s}}" % (prefix, parts[0], parts[1]) 529 530 elif match.group("macro"): 531 macro_name = match.group("macro") 532 if macrotypes.has_key(macro_name): 533 argname = macroargs.get(macro_name) 534 result = macrotypes[macro_name] % { 535 "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + (match.group("options") or "")) 536 } 537 if not self.forbids_macros(): 538 return result 539 if macro_name == "anchor": 540 self.held_anchors.append(result) 541 return "" 542 543 elif match.group("italictext"): 544 return "''%s''" % self.translate_content(match.group("italictext")) 545 546 elif match.group("boldtext"): 547 return "'''%s'''" % self.translate_content(match.group("boldtext")) 548 549 elif match.group("deltext"): 550 return "--(%s)--" % self.translate_content(match.group("deltext")) 551 552 elif match.group("underlinetext"): 553 return "__%s__" % self.translate_content(match.group("underlinetext")) 554 555 elif match.group("subtext"): 556 return ",,%s,," % self.translate_content(match.group("subtext")) 557 558 else: 559 return self.translate_text(match.group()) 560 561 def translate_text(self, s, preformatted=False): 562 563 "Translate the plain text string 's', converting notation." 564 565 for before, after in preformatted and preformatted_notation_mapping or notation_mapping: 566 s = s.replace(before, after) 567 return s 568 569 def translate_content(self, text): 570 571 """ 572 Return a translation of the given 'text'. If the optional 'sectiontype' is 573 specified, the translation may be modified to a form appropriate to the 574 section being translated. 575 """ 576 577 parts = [] 578 preformatted = self.is_preformatted() 579 580 last = 0 581 for match in content_regexp.finditer(text): 582 start, end = match.span() 583 parts.append(self.translate_text(text[last:start], preformatted)) 584 585 # Handle unformatted sections. 
586 587 if self.sections and self.sections[-1] in ("code", "noformat"): 588 parts.append(match.group()) 589 else: 590 parts.append(self.translate_content_match(match)) 591 592 last = end 593 594 parts.append(self.translate_text(text[last:], preformatted)) 595 return "".join(parts) 596 597 def is_preformatted(self): 598 return reduce(operator.or_, [x in preformatted_sectiontypes for x in self.sections], False) 599 600 def translate_block(self, blocktype, blocktext): 601 602 "Translate the block with the given 'blocktype' and 'blocktext'." 603 604 if blocktype in headings: 605 self.in_heading = True 606 self.held_anchors = [] 607 608 parts = [] 609 610 # Translate headings and blockquotes. 611 612 if blocktypes.has_key(blocktype): 613 text = self.parse_text(blocktext) 614 for anchor in self.held_anchors: 615 parts.append(anchor) 616 parts.append(blocktypes[blocktype] % text) 617 618 # Translate list items. 619 620 elif blocktype == "list": 621 for listmarker, listitem in get_list_items(blocktext): 622 parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem))) 623 624 # Translate table items. 625 626 elif blocktype == "table": 627 628 # Enter the table. 629 630 self.enter_section("table") 631 632 table_parts = [] 633 first = True 634 635 for cellsep, columns in get_table_rows(blocktext): 636 if not first: 637 table_parts.append("==") 638 else: 639 first = False 640 moinsep = self.translate_cellsep(cellsep) 641 table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns])) 642 643 # Nest the section appropriately. 644 645 opening, closing = self.nest_section() 646 647 parts.append("%s#!table" % opening) 648 parts += table_parts 649 parts.append(closing) 650 651 # Leave the table. 652 653 self.leave_section() 654 655 # Handle anonymous blocks. 
656 657 else: 658 parts.append(self.parse_text(blocktext)) 659 660 if blocktype in headings: 661 self.in_heading = False 662 663 return "\n".join(parts) 664 665 def translate_section(self, sectiontype, options, text): 666 667 """ 668 Translate the section with the given 'sectiontype', 'options' and 669 'text'. 670 """ 671 672 parts = [] 673 674 # Enter the section. 675 676 self.enter_section(sectiontype) 677 678 # Sections can contain other sections. 679 680 if sectiontype == "noformat": 681 section_content = self.translate_content(text.strip("\n")) 682 else: 683 section_content = self.parse_text(text.strip()) 684 685 # Nest the section appropriately. 686 687 opening, closing = self.nest_section() 688 mointype = sectiontypes.get(sectiontype) 689 690 parts.append("%s%s\n" % (opening, mointype or "")) 691 parts.append(section_content) 692 parts.append("\n%s\n" % closing) 693 694 # Leave the section. 695 696 self.leave_section() 697 698 return parts 699 700 def enter_section(self, sectiontype=None): 701 self.level += 1 702 self.max_level = max(self.level, self.max_level) 703 self.sections.append(sectiontype) 704 705 def leave_section(self): 706 self.level -= 1 707 if not self.level: 708 self.max_level = 0 709 self.sections.pop() 710 711 def nest_section(self): 712 level = 3 + self.max_level - self.level 713 opening = "{" * level 714 closing = "}" * level 715 return opening, closing 716 717 # General parsing. 718 719 def parse_text(self, s, top=False): 720 721 "Parse the content in the string 's', returning the translation." 722 723 parts = [] 724 725 # Control spacing between blocks and other blocks or sections. 726 727 preceded_by_block = False 728 729 for type, text in get_regions(s): 730 731 # Handle list, heading, blockquote or anonymous blocks. 732 733 if type is None: 734 735 # Where the region is the same as the provided text, return 736 # immediately. This is the base case of the recursive parsing 737 # process. 
738 739 if text == s and not top: 740 return self.translate_content(text) 741 742 # Otherwise, obtain and translate the blocks. 743 744 if preceded_by_block: 745 parts.append("\n") 746 747 first = True 748 for blocktype, blocktext in get_blocks(text): 749 if not first: 750 parts.append("\n") 751 else: 752 first = False 753 parts.append("%s" % self.translate_block(blocktype, blocktext)) 754 755 if not first: 756 preceded_by_block = True 757 758 # Handle sections. 759 760 else: 761 sectiontype, options = type 762 763 # Direct translations of sections. 764 765 if sectiontypes.has_key(sectiontype): 766 if preceded_by_block: 767 parts.append("\n") 768 769 parts += self.translate_section(sectiontype, options, text) 770 preceded_by_block = True 771 772 # Translations of macros acting as sections. 773 774 elif macrotypes.has_key(sectiontype): 775 776 # Prevent the production of macros in places they would 777 # produce illegal Moin syntax. 778 779 if not self.forbids_macros(): 780 self.macro = sectiontype 781 argname = macroargs.get(sectiontype) 782 parts.append(macrotypes[sectiontype] % { 783 "content" : quote_macro_argument(self.parse_text(text)), 784 "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + options) 785 }) 786 self.macro = None 787 788 # Include the contents of section-based macros where the 789 # macros themselves are not allowed. 790 791 else: 792 parts.append(self.translate_content(text)) 793 794 preceded_by_block = False 795 796 # Unrecognised sections. 797 798 else: 799 parts += self.translate_section(sectiontype, None, text) 800 preceded_by_block = False 801 802 return "".join(parts) 803 804 def forbids_macros(self): 805 return self.in_heading or self.macro 806 807 def parse(s, out): 808 809 "Parse the content in the string 's', writing a translation to 'out'." 
810 811 parser = ConfluenceParser() 812 out.write(parser.parse_text(s, top=True)) 813 814 if __name__ == "__main__": 815 s = codecs.getreader("utf-8")(sys.stdin).read() 816 out = codecs.getwriter("utf-8")(sys.stdout) 817 parse(s, out) 818 819 # vim: tabstop=4 expandtab shiftwidth=4