#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then split into regions.
"""

from common import *
import re
import sys
import codecs
import operator

# Section extraction.

# The pattern recognises, in order: a {type} or {type:options} marker, the
# start of a table row, the end of a table row, and a complete list item line.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}" \
    r"|" \
    r"^(?P<rowstart>[|]{1,2})" \
    r"|" \
    r"(?P<rowend>[|]{1,2}(\n|$))" \
    r"|" \
    r"^(?P<listitem>\s*[*#-]+\s+.*?([^|](\n|$)|(?=[|](\n|$))))"

sections_regexp = re.compile(sections_regexp_str, re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).

    The tuples are produced by get_section_details: 'type' is None for text that
    is not a complete section, or a (sectiontype, options) tuple for a section,
    in which case 'text' is the body found between the section markers. Empty
    regions are omitted from the result.
    """

    last = 0            # end offset of the previous match
    regions = [""]      # accumulated region strings; the last one is "open"
    depth = 0           # number of currently active section/row regions
    had_row = False     # whether the previous match was a row marker
    had_item = False    # whether the previous match was a list item

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")
        is_item = match.group("listitem")

        # The start of a region is either indicated by a marker with options or
        # by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as a
            # "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # A new row may either continue a table region or start a new
                # table region.

                elif is_row:
                    if had_row and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()
                    else:
                        regions.append(s[start:end])

                # A list item may either continue a list region or start a new
                # list region.

                elif is_item:

                    # If continuing a list, merge the list regions and start a
                    # new potentially separate region.

                    if had_item and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions[-1] = ""

                    # If not continuing a list, make a region for a new list and
                    # start a new potentially separate region.

                    else:
                        regions.append(s[start:end])
                        regions.append("")

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section or is_row:
                depth += 1

        # The end of a region is indicated by a marker with no options or the
        # end of a row.

        else:
            # Where no region is active, the text since the last match plus the
            # marker are added to the current "null" region.

            if not depth:

                # Add to the string portion of the "null" region.

                regions[-1] += s[last:end]

            # Where a region is active, the end marker and preceding text is
            # either incorporated into the current region if more than one
            # region is active, or the preceding text is incorporated into the
            # current region and the details of the region are then obtained.

            else:
                if depth > 1 or (not is_section and not is_row):
                    regions[-1] += s[last:end]

                # Terminate the active region, interpreting its contents.

                else:
                    regions[-1] += s[last:end]
                    regions.append("")

                if is_section or is_row:
                    depth -= 1

        had_row = is_row
        had_item = is_item
        last = end

    # Where a region is still active, terminate it.

    regions[-1] += s[last:]

    # Use a distinct name for the loop variable so that the parameter 's' is not
    # rebound by the comprehension.

    return [get_section_details(region) for region in regions if region]

def is_section_marker(sectiontype):

    """
    Return whether 'sectiontype' names a section: either a key of the
    'sectiontypes' mapping or the special "color" macro. The value None, as
    supplied for matches that are not {...} markers, is never a section marker
    since it is neither "color" nor one of the string keys.
    """

    # The "in" operator replaces dict.has_key, which Python 3 no longer
    # provides, and behaves identically on Python 2.

    return sectiontype in sectiontypes or sectiontype == "color"

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    Where 's' begins with a {type} or {type:options} marker and contains a
    matching closing {type} marker, the result is ((sectiontype, options),
    text), with 'options' being None when no options were given. Otherwise, the
    result is (None, s).
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.
list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

# Each alternative is wrapped in its own group: lists first, then tables, then
# headings and blockquotes.

blockelement_regexp = re.compile(
    "|".join([
        "(%s)" % pattern
        for pattern in (list_regexp_str, table_regexp_str, blocktext_regexp_str)
        ]),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Split the string 's' around the headings, blockquotes, tables and lists
    found within it, returning a list of (type, text) tuples.

    Text between recognised elements is given the type None and is always
    included, even when empty. Lists have the type "list", tables the type
    "table", and headings and blockquotes the type given by their prefix (such
    as "h1" or "bq") with only the text following the prefix retained.
    """

    elements = []
    position = 0

    for found in blockelement_regexp.finditer(s):
        begin, finish = found.span()

        # Classify the element according to the alternative that matched.

        if found.group("listtype"):
            kind = "list"
        elif found.group("celltype"):
            kind = "table"
        else:
            kind = found.group("type")

        # Only headings and blockquotes define a "text" group; other elements
        # retain their complete matched text.

        content = found.group("text") or s[begin:finish]

        elements.append((None, s[position:begin]))
        elements.append((kind, content))
        position = finish

    elements.append((None, s[position:]))
    return elements

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split the string 's' on runs of blank lines, returning only those pieces
    that contain something other than whitespace.
    """

    pieces = []
    for piece in block_regexp.split(s):
        if piece.strip():
            pieces.append(piece)
    return pieces

# Block inspection.

def get_blocks(s):

    """
    Return a list of (type, text) blocks for the string 's'. Headings, lists and
    tables are passed through as found, whereas all other text is divided into
    anonymous blocks, having the type None, at blank lines.
    """

    result = []

    for kind, content in get_block_elements(s):

        # Untyped text is searched for paragraph-like subblocks.

        if kind is None:
            result.extend([(None, piece) for piece in get_basic_blocks(content)])

        # Heading, list and table blocks are retained as they are.

        else:
            result.append((kind, content))

    return result

# List item inspection.
listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples, one for each line of the given list
    'text' that consists of a list marker followed by whitespace and item text.
    """

    return [
        (found.group("marker"), found.group("text"))
        for found in listitem_regexp.finditer(text)
        ]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# The alternatives are tried in the order given, each in a group of its own.

content_regexp_str = "|".join([
    "(%s)" % pattern
    for pattern in (
        monospace_regexp_str,
        link_regexp_str,
        image_regexp_str,
        macro_regexp_str,
        italic_regexp_str,
        bold_regexp_str,
        del_regexp_str,
        underline_regexp_str,
        sub_regexp_str,
        )
    ])

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Within tables, cell separators are only sought after all other content.

table_content_regexp_str = "%s|(%s)" % (content_regexp_str, cellsep_regexp_str)

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text', where
    'cellsep' is the first cell separator found in a row ("|" or "||") and
    'columns' is a list of the strings found between the separators.

    Rows are delimited by a separator at the end of a line. Processing stops at
    the first empty row, and rows without any separator are omitted.
    """

    result = []

    for line in text.split("|\n"):
        if not line:
            break

        # Restore the separator consumed when splitting the text into rows.

        line = line + "|"
        separator = None
        cells = [""]
        position = 0

        for found in table_content_regexp.finditer(line):
            begin, finish = found.span()
            cells[-1] += line[position:begin]

            celltype = found.group("celltype")

            # Other content, which may itself contain "|" characters, is kept
            # intact within the current cell.

            if not celltype:
                cells[-1] += found.group()

            # A separator completes the current cell and begins another.

            else:
                if separator is None:
                    separator = celltype
                cells.append("")

            position = finish

        cells[-1] += line[position:]

        # Discard the text before the first separator and after the last.

        if separator:
            result.append((separator, cells[1:-1]))

    return result

# Notation conversion.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.
# Confluence list marker characters and their Moin equivalents.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Text used to join the translated cells of a row.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Markup placed around the text of each cell: header cells are made bold.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Confluence section types and the Moin parser/format lines opening them.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

# Entries of the parser's section stack that select the preformatted notation
# mapping. None is the entry recorded by enter_section when no type is given.

preformatted_sectiontypes = (None, "noformat")

# Names of the arguments used to pass macro options.

macroargs = {
    "color" : "col",
    }

# Templates for the Moin macros replacing Confluence macros.

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):
        # Current nesting level, and the deepest level reached since the
        # outermost section was entered.
        self.max_level = self.level = 0

        # Whether a heading is being translated.
        self.in_heading = False

        # Anchor macros withheld because macros were forbidden when they were
        # encountered.
        self.held_anchors = []

        # The section-based macro being translated, if any.
        self.macro = None

        # The types of the sections currently entered, outermost first.
        self.sections = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # One space of indentation is produced for each character of the
        # marker, with the final character choosing the kind of item.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def _macro_arguments(self, macro_name, options):

        """
        Return the quoted argument string for the macro with the given
        'macro_name' and 'options', where 'options' may be None or empty.
        """

        argname = macroargs.get(macro_name)
        prefix = argname and ("%s=" % argname) or ""
        return quote_macro_argument(prefix + (options or ""))

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        if match.group("monotext"):

            # Enter and leave a section so that max_level records the braces
            # used here, causing any enclosing section to use more of them.

            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):

            # At most three parts are obtained so that links containing further
            # "|" characters cannot cause an unpacking error: any additional
            # text remains part of the title.

            parts = match.group("linktext").split("|", 2)

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them.

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    rest = get_page_title(rest)
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

            # Make the link tidier by making a label if none was given.

            if not label:
                label = target

            target = get_page_title(target)

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        elif match.group("macro"):
            macro_name = match.group("macro")
            template = macrotypes.get(macro_name)

            # A lone marker provides no content, so only templates needing
            # nothing but arguments can be filled in here. Filling in a
            # template requiring content would raise a KeyError.

            if template is not None and "%(content)s" not in template:
                result = template % {
                    "args" : self._macro_arguments(macro_name, match.group("options"))
                    }
                if not self.forbids_macros():
                    return result

                # Anchors are retained for inclusion ahead of the block.

                if macro_name == "anchor":
                    self.held_anchors.append(result)

            # Macros that cannot be translated here produce no output.

            return ""

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. If
        'preformatted' is set to a true value, the mapping for preformatted
        text is used.
        """

        if preformatted:
            mapping = preformatted_notation_mapping
        else:
            mapping = notation_mapping

        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text', converting inline markup and
        notation. Within "code" and "noformat" sections, inline markup is left
        as it is.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        """
        Return whether any of the currently entered sections has a type found
        in 'preformatted_sectiontypes'.
        """

        # The any function replaces the use of reduce, which is not a built-in
        # function in Python 3, and also yields False where no sections exist.

        return any([x in preformatted_sectiontypes for x in self.sections])

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            text = self.parse_text(blocktext)

            # Anchors withheld during translation are placed before the block.
            # NOTE(review): held_anchors is only reset for headings above, so
            # other block types reaching this point would see anchors left by
            # earlier blocks - confirm against the contents of 'blocktypes'.

            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):

                # Rows after the first are preceded by a row separator.

                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of strings to be joined by the caller.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections.

        if sectiontype == "noformat":
            section_content = self.translate_content(text.strip("\n"))
        else:
            section_content = self.parse_text(text.strip())

        # Nest the section appropriately. This must follow the translation of
        # the content so that the nesting within the section is known.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        """
        Record entry into a section of the given 'sectiontype', updating the
        current and maximum nesting levels.
        """

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        """
        Record departure from the current section. The maximum nesting level is
        reset once the outermost section has been left.
        """

        self.level -= 1
        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return an (opening, closing) tuple of brace strings for the current
        section. Three braces are used for a section containing nothing more
        deeply nested, with one more brace for each level of nesting recorded
        beneath it.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        """
        Parse the content in the string 's', returning the translation. If
        'top' is set to a true value, 's' is always treated as a collection of
        blocks, even if it consists of a single plain region.
        """

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for regiontype, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if regiontype is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = regiontype

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif sectiontype in macrotypes:

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype

                        # The options are None for markers without options,
                        # which the argument helper treats as an empty string.

                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : self._macro_arguments(sectiontype, options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        """
        Return a true value if macros must not be produced: that is, within
        headings and within section-based macros.
        """

        return self.in_heading or self.macro

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s, top=True))

if __name__ == "__main__":

    # Wrap the underlying byte streams where the standard streams provide them
    # (as in Python 3), falling back to the streams themselves otherwise.

    s = codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin)).read()
    out = codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout))
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4