#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then split into regions.
"""

from common import *
import re
import sys
import codecs
import operator

# Section extraction.

# The alternatives recognise, in order:
# - a {type} or {type:options} section marker (not preceded by another "{"),
# - a table row start ("|" or "||" at the start of a line),
# - a table row end ("|" or "||" before a newline or end of input),
# - a whole list item line (leading "*", "#" or "-" markers plus text).

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}" \
    r"|" \
    r"^(?P<rowstart>[|]{1,2})" \
    r"|" \
    r"(?P<rowend>[|]{1,2}(\n|$))" \
    r"|" \
    r"^(?P<listitem>\s*[*#-]+\s+.*(\n|$))"

sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text), as produced by get_section_details. Plain text
    between markers is gathered into "null" regions whose type is None.
    """

    last = 0                # end offset of the previous match in 's'
    regions = [""]          # regions built up as strings; the last is current
    depth = 0               # nesting level of active sections/tables
    had_row = False         # whether the previous match was a table row marker
    had_item = False        # whether the previous match was a list item

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")
        is_item = match.group("listitem")

        # The start of a region is either indicated by a marker with options or
        # by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as a
            # "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # A new row may either continue a table region (when it starts
                # exactly where the previous row marker ended) or start a new
                # table region.

                elif is_row:
                    if (last != start or not had_row):
                        regions.append(s[start:end])
                    else:
                        # Merge the current region back into its predecessor.
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()

                # A list item may either continue a list region or start a new
                # list region.

                elif is_item:
                    if (last != start or not had_item):
                        regions.append(s[start:end])
                    else:
                        # Merge the current region back into its predecessor.
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section or is_row:
                depth += 1

        # The end of a region is indicated by a marker with no options.

        else:
            # Where no region is active, the text since the last match plus the
            # marker are added to the current "null" region.

            if not depth:

                # Add to the string portion of the "null" region.

                regions[-1] += s[last:end]

            # Where a region is active, the end marker and preceding text is
            # either incorporated into the current region if more than one
            # region is active, or the preceding text is incorporated into the
            # current region and the details of the region are then obtained.

            else:
                if depth > 1 or (not is_section and not is_row):
                    regions[-1] += s[last:end]

                # Terminate the active region, interpreting its contents.

                else:
                    regions[-1] += s[last:end]
                    regions.append("")

                if is_section or is_row:
                    depth -= 1

        had_row = is_row
        had_item = is_item
        last = end

    # Where a region is still active, terminate it.

    regions[-1] += s[last:]

    # Discard empty regions and interpret each remaining one. Note that 's'
    # deliberately shadows the parameter within the comprehension.

    return [get_section_details(s) for s in regions if s]

def is_section_marker(sectiontype):

    # "color" is handled as a macro-like section even though it has no entry
    # in the sectiontypes mapping (defined later in this module).

    return sectiontypes.has_key(sectiontype) or sectiontype == "color"

# Section inspection.

# Matches a {type:options}...{type} span, capturing the type, the optional
# options and the enclosed section text.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        # Plain text: no section type or options apply.
        return None, s

# Heading, table and list extraction.
# Patterns for whole list runs, whole tables and single heading/blockquote
# lines respectively.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "|".join("(%s)" % pattern for pattern in (
        list_regexp_str,
        table_regexp_str,
        blocktext_regexp_str,
        )),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples. The type is "list", "table", a heading or
    blockquote label such as "h1" or "bq", or None for the plain text found
    between recognised elements.
    """

    elements = []
    pos = 0

    for match in blockelement_regexp.finditer(s):
        begin, finish = match.span()

        # Decide which alternative matched.

        if match.group("listtype"):
            elementtype = "list"
        elif match.group("celltype"):
            elementtype = "table"
        else:
            elementtype = match.group("type")

        # Record the preceding plain text, then the element itself. Headings
        # and blockquotes provide their text via the "text" group; other
        # elements use the whole matched span.

        elements.append((None, s[pos:begin]))
        elements.append((elementtype, match.group("text") or s[begin:finish]))
        pos = finish

    elements.append((None, s[pos:]))
    return elements

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [piece for piece in block_regexp.split(s) if piece.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Attempt to find new subblocks in plain regions; collect heading,
        # list and table blocks as they are.

        if blocktype is None:
            blocks.extend((None, basic) for basic in get_basic_blocks(blocktext))
        else:
            blocks.append((blocktype, blocktext))

    return blocks

# List item inspection.
listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(found.group("marker"), found.group("text"))
            for found in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# Combine the individual patterns into one alternation, each wrapped in a
# plain group.

content_regexp_str = "|".join("(%s)" % pattern for pattern in (
    monospace_regexp_str,
    link_regexp_str,
    image_regexp_str,
    macro_regexp_str,
    italic_regexp_str,
    bold_regexp_str,
    del_regexp_str,
    underline_regexp_str,
    sub_regexp_str,
    ))

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content plus cell separators, used when scanning table rows.

table_content_regexp_str = content_regexp_str + "|" "(" + cellsep_regexp_str + ")"

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for segment in text.split("|\n"):
        if not segment:
            break

        # Restore the trailing separator removed by the split.

        segment += "|"

        sep = None          # the first cell separator seen in this row
        cells = [""]        # accumulated cell texts; the first is a sentinel
        pos = 0

        for found in table_content_regexp.finditer(segment):
            begin, finish = found.span()
            cells[-1] += segment[pos:begin]

            # A separator closes the current cell; any other inline markup is
            # kept verbatim so it can be translated later.

            if found.group("celltype"):
                if sep is None:
                    sep = found.group("celltype")
                cells.append("")
            else:
                cells[-1] += found.group()

            pos = finish

        cells[-1] += segment[pos:]

        # The sentinel first entry and the empty final entry are discarded.

        if sep:
            rows.append((sep, cells[1:-1]))

    return rows

# Notation conversion.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.
# Moin list markers for each Confluence list marker character.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Moin cell separators for each Confluence cell separator.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Extra decoration applied to cell contents ("||" denotes a header cell,
# rendered in bold).

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Moin section/parser directives for each Confluence section type. Empty
# values indicate a plain Moin section.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

# Section types whose contents must not be reformatted. None covers anonymous
# (typeless) sections.

preformatted_sectiontypes = (None, "noformat")

# Macro argument names for macros requiring keyword arguments.

macroargs = {
    "color" : "col",
    }

# Moin macro templates for each Confluence macro.

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):

        # level is the current section nesting depth; max_level records the
        # deepest nesting seen so that nest_section can compute bracket runs.

        self.max_level = self.level = 0

        # Whether a heading is currently being translated (macros are not
        # permitted inside Moin headings).

        self.in_heading = False

        # Anchor macros held back until they can be emitted before a heading.

        self.held_anchors = []

        # The macro currently being produced, if any.

        self.macro = None

        # Stack of active section types (None for anonymous sections).

        self.sections = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # Indentation by one space per marker character, then the Moin marker
        # chosen from the last (innermost) Confluence marker character.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        if match.group("monotext"):

            # Enter and leave a section to influence the nesting level of any
            # enclosing sections.

            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):
            parts = match.group("linktext").split("|")

            # NOTE: Proper detection of external links required.

            # Links take the forms target, label|target, label|target|title.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them. URL_SCHEMES is
            # expected from the "common" module (star-imported above).

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    rest = get_page_title(rest)
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

            # Make the link tidier by making a target if none was given.

            if not label:
                label = target

            # NOTE(review): get_page_title is applied to all targets,
            # including URL-scheme targets — confirm this is intended.

            target = get_page_title(target)

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        elif match.group("macro"):
            macro_name = match.group("macro")

            # NOTE(review): a macro absent from macrotypes falls through this
            # branch returning None — presumably prevented upstream by the
            # section handling in parse_text; confirm.

            if macrotypes.has_key(macro_name):
                argname = macroargs.get(macro_name)
                result = macrotypes[macro_name] % {
                    "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + match.group("options"))
                    }
                if not self.forbids_macros():
                    return result

                # Macros are not allowed here: hold anchors back for emission
                # before the enclosing heading; drop everything else.

                if macro_name == "anchor":
                    self.held_anchors.append(result)
                return ""

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. Where
        'preformatted' is set, line-break notation is converted to literal
        newlines instead of Moin <<BR>> macros.
        """

        for before, after in preformatted and preformatted_notation_mapping or notation_mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text', converting inline markup
        except within "code" and "noformat" sections, whose markup is kept
        verbatim.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        "Return whether any active section is a preformatted section type."

        return reduce(operator.or_, [x in preformatted_sectiontypes for x in self.sections], False)

    def translate_block(self, blocktype, blocktext):

        """
        Translate the block with the given 'blocktype' and 'blocktext'.
        'headings' and 'blocktypes' are expected from the "common" module
        (star-imported above).
        """

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktypes.has_key(blocktype):
            text = self.parse_text(blocktext)

            # Emit any anchors collected while translating the heading text
            # before the heading itself.

            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):
                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of Moin text parts.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections.

        section_content = self.parse_text(text.strip())

        # Nest the section appropriately.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        "Record entry into a section of the given 'sectiontype'."

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        "Record departure from the current section."

        self.level -= 1

        # Reset the maximum level once all sections have been left.

        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return (opening, closing) brace runs for the current section, using
        longer runs for outer sections so that nested Moin sections remain
        distinguishable.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        "Parse the content in the string 's', returning the translation."

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for type, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if type is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = type

                # Direct translations of sections.

                if sectiontypes.has_key(sectiontype):
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif macrotypes.has_key(sectiontype):

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype
                        argname = macroargs.get(sectiontype)
                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        "Return whether macros may not currently be produced."

        return self.in_heading or self.macro

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s, top=True))

if __name__ == "__main__":

    # Read Confluence markup from standard input, write Moin markup to
    # standard output, both as UTF-8.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4