#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

from common import *
import re
import sys
import codecs

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text). The type is None for text found outside any section
    (a "null" region); for a section it is whatever get_section_details
    provides, normally a (sectiontype, options) tuple.
    """

    last = 0
    regions = []
    depth = 0

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options")

        # The start of a region is either indicated by a marker with options or
        # by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as a
            # "null" region, and start a new region which is maintained as a
            # string until it is terminated.

            if not depth:
                regions.append((None, s[last:start]))
                regions.append(s[start:end])

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            # Any start marker will cause an increase in the depth of the region
            # nesting.

            depth += 1

        # The end of a region is indicated by a marker with no options. Such a
        # marker is only ever seen here while a region is active, since markers
        # seen with no active region are handled above as start markers.

        else:

            # With more than one region active, the end marker and preceding
            # text are incorporated into the current region.

            if depth > 1:
                regions[-1] += s[last:end]

            # Otherwise, terminate the active region, interpreting its contents.

            else:
                regions[-1] = get_section_details(regions[-1] + s[last:end])

            depth -= 1

        last = end

    # Where a region is still active, terminate it.

    if depth:
        regions[-1] = get_section_details(regions[-1] + s[last:])

    # Otherwise, add a "null" region.

    else:
        regions.append((None, s[last:]))

    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text). For text that
    matches the section pattern, the type is a (sectiontype, options) tuple and
    the text is the section body; otherwise the type is None and the text is 's'
    unchanged.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples in document order. Text between recognised
    elements is included with a type of None.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one of the alternatives can have matched, and so the named groups
        # reveal which kind of element was found.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))

        # Headings and blockquotes provide their own text; lists and tables (and
        # headings with no text) are represented by the whole match.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate. Each block is a
    (type, text) tuple, with a type of None indicating an anonymous block.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + macro_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# The table content pattern is the general content pattern with the cell
# separator added as a final alternative.

table_content_regexp_str = "".join([
    content_regexp_str,
    "|",
    "(", cellsep_regexp_str, ")",
    ])

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text', where
    cellsep is the first cell separator found in a row and columns is a list of
    the strings found between the row's separators.
    """

    result = []

    for line in text.split("|\n"):

        # An empty row ends the processing of the table.

        if not line:
            break

        # Restore the closing separator removed when splitting the text.

        line = line + "|"

        separator = None
        cells = []
        fragments = []
        position = 0

        for found in table_content_regexp.finditer(line):

            # Plain text before the match belongs to the current cell.

            fragments.append(line[position:found.start()])
            celltype = found.group("celltype")

            # A separator completes the current cell and starts another, with
            # the first separator seen defining the separator for the row.

            if celltype:
                if separator is None:
                    separator = celltype
                cells.append("".join(fragments))
                fragments = []

            # Other content is retained verbatim within the current cell, so
            # that separators inside such content do not split cells.

            else:
                fragments.append(found.group())

            position = found.end()

        # Complete the final cell using any remaining text.

        fragments.append(line[position:])
        cells.append("".join(fragments))

        # Discard the text before the first separator and after the last one,
        # ignoring rows where no separator was found at all.

        if separator:
            result.append((separator, cells[1:-1]))

    return result

# Notation conversion.

# Each entry is a (before, after) pair: the Confluence notation and its
# replacement in ordinary text.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

# The equivalent pairs for preformatted text, where line breaks remain as
# newlines.

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

# Moin list markers for each Confluence list marker character.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Text used to join the cells of a row for each kind of cell separator.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Markup wrapped around each cell's content for each kind of cell separator.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Section types translated directly, with the text following the opening braces
# of the resulting Moin section.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

preformatted_sectiontypes = (None, "noformat")

# Names given to the options of particular macros when passed as arguments.

macroargs = {
    "color" : "col",
    }

# Templates for the Moin form of each supported macro.

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):

        # The current section nesting level and the deepest level reached since
        # the outermost section was entered.

        self.max_level = self.level = 0

        # Whether a heading is being translated, plus any anchors withheld from
        # the heading text.

        self.in_heading = False
        self.held_anchors = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # Indent by the length of the marker and use its last character to
        # choose the kind of list item.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        if match.group("monotext"):

            # Record an extra level of nesting so that any enclosing section is
            # given longer brace markers than the ones produced here.

            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):
            parts = match.group("linktext").split("|")

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None

            # Any parts beyond the third are kept as part of the title instead
            # of causing an unpacking error.

            else:
                label, target = parts[:2]
                title = "|".join(parts[2:])

            target = target.strip()

            # Look for namespace links and rewrite them.

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.
            # NOTE(review): the leading "^" is retained in the target; confirm
            # NOTE(review): that this is what the attachment syntax expects.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

            # Make the link tidier by making a target if none was given.

            if not label:
                label = target

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        elif match.group("macro"):
            macro_name = match.group("macro")

            if macro_name in macrotypes:
                argname = macroargs.get(macro_name)
                args = quote_macro_argument(
                    (("%s=" % argname) if argname else "") + match.group("options")
                    )

                # Only the arguments are available for an inline macro marker.
                # Where the template needs something else (such as the content
                # of a colour macro), pass the marker through as plain text
                # instead of failing with a KeyError.

                try:
                    result = macrotypes[macro_name] % {"args" : args}
                except KeyError:
                    return self.translate_text(match.group())

                if not self.forbids_macros():
                    return result

                # Anchors are withheld where macros are forbidden so that they
                # can be emitted separately from the text being translated.

                if macro_name == "anchor":
                    self.held_anchors.append(result)

            # Unsupported or forbidden macros produce no output.

            return ""

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        # Matches providing no usable text are treated as plain text.

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. Where
        'preformatted' is set to a true value, the notation mapping for
        preformatted text is used.
        """

        mapping = preformatted_notation_mapping if preformatted else notation_mapping

        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text, sectiontype=None):

        """
        Return a translation of the given 'text'. If the optional 'sectiontype' is
        specified, the translation may be modified to a form appropriate to the
        section being translated.
        """

        parts = []

        # Plain text is treated as preformatted for the section types listed in
        # preformatted_sectiontypes, which includes the default of None.

        preformatted = sectiontype in preformatted_sectiontypes

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if sectiontype in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            text = self.translate_content(blocktext)

            # Emit any anchors withheld from the text before the block itself.

            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.translate_content(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):

                # Separate each row from the previous one.

                if not first:
                    table_parts.append("==")
                else:
                    first = False

                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.translate_content(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of strings making up the translation.
        """

        parts = []

        # Enter the section.

        self.enter_section()

        mointype = sectiontypes.get(sectiontype)
        section_content = self.translate_content(text.strip(), sectiontype)

        # Nest the section appropriately. This is done after translating the
        # content so that any nesting found within it is taken into account.

        opening, closing = self.nest_section()

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self):

        "Enter a section, recording the deepest level of nesting reached."

        self.level += 1
        self.max_level = max(self.level, self.max_level)

    def leave_section(self):

        "Leave a section, forgetting the deepest level when no section remains."

        self.level -= 1
        if not self.level:
            self.max_level = 0

    def nest_section(self):

        """
        Return an (opening, closing) tuple of brace markers for the current
        section. At least three braces are used, with one more for each level
        of nesting recorded beneath the current level.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s):

        "Parse the content in the string 's', returning the translation."

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for region_type, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if region_type is None:
                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append(self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = region_type

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros (which can look like sections).

                elif sectiontype in macrotypes and not self.forbids_macros():
                    argname = macroargs.get(sectiontype)

                    # A region opened by a marker without options has options
                    # of None, which is treated as an empty string.

                    parts.append(macrotypes[sectiontype] % {
                        "content" : quote_macro_argument(self.parse_text(text)),
                        "args" : quote_macro_argument((("%s=" % argname) if argname else "") + (options or ""))
                        })
                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        "Return whether macros are forbidden, as is the case within headings."

        return self.in_heading

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s))

if __name__ == "__main__":

    # Use the underlying byte streams where they are provided separately from
    # the text streams, so that the UTF-8 codecs always operate on bytes.

    s = codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin)).read()
    out = codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout))
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4