#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

from common import *
import re
import sys
import codecs

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split the string 's' into a list of regions, alternating between the plain
    text found outside sections and the sections themselves.

    Plain text is reported as (None, text), even where the text is empty.
    Each "{name}...{name}" section found by 'sections_regexp' is reported
    using whatever get_section_details returns for the matched text.
    """

    regions = []
    position = 0

    for found in sections_regexp.finditer(s):
        begin, finish = found.start(), found.end()

        # The text preceding the section, followed by the section itself.

        regions += [
            (None, s[position:begin]),
            get_section_details(s[begin:finish]),
            ]
        position = finish

    # Whatever follows the final section (or the whole text if none matched).

    regions.append((None, s[position:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form ((type, options), text),
    where 'options' is None if the opening tag has no ":options" part.

    If 's' does not have the form "{type[:options]}text{type}", the result is
    (None, s), which callers can treat like a plain text region.
    """

    match = section_regexp.match(s)
    if match is None:
        return None, s

    return (match.group("sectiontype"), match.group("options")), match.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, blockquotes, tables and lists from the given string 's'.

    Return a list of (type, text) tuples. Text between the recognised elements
    is given the type None. Lists have the type "list" and tables the type
    "table", both with the complete matched markup as their text. Headings and
    blockquotes have their marker as the type ("h1", "bq", ...) and only the
    text following the marker as their text.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        # The "text" group only takes part in heading and blockquote matches.
        # An explicit test for None is needed here: a heading with empty text
        # would otherwise be given its own markup ("h1. ") as its text.

        blocktext = match.group("text")
        if blocktext is None:
            blocktext = s[start:end]

        blocks.append((None, s[last:start]))
        blocks.append((matchtype, blocktext))
        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines. Blocks consisting only of whitespace are
    discarded.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples. Elements recognised by
    get_block_elements keep their type, whereas the text between them is
    divided up using get_basic_blocks, with each piece having the type None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend((None, block) for block in get_basic_blocks(blocktext))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"

# The opening bracket is escaped instead of being written as the set "[[]":
# the patterns are equivalent, but the set form produces a FutureWarning about
# possible nested sets when compiled by more recent versions of Python.

link_regexp_str = r"\[(?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + macro_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Here, 'cellsep' is the first cell separator found in a row ("|" or "||")
    and 'columns' is a list of the raw cell texts found between the row's
    first and last separators. Rows without any separator are omitted.
    """

    rows = []

    # Rows are found by splitting on a bar followed by a newline, with the
    # bar consumed by the split being restored below. A final row lacking the
    # newline still has its own closing bar, so the newline is supplied here
    # to make such a table parse exactly like its newline-terminated form.

    if text.endswith("|"):
        text += "\n"

    for row in text.split("|\n"):
        if not row:
            break

        row += "|"
        cellsep = None
        columns = [""]
        last = 0

        # Inline content is matched together with the separators so that bars
        # inside such content (as in "[label|target]") do not split cells.

        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        # Discard the text before the first separator and after the last.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# Notation conversion.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

preformatted_sectiontypes = (None, "noformat")

macroargs = {
    "color" : "col",
    }

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    """
    A parser for Confluence markup.

    Instances keep track of the current section nesting level ('level'), the
    deepest level reached since nesting began ('max_level'), whether a heading
    is being translated ('in_heading'), and any anchors held back while macros
    are forbidden ('held_anchors').
    """

    def __init__(self):
        self.max_level = self.level = 0
        self.in_heading = False
        self.held_anchors = []

    def translate_marker(self, marker):

        """
        Translate the given 'marker' to a suitable Moin representation: one
        space of indentation for each character in the marker followed by the
        Moin marker corresponding to the final character.
        """

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        extra = cellextra[cellsep]
        return extra + self.parse_text(text).strip() + extra

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        if match.group("monotext"):

            # Entering and immediately leaving a section records the extra
            # nesting level in max_level, which nest_section uses to choose
            # the brackets of any enclosing section.

            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):
            return self._translate_link(match.group("linktext"))

        elif match.group("imagetext"):
            return self._translate_image(match.group("imagetext"))

        elif match.group("macro"):
            return self._translate_macro(match.group("macro"), match.group("options"))

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        # Matches with empty content are treated as plain text.

        else:
            return self.translate_text(match.group())

    def _translate_link(self, linktext):

        "Return a Moin link for the Confluence 'linktext' (brackets excluded)."

        # At most three parts are recognised: label, target and title. Any
        # further bars remain in the title instead of causing an error.

        parts = linktext.split("|", 2)

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.

        if ":" in target:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.
        # NOTE(review): the leading "^" is retained in the target - confirm
        # NOTE(review): that this is what the attachment link should contain.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Make the link tidier by making a target if none was given.

        if not label:
            label = target

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)
        elif not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    def _translate_image(self, imagetext):

        "Return a Moin image reference for the Confluence 'imagetext'."

        parts = imagetext.split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    def _translate_macro(self, macro_name, options):

        """
        Return a Moin macro for the inline macro 'macro_name' with 'options'.
        An empty string is returned for unrecognised macros and where macros
        are forbidden, with forbidden anchors being held for later use.
        """

        if macro_name in macrotypes:
            argname = macroargs.get(macro_name)

            # NOTE(review): only "args" is supplied here, but the "color"
            # NOTE(review): template also refers to "content", so an inline
            # NOTE(review): "{color:...}" reaching this point raises KeyError.

            result = macrotypes[macro_name] % {
                "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + options)
                }
            if not self.forbids_macros():
                return result
            if macro_name == "anchor":
                self.held_anchors.append(result)

        # Every other case produces no output, ensuring that a string is
        # always returned for joining by the caller.

        return ""

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. If
        'preformatted' is true, the preformatted notation mapping is used.
        """

        mapping = preformatted_notation_mapping if preformatted else notation_mapping
        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text, sectiontype=None):

        """
        Return a translation of the given 'text'. If the optional 'sectiontype' is
        specified, the translation may be modified to a form appropriate to the
        section being translated.
        """

        parts = []
        preformatted = sectiontype in preformatted_sectiontypes

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if sectiontype in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        # Macros are forbidden inside headings, with any anchors encountered
        # being held and emitted before the heading instead.

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            text = self.translate_content(blocktext)
            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.translate_content(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):
                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.translate_content(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of strings to be joined by the caller.
        """

        parts = []

        # Enter the section.

        self.enter_section()

        mointype = sectiontypes.get(sectiontype)
        section_content = self.translate_content(text.strip(), sectiontype)

        # Nest the section appropriately. This is done after translating the
        # content so that any nesting found inside it affects the brackets.

        opening, closing = self.nest_section()

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self):

        "Enter a nested section, recording the deepest level reached."

        self.level += 1
        self.max_level = max(self.level, self.max_level)

    def leave_section(self):

        "Leave a section, forgetting the deepest level at the outermost level."

        self.level -= 1
        if not self.level:
            self.max_level = 0

    def nest_section(self):

        """
        Return an (opening, closing) tuple of brackets for the current section,
        using three braces plus one for each level of nesting recorded beneath
        the current level.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s):

        "Parse the content in the string 's', returning the translation."

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for regiontype, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if regiontype is None:
                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = regiontype

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros (which can look like sections).

                elif sectiontype in macrotypes and not self.forbids_macros():
                    argname = macroargs.get(sectiontype)

                    # Sections without options provide None, not a string.

                    parts.append(macrotypes[sectiontype] % {
                        "content" : quote_macro_argument(self.translate_content(text, sectiontype)),
                        "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + (options or ""))
                        })
                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        "Return whether macros are forbidden, as is the case within headings."

        return self.in_heading

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s))

if __name__ == "__main__":

    # The codecs wrappers need byte streams. Where the standard streams are
    # text streams providing a 'buffer' attribute, the underlying byte stream
    # is used; otherwise, the streams are used directly.

    s = codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin)).read()
    out = codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout))
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4