#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

from common import *
import re
import sys
import codecs

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split 's' into a list of regions, each being a two-element tuple whose
    second element is text.

    Text found outside any section (before, between and after the sections)
    is given as (None, text). Each matched section is given as whatever
    get_section_details returns for the matched section text.
    """

    regions = []
    position = 0

    for found in sections_regexp.finditer(s):
        begin, finish = found.span()

        # Emit the text preceding the section, then the section itself.

        regions.append((None, s[position:begin]))
        regions.append(get_section_details(found.group()))
        position = finish

    # Emit whatever follows the final section (possibly an empty string).

    regions.append((None, s[position:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form ((type, options), text),
    where 'options' is None if the opening marker carried no ":options" part.

    If 's' does not match the section pattern, (None, s) is returned instead.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, blockquotes, tables and lists from the given string 's'.

    Return a list of (type, text) tuples. Text between recognised elements is
    given with a type of None. Lists and tables are given with the types
    "list" and "table" together with the complete matched text. Headings and
    blockquotes are given with the matched marker (such as "h1" or "bq") as
    the type together with the text following the marker.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()
        blocks.append((None, s[last:start]))

        if match.group("listtype"):
            blocks.append(("list", s[start:end]))
        elif match.group("celltype"):
            blocks.append(("table", s[start:end]))
        else:
            # Use the captured text even when it is empty: falling back to
            # the whole match would present the marker itself (for example,
            # "h1. ") as the content of an empty heading.
            blocks.append((match.group("type"), match.group("text")))

        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return a list of (type, text) blocks for the string 's'.

    Typed elements reported by get_block_elements are passed through as they
    are. Untyped text is divided up using get_basic_blocks, each resulting
    piece being given as (None, text).
    """

    collected = []

    for elementtype, elementtext in get_block_elements(s):

        # Untyped regions are searched for new subblocks.

        if elementtype is None:
            collected.extend([(None, basic) for basic in get_basic_blocks(elementtext)])

        # Heading, list and table blocks are collected unchanged.

        else:
            collected.append((elementtype, elementtext))

    return collected

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples, one for each list item line found
    in the given list 'text'.
    """

    return [
        (found.group("marker"), found.group("text"))
        for found in listitem_regexp.finditer(text)
        ]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# Each inline pattern becomes one parenthesised alternative, tried in the
# order given here.

content_regexp_str = "|".join(
    "(" + alternative + ")" for alternative in (
        monospace_regexp_str,
        link_regexp_str,
        image_regexp_str,
        italic_regexp_str,
        bold_regexp_str,
        del_regexp_str,
        underline_regexp_str,
        sub_regexp_str,
        )
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Cell separators are only recognised where no inline pattern matches first.

table_content_regexp_str = content_regexp_str + "|" + "(" + cellsep_regexp_str + ")"

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text',
    where 'cellsep' is the first cell separator found in a row and 'columns'
    is a list of the cell texts in that row.

    Rows are obtained by splitting 'text' on "|" followed by a newline, and
    processing stops at the first empty row. Rows without any separator are
    omitted from the result.
    """

    rows = []

    for line in text.split("|\n"):
        if not line:
            break

        # Restore the trailing separator removed by the split.

        line += "|"

        separator = None
        cells = [""]
        position = 0

        for found in table_content_regexp.finditer(line):
            begin, finish = found.span()
            cells[-1] += line[position:begin]

            celltype = found.group("celltype")

            # Inline content stays within the current cell.

            if not celltype:
                cells[-1] += found.group()

            # A separator starts a new cell, the first one seen being
            # remembered as the separator for the row.

            else:
                if separator is None:
                    separator = celltype
                cells.append("")

            position = finish

        cells[-1] += line[position:]

        # Discard the text before the first separator and after the last.

        if separator:
            rows.append((separator, cells[1:-1]))

    return rows

# Notation conversion.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

preformatted_sectiontypes = (None, "noformat")

macrotypes = {
    "anchor" : "<<Anchor(%s)>>",
    "color" : "<<Color(%s)>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):
        self.max_level = self.level = 0

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # The length of the marker gives the indentation; its final character
        # selects the kind of list item.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        """
        Translate the content described by the given 'match', returning a
        string. The 'match' must provide the named groups of the content
        pattern: monotext, linktext, imagetext, italictext, boldtext, deltext,
        underlinetext and subtext.
        """

        if match.group("monotext"):

            # Entering and leaving a section raises max_level when already
            # inside a section, so that nest_section gives enclosing sections
            # more braces than the triple braces emitted here.

            self.enter_section()
            self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):
            parts = match.group("linktext").split("|")

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                # Use only the first three parts so that links containing
                # additional "|" characters do not abort the translation.
                label, target, title = parts[:3]

            target = target.strip()

            # Look for namespace links and rewrite them.

            if ":" in target:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.
            # NOTE(review): the leading "^" is kept in the target; confirm
            # NOTE(review): whether it should be removed for Moin.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

            # Make the link tidier by making a label if none was given.

            if not label:
                label = target

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        # Matches with empty content are treated as plain text.

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. If
        'preformatted' is true, the mapping for preformatted text is applied,
        producing newlines instead of <<BR>> for line breaks.
        """

        mapping = preformatted_notation_mapping if preformatted else notation_mapping

        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text, sectiontype=None):

        """
        Return a translation of the given 'text'. If the optional 'sectiontype' is
        specified, the translation may be modified to a form appropriate to the
        section being translated.
        """

        parts = []
        preformatted = sectiontype in preformatted_sectiontypes

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if sectiontype in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def translate_block(self, blocktype, blocktext):

        """
        Translate the block with the given 'blocktype' and 'blocktext',
        returning the translation as a string.
        """

        parts = []

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            parts.append(blocktypes[blocktype] % blocktext)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.translate_content(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):

                # Separate each row from the preceding one.

                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.translate_content(blocktext))

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of strings to be joined by the caller.
        """

        parts = []

        # Enter the section.

        self.enter_section()

        mointype = sectiontypes.get(sectiontype)
        section_content = self.translate_content(text.strip(), sectiontype)

        # Nest the section appropriately. This is done after translating the
        # content so that any nesting recorded by the content is accounted for.

        opening, closing = self.nest_section()

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self):

        "Increase the current nesting level, recording the deepest level seen."

        self.level += 1
        self.max_level = max(self.level, self.max_level)

    def leave_section(self):

        """
        Decrease the current nesting level, forgetting the deepest level seen
        once the level returns to zero.
        """

        self.level -= 1
        if not self.level:
            self.max_level = 0

    def nest_section(self):

        """
        Return an (opening, closing) tuple of brace strings for the current
        section, each having a length of 3 + max_level - level.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s):

        "Parse the content in the string 's', returning the translation."

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for regiontype, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if regiontype is None:
                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = regiontype

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros (which can look like sections).

                elif sectiontype in macrotypes:
                    parts.append(macrotypes[sectiontype] % self.translate_content(text, sectiontype))
                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s))

if __name__ == "__main__":

    # Where the standard streams are text streams offering an underlying
    # byte stream as 'buffer', wrap that; otherwise, wrap the streams directly.

    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)

    s = codecs.getreader("utf-8")(stdin).read()
    out = codecs.getwriter("utf-8")(stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4