#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# Document transformations.

from moinformat.links import get_linker
from moinformat.macros import get_macro

# Parser functionality and pattern definition.

from moinformat.parsers.common import ParserBase, get_patterns, \
                                      excl, expect, group, optional, recur, \
                                      repeat

# Serialisation.

from moinformat.serialisers import serialise

# Document tree nodes.

from moinformat.tree.moin import Break, DefItem, DefTerm, FontStyle, Heading, \
                                 Larger, Link, List, ListItem, Macro, \
                                 Monospace, Region, Rule, Smaller, \
                                 Strikethrough, Subscript, Superscript, Table, \
                                 TableAttr, TableAttrs, TableCell, TableRow, \
                                 Text, Underline

join = "".join

class MoinParser(ParserBase):

    "A wiki region parser."

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        # Introduce this class as the default parser for the wiki format.
        # Entries in 'formats' override these defaults.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats, root)

        # Record certain node occurrences for later evaluation.

        self.macros = []
        self.links = []

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region, which is also retained as 'self.region'.
        """

        self.items = self.get_items(s)
        self.region = Region([], type="moin")

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region

    # Macro evaluation.

    def evaluate_macros(self):

        "Evaluate the macro nodes in the document."

        for node in self.macros:

            # Obtain a class for the named macro, skipping unknown macros.

            macro_cls = get_macro(node.name)
            if not macro_cls:
                continue

            # Instantiate the class and evaluate the macro.

            macro = macro_cls(node, self.region)
            macro.evaluate()

    # Link translation.

    def translate_links(self, scheme, pagename):

        """
        Translate the link nodes in the document for the given 'scheme' and
        employing the given document 'pagename'. Nothing is done if no linker
        is available for 'scheme'.
        """

        # Obtain a class for the named linker.

        linker_cls = get_linker(scheme)
        if not linker_cls:
            return

        # Instantiate the class with document metadata.

        linker = linker_cls(pagename)

        # Translate each recorded link in place.

        for node in self.links:
            node.target = linker.translate(node.target)

    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        "Handle an attribute name within 'attrs'."

        name = self.match_group("name")
        attr = TableAttr(name)

        # A value is only recorded when it follows the name immediately, with
        # no intervening text before the value pattern.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.match_group("quote")
            attr.value = self.match_group("value")

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed
        to the item together with the padding of the current match.
        """

        pad = self.match_group("pad")
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        "Handle a definition term within 'region'."

        pad = self.match_group("pad")
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)

        # A separator (rather than an end-of-line term ending) means that the
        # definition item follows on the same line.

        if self.matching_pattern() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.match_group("pad")
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles, as indicated by a run of between two
        and six quote characters, within 'region'.
        """

        n = len(self.match_group("style"))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Consume two quotes for emphasis and three for strong.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # Where the span is no longer active, return any unused quotes to
            # the input and finish the span.
            # NOTE(review): StopIteration is presumably handled by ParserBase
            # as the end-of-region signal - confirm.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        # Runs of four or six quotes produce a span that is not populated.

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.match_group("value")

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attrs.append(TableAttr("halign", alignment, True))

    def parse_heading(self, region):

        "Handle a heading within 'region'."

        start_extra = self.match_group("extra")
        level = len(self.match_group("level"))
        start_pad = self.match_group("pad")
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading. Only a marker of the same level as the
        start of 'heading' ends it; other markers are ignored here.
        """

        level = len(self.match_group("level"))
        if heading.level == level:
            heading.end_pad = self.match_group("pad")
            heading.end_extra = self.match_group("extra")
            raise StopIteration

    def parse_list(self, item):

        "Create a list, starting with 'item', and return it."

        # Avoid shadowing the built-in 'list' type.

        new_list = List([item], item.indent, item.marker, item.num)
        self.parse_region_details(new_list, self.list_pattern_names, True)
        return new_list

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.match_group("indent"))
        marker = self.match_group("marker")
        num = self.match_group("num")
        space = self.match_group("pad")

        last = region.node(-1)

        new_list = not isinstance(last, (List, ListItem))
        same_indent = not new_list and indent == last.indent
        new_marker = not new_list and last.marker != marker and same_indent
        new_num = not new_list and num is not None and last.num != num and same_indent

        # If the marker or number changes at the same indent, or if the indent
        # is smaller, queue the item and end the list.

        # Note that Moin format does not seek to support item renumbering,
        # instead starting new lists on number changes.

        if not new_list and (new_marker or new_num or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space, num)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item, adding a trailing
        # block for new elements.

        if new_list:
            item = self.parse_list(item)
            self.add_node(region, item)
            self.new_block(region)

        # Add a nested list to the last item.

        elif indent > last.indent:
            item = self.parse_list(item)
            self.add_node(last, item)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.match_group("rule"))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.match_group("indent"))
        level = len(self.match_group("level"))

        section = self.parse_region(level, indent, "inline")

        # If the section is inline, treat it like any other inline element.

        if section.type == "inline":
            region.append_inline(section)

        # Otherwise, add it as a new block element.

        else:
            self.add_node(region, section)
            if region.allow_blocks:
                self.new_block(region)

    def parse_section_end(self, region):

        "Handle the end of a new section within 'region'."

        level = self.match_group("level")
        feature = self.match_group("feature")
        region.extra = self.match_group("extra")

        # A marker that does not end this region is retained as plain text.

        if region.have_end(level):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        """
        Handle the start of table attributes within 'cell'. Valid attributes
        are added to 'cell' as a node and recorded as 'cell.attrs'; invalid
        attributes are added to 'cell' as serialised text instead.
        """

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            # Add the attributes as a node, also recording their presence.

            cell.append(attrs)
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        "Handle the start of a table row within 'region'."

        # Identify any active table: the node before the current block must be
        # a table and the current block must be empty.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.matching_pattern() == "tableend":
                trailing = self.match_group("extra")

                # If the cell was started but not finished, convert the row
                # into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell) + trailing))

                    self.new_block(region)
                    return

                # Otherwise, record the trailing text on the row.

                else:
                    row.trailing = trailing

                # Append the final cell, if not empty.
                # NOTE(review): the cell appears to be always empty at this
                # point given the test above - confirm before removing.

                if not cell.empty():
                    row.append(cell)
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.match_group("value")
        alignment = "top" if value == "^" else "bottom"
        attrs.append(TableAttr("valign", alignment, True))

    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region within 'region', creating a span of the given
        'cls' that is ended by the ending pattern for 'pattern_name'.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        """
        Handle monospaced text within 'region'. Only the ending marker is
        recognised within the span: other inline patterns are not applied.
        """

        span = Monospace([])
        self.parse_region_details(span, ["monospaceend"])
        region.append_inline(span)

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_strike(self, region):

        "Handle struck-through text within 'region'."

        self.parse_inline(region, Strikethrough, "strike")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")

    # Complete inline pattern handlers.

    def parse_link(self, region):

        "Handle a link within 'region'."

        target = self.match_group("target")
        text = self.match_group("text")

        # Without any link text, the absent or empty text value is itself
        # passed as the link content.

        link = Link(text and [Text(text)], target)
        region.append_inline(link)

        # Record the link for later processing.

        self.root.links.append(link)

    def parse_macro(self, region):

        "Handle a macro within 'region'."

        name = self.match_group("name")
        args = self.match_group("args")

        # Obtain the raw arguments. Moin usually leaves it to the macro to
        # interpret the individual arguments.

        arglist = args.split(",") if args else []
        macro = Macro(name, arglist)
        region.append_inline(macro)

        # Record the macro for later processing.

        self.root.macros.append(macro)

    # Table attribute handlers.

    # NOTE(review): the 'cell' parameter of the handlers below is presumably
    # the attribute collection being populated by parse_table_attrs, not a
    # table cell. The name is retained for compatibility - confirm.

    def parse_table_attr(self, attrs, pattern_name):

        """
        Handle a table attribute, adding to 'attrs' an attribute named after
        'pattern_name' having the matched value.
        """

        attrs.append(TableAttr(pattern_name, self.match_group("value"), True))

    def parse_colour(self, cell):

        "Handle a colour attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "width")

    # Regular expressions.

    # All fragments containing backslashes are raw strings so that the
    # backslashes reach the pattern compiler without relying on Python
    # preserving unrecognised escape sequences.

    syntax = {
        # Page regions:

        "regionstart"   : join((group("indent", r"\N*"),                    # ws... (optional)
                                group("level", repeat("[{]", 3)))),         # {{{...

        "regionend"     : join((r"\N*",                                     # ws... (optional)
                                group("feature", join((
                                    group("level", repeat("[}]", 3)),       # }}}...
                                    group("extra", r"\n"),
                                    "?"))))),                               # nl (optional)

        "header"        : join(("#!",                                       # #!
                                group("args", ".*?"), "\n")),               # text-excl-nl

        # Region contents:

        # Line-oriented patterns support features which require their own
        # separate lines.

        "break"         : r"^(\s*?)\n",                                     # blank line

        "defterm"       : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(".+?::"))),                          # text ::

        "defterm_empty" : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(r"::\s+"))),                         # ::
                                                                            # ws...

        "heading"       : join(("^",
                                group("extra", r"\N*"),                     # ws... (optional)
                                group("level", "=+"),                       # =...
                                group("pad", r"\s+"),                       # ws...
                                expect(join((r".*?\N+",                     # text
                                             recur("level"),                # =...
                                             r"\N*$"))))),                  # ws... (optional)

        "listitem"      : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\*"),                     # list-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "listitem_num"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\d+\."),                  # decimal-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_alpha": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[aA]\."),                 # alpha-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_roman": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[iI]\."),                 # roman-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_dot"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\."),                     # dot-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "tablerow"      : r"^\|\|",                                         # ||

        # Region contents:

        # Inline patterns are for markup features that appear within blocks.
        # The patterns below start inline spans that can contain other markup
        # features.

        "fontstyle"     : group("style", repeat("'", 2, 6)),                # ''...
        "larger"        : r"~\+",                                           # ~+
        "monospace"     : r"`",                                             # `
        "rule"          : group("rule", "-----*"),                          # ----...
        "smaller"       : r"~-",                                            # ~-
        "strike"        : r"--\(",                                          # --(
        "sub"           : r",,",                                            # ,,
        "super"         : r"\^",                                            # ^
        "underline"     : r"__",                                            # __

        # Complete inline patterns are for markup features that do not support
        # arbitrary content within them:

        "link"          : join((r"\[\[",                                    # [[
                                group("target", ".*?"),                     # target
                                optional(join((r"\|", group("text", ".*?")))), # | text (optional)
                                "]]")),                                     # ]]

        "macro"         : join(("<<",                                       # <<
                                group("name", r"\w+?"),                     # digit-letter...
                                optional(join((r"\(",                       # ( (optional)
                                               group("args", ".*?"),        # not-)...
                                               r"\)"))),                    # ) (optional)
                                ">>")),                                     # >>

        # Ending patterns for inline features:

        "largerend"     : r"\+~",                                           # +~
        "monospaceend"  : r"`",                                             # `
        "smallerend"    : r"-~",                                            # -~
        "strikeend"     : r"\)--",                                          # )--
        "subend"        : r",,",                                            # ,,
        "superend"      : r"\^",                                            # ^
        "underlineend"  : r"__",                                            # __

        # Heading contents:

        "headingend"    : join((group("pad", r"\N+"),                       # ws...
                                group("level", "=+"),                       # =...
                                group("extra", r"\N*\n"))),                 # ws (optional) nl

        # List contents:

        "deftermend"    : join(("::", group("pad", r"\s*?\n"))),            # ::
                                                                            # ws... (optional)
                                                                            # nl

        "deftermsep"    : join(("::", group("pad", r"\s+"))),               # ::
                                                                            # ws...

        "listitemend"   : r"^",                                             # next line

        # Table contents:

        "tableattrs"    : r"<",                                             # <
        "tablecell"     : r"\|\|",                                          # ||

        "tableend"      : join((group("extra", r"\s*?"),                    # ws... (optional)
                                "^")),                                      # next line

        # Table attributes:

        "tableattrsend" : r">",                                             # >
        "halign"        : group("value", "[(:)]"),                          # halign-marker
        "valign"        : group("value", "[v^]"),                           # valign-marker
        "colour"        : group("value", join((r"\#",                       # #
                                               repeat("[0-9A-F]", 6, 6)))), # nnnnnn

        "colspan"       : join(("-",                                        # -
                                group("value", r"\d+"))),                   # n...

        "rowspan"       : join((r"\|",                                      # |
                                group("value", r"\d+"))),                   # n...

        "width"         : group("value", r"\d+%"),                          # n... %

        "attrname"      : join((excl(r"[-\d]"),                             # not-dash-or-digit
                                group("name", r"[-\w]+"))),                 # dash-digit-letter...

        "attrvalue"     : join(("=", group("quote", r"\Q"),                 # quote
                                group("value", ".*?"),                      # non-quote... (optional)
                                recur("quote"))),                           # quote
        }

    patterns = get_patterns(syntax)

    # Patterns available within certain markup features.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "fontstyle", "larger", "link", "macro", "monospace", "regionstart",
        "smaller", "strike", "sub", "super", "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    region_without_table_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionend", "rule",
        ]

    region_pattern_names = region_without_table_pattern_names + ["tablerow"]

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names in which the starting pattern
        'name' is replaced by its ending pattern, this having the same name
        with an "end" suffix.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names

    # Pattern handlers.

    # The handlers are referenced as plain functions from the class body, each
    # being keyed by the name of the pattern it handles. The None entry is
    # presumably used when no pattern matches - confirm against ParserBase.

    end_region = ParserBase.end_region

    handlers = {
        None : end_region,
        "attrname" : parse_attrname,
        "break" : parse_break,
        "colour" : parse_colour,
        "colspan" : parse_colspan,
        "defterm" : parse_defterm,
        "defterm_empty" : parse_defterm_empty,
        "deftermend" : end_region,
        "deftermsep" : end_region,
        "fontstyle" : parse_fontstyle,
        "halign" : parse_halign,
        "heading" : parse_heading,
        "headingend" : parse_heading_end,
        "larger" : parse_larger,
        "largerend" : end_region,
        "link" : parse_link,
        "macro" : parse_macro,
        "listitemend" : end_region,
        "listitem" : parse_listitem,
        "listitem_alpha" : parse_listitem,
        "listitem_dot" : parse_listitem,
        "listitem_num" : parse_listitem,
        "listitem_roman" : parse_listitem,
        "monospace" : parse_monospace,
        "monospaceend" : end_region,
        "regionstart" : parse_section,
        "regionend" : parse_section_end,
        "rowspan" : parse_rowspan,
        "rule" : parse_rule,
        "smaller" : parse_smaller,
        "smallerend" : end_region,
        "strike" : parse_strike,
        "strikeend" : end_region,
        "sub" : parse_sub,
        "subend" : end_region,
        "super" : parse_super,
        "superend" : end_region,
        "tableattrs" : parse_table_attrs,
        "tableattrsend" : end_region,
        "tablerow" : parse_table_row,
        "tablecell" : end_region,
        "tableend" : end_region,
        "underline" : parse_underline,
        "underlineend" : end_region,
        "valign" : parse_valign,
        "width" : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4