#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# Document transformations.

from moinformat.macros import get_macro

# Parser functionality and pattern definition.

from moinformat.parsers.common import ParserBase, get_patterns, choice, \
                                      excl, expect, group, optional, recur, \
                                      repeat

# Serialisation.

from moinformat.serialisers import serialise

# Document tree nodes.

from moinformat.tree.moin import Anchor, Break, DefItem, DefTerm, FontStyle, \
                                 Heading, Larger, LineBreak, Link, List, \
                                 ListItem, Macro, Monospace, Region, Rule, \
                                 Smaller, Strikethrough, Subscript, \
                                 Superscript, Table, TableAttr, TableAttrs, \
                                 TableCell, TableRow, Text, Underline

join = "".join

class MoinParser(ParserBase):

    "A wiki region parser."

    format = "moin"

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        # Introduce this class as the default parser for the wiki format.
        # Entries in 'formats' override these defaults.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats, root)

        # Record certain node occurrences for later evaluation.

        self.macros = []

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region representing the whole page.
        """

        self.items = self.get_items(s)
        self.region = Region([], type="moin")

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Macro evaluation.

    def evaluate_macros(self):

        """
        Evaluate the macro nodes in the document. Nodes whose names yield no
        macro class from get_macro are skipped.
        """

        for node in self.macros:

            # Obtain a class for the named macro.

            macro_cls = get_macro(node.name)
            if not macro_cls:
                continue

            # Instantiate the class and evaluate the macro.

            macro = macro_cls(node, self.region)
            macro.evaluate()



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        """
        Handle an attribute name within 'attrs', appending a named attribute
        and recording any quote and value that immediately follow the name.
        """

        name = self.match_group("name")
        attr = TableAttr(name)

        # Only accept a value where nothing separates it from the name.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.match_group("quote")
            attr.value = self.match_group("value")

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed
        on to the item node.
        """

        pad = self.match_group("pad")
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        "Handle a definition term within 'region'."

        pad = self.match_group("pad")
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)

        # A separator introduces the item belonging to the term.

        if self.matching_pattern() == "deftermsep":
            self.parse_defitem(region)

        # Add padding from the separator to the term, there being no item.

        else:
            term.extra = self.match_group("pad")

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.match_group("pad")
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles within 'region', interpreting a run
        of two to six quote characters as closing any active styles and/or
        opening new ones.

        StopIteration is raised when the enclosing font style region has no
        remaining active style.
        """

        n = len(self.match_group("style"))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Each closed style consumes its share of the quote characters.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # Leave the region, returning any unconsumed quotes to the input.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        # Doubled markers (four or six quotes) produce a span without content.

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.match_group("value")

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attrs.append(TableAttr("halign", alignment, True))

    def parse_heading(self, region):

        "Handle a heading within 'region'."

        start_extra = self.match_group("extra")
        level = len(self.match_group("level"))
        start_pad = self.match_group("pad")
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading, raising StopIteration only if the ending
        marker has the same level as the 'heading'.
        """

        level = len(self.match_group("level"))
        if heading.level == level:
            heading.end_pad = self.match_group("pad")
            heading.end_extra = self.match_group("extra")
            raise StopIteration

    def parse_list(self, item):

        "Create a list, starting with 'item', and return it."

        # Avoid shadowing the list built-in with the local name.

        new_list = List([item], item.indent, item.marker, item.num)
        self.parse_region_details(new_list, self.list_pattern_names, True)
        return new_list

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.match_group("indent"))
        marker = self.match_group("marker")
        num = self.match_group("num")
        space = self.match_group("pad")

        last = region.node(-1)

        new_list = not isinstance(last, (List, ListItem))
        same_indent = not new_list and indent == last.indent
        new_marker = not new_list and last.marker != marker and same_indent
        new_num = not new_list and num is not None and last.num != num and same_indent

        # If the marker or number changes at the same indent, or if the indent
        # is smaller, queue the item and end the list.

        # Note that Moin format does not seek to support item renumbering,
        # instead starting new lists on number changes.

        if not new_list and (new_marker or new_num or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space, num)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item, adding a trailing
        # block for new elements.

        if new_list:
            item = self.parse_list(item)
            self.add_node(region, item)
            self.new_block(region)

        # Add a nested list to the last item.

        elif indent > last.indent:
            item = self.parse_list(item)
            self.add_node(last, item)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.match_group("rule"))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.match_group("indent"))
        level = len(self.match_group("level"))

        section = self.parse_region(level, indent, "inline")

        # If the section is inline, treat it like any other inline element.

        if section.type == "inline":
            region.append_inline(section)

        # Otherwise, add it as a new block element.

        else:
            self.add_node(region, section)
            if region.allow_blocks:
                self.new_block(region)

    def parse_table_attrs(self, cell):

        """
        Handle the start of table attributes within 'cell'. Valid attributes
        are added to the cell as a node and recorded as its 'attrs'; invalid
        ones are added to the cell as serialised text.
        """

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_attr_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            # Add the attributes as a node, also recording their presence.

            cell.append(attrs)
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        """
        Handle the start of a table row within 'region', adding the row to
        any active table or to a new table, or adding the row as text where
        it is not properly terminated.
        """

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_row_pattern_names)

            # Handle the end of the row.

            if self.matching_pattern() == "tableend":
                trailing = self.match_group("extra")

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell) + trailing))

                    self.new_block(region)
                    return

                # The row has cells and the final cell is empty, so there is
                # no final cell to append: only the trailing text is kept.

                row.trailing = trailing
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.match_group("value")
        alignment = "top" if value == "^" else "bottom"
        attrs.append(TableAttr("valign", alignment, True))



    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names with the pattern of the
        given 'name' replaced by its corresponding ending pattern name.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region, adding an instance of 'cls' to 'region' and
        populating it using the inline patterns appropriate for the given
        'pattern_name'.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        """
        Handle monospace text within 'region'. Only the ending pattern is
        recognised within such text.
        """

        span = Monospace([])
        self.parse_region_details(span, ["monospaceend"])
        region.append_inline(span)

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_strike(self, region):

        "Handle struck-through text within 'region'."

        self.parse_inline(region, Strikethrough, "strike")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Complete inline pattern handlers.

    def parse_anchor(self, region):

        "Handle an anchor within 'region'."

        target = self.match_group("target")
        anchor = Anchor(target)
        region.append_inline(anchor)

    def parse_linebreak(self, region):

        "Handle a line break within 'region'."

        region.append_inline(LineBreak())

    def parse_link(self, region):

        """
        Handle a link within 'region'. Any link text becomes the sole child
        node of the link.
        """

        target = self.match_group("target")
        text = self.match_group("text")
        nodes = [Text(text)] if text else []
        region.append_inline(Link(nodes, target))

    def parse_macro(self, region):

        """
        Handle a macro within 'region', recording it with the root parser for
        later evaluation.
        """

        name = self.match_group("name")
        args = self.match_group("args")

        # Obtain the raw arguments. Moin usually leaves it to the macro to
        # interpret the individual arguments.

        arglist = args.split(",") if args else []
        macro = Macro(name, arglist, region.append_point())
        region.append_inline(macro)

        # Record the macro for later processing.

        self.root.macros.append(macro)



    # Table attribute handlers.

    # NOTE(review): the handlers below name their parameter 'cell' but pass it
    # on as the 'attrs' collection. Their patterns only appear in
    # table_attr_pattern_names, so the argument is presumably the attribute
    # collection being populated. Confirm against the handler invocation in
    # ParserBase.

    def parse_table_attr(self, attrs, pattern_name):

        """
        Handle a table attribute, appending an attribute to 'attrs' named
        after 'pattern_name' and having the matched value.
        """

        attrs.append(TableAttr(pattern_name, self.match_group("value"), True))

    def parse_colour(self, cell):

        "Handle a colour attribute."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    # NOTE(review): \N and \Q are not standard regular expression escapes and
    # are presumably expanded by get_patterns. The comments below suggest that
    # \N means non-newline whitespace and \Q a quote character. Confirm in
    # moinformat.parsers.common.

    # All fragments containing backslashes are raw strings so that no
    # backslash is subject to string literal escape processing.

    syntax = {
        # Page regions:

        "regionstart"   : join((group("indent", r"\N*"),                    # ws... (optional)
                                group("level", repeat("[{]", 3)))),         # {{{...

        "regionend"     : join((r"\N*",                                     # ws... (optional)
                                group("feature", join((
                                    group("level", repeat("[}]", 3)),       # }}}...
                                    group("extra", r"\n"),
                                    "?"))))),                               # nl (optional)

        "header"        : join(("#!",                                       # #!
                                group("args", ".*?"), "\n")),               # text-excl-nl

        # Region contents:

        # Line-oriented patterns support features which require their own
        # separate lines.

        "break"         : r"^(\s*?)\n",                                     # blank line

        "defterm"       : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(".+?::"))),                          # text ::

        "defterm_empty" : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(r"::\s+"))),                         # :: ws...

        "heading"       : join(("^",
                                group("extra", r"\N*"),                     # ws... (optional)
                                group("level", "=+"),                       # =...
                                group("pad", r"\s+"),                       # ws...
                                expect(join((r".*?\N+",                     # text
                                             recur("level"),                # =...
                                             r"\N*$"))))),                  # ws... (optional)

        "listitem"      : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\*"),                     # list-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "listitem_num"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\d+\."),                  # decimal-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_alpha": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[aA]\."),                 # alpha-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_roman": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[iI]\."),                 # roman-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_dot"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\."),                     # dot-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "tablerow"      : r"^\|\|",                                         # ||

        # Region contents:

        # Inline patterns are for markup features that appear within blocks.
        # The patterns below start inline spans that can contain other markup
        # features.

        "fontstyle"     : group("style", repeat("'", 2, 6)),                # ''...
        "larger"        : r"~\+",                                           # ~+
        "monospace"     : r"`",                                             # `
        "rule"          : group("rule", "-----*"),                          # ----...
        "smaller"       : r"~-",                                            # ~-
        "strike"        : r"--\(",                                          # --(
        "sub"           : r",,",                                            # ,,
        "super"         : r"\^",                                            # ^
        "underline"     : r"__",                                            # __

        # Complete inline patterns are for markup features that do not support
        # arbitrary content within them:

        "anchor"        : join((r"\(\(",                                    # ((
                                group("target", ".*?"),                     # target
                                r"\)\)")),                                  # ))

        "linebreak"     : r"\\\\",                                          # \\

        "link"          : join((r"\[\[",                                    # [[
                                group("target", ".*?"),                     # target
                                optional(join((r"\|", group("text", ".*?")))), # | text (optional)
                                "]]")),                                     # ]]

        "macro"         : join(("<<",                                       # <<
                                group("name", r"\w+?"),                     # digit-letter...
                                optional(join((r"\(",                       # ( (optional)
                                               group("args", ".*?"),        # not-)...
                                               r"\)"))),                    # ) (optional)
                                ">>")),                                     # >>

        # Ending patterns for inline features:

        "largerend"     : r"\+~",                                           # +~
        "monospaceend"  : r"`",                                             # `
        "smallerend"    : r"-~",                                            # -~
        "strikeend"     : r"\)--",                                          # )--
        "subend"        : r",,",                                            # ,,
        "superend"      : r"\^",                                            # ^
        "underlineend"  : r"__",                                            # __

        # Heading contents:

        "headingend"    : join((group("pad", r"\N+"),                       # ws...
                                group("level", "=+"),                       # =...
                                group("extra", r"\N*\n"))),                 # ws (optional) nl

        # List contents:

        "deftermend"    : join(("::", group("pad", r"\s*?\n"))),            # ::
                                                                            # ws... (optional)
                                                                            # nl

        "deftermsep"    : join(("::", group("pad", r"\s+"))),               # ::
                                                                            # ws...

        # The decimal marker test accepts one or more digits, in agreement
        # with the listitem_num marker, so that items numbered 10 and above
        # also terminate the preceding item.

        "listitemend"   : join((r"^",                                       # next line
                                choice((excl(r"\N"),                        # without indent
                                        expect(r"\N+\*"),                   # or with ws... list-marker
                                        expect(r"\N+\d+\."),                # or with ws... decimal-marker
                                        expect(r"\N+[aA]\."),               # or with ws... alpha-marker
                                        expect(r"\N+[iI]\."),               # or with ws... roman-marker
                                        expect(r"\N+\."),                   # or with ws... dot-marker
                                        expect(r"\N+.+?::\s"),              # or with ws... text :: ws (next defterm)
                                        expect(r"\N+::\s"))))),             # or with ws... :: ws (next defitem)

        # Table contents:

        "tableattrs"    : join(("<",                                        # lt
                                excl("<"))),                                # not-lt

        "tablecell"     : r"\|\|",                                          # ||

        "tableend"      : join((group("extra", r"\s*?"),                    # ws... (optional)
                                "^")),                                      # next line

        # Table attributes:

        "tableattrsend" : r">",                                             # >
        "halign"        : group("value", "[(:)]"),                          # halign-marker
        "valign"        : group("value", "[v^]"),                           # valign-marker
        "colour"        : group("value", join((r"\#",                       # #
                                               repeat("[0-9A-F]", 6, 6)))), # nnnnnn

        "colspan"       : join(("-",                                        # -
                                group("value", r"\d+"))),                   # n...

        "rowspan"       : join((r"\|",                                      # |
                                group("value", r"\d+"))),                   # n...

        "width"         : group("value", r"\d+%"),                          # n... %

        "attrname"      : join((excl(r"[-\d]"),                             # not-dash-or-digit
                                group("name", r"[-\w]+"))),                 # dash-digit-letter...

        "attrvalue"     : join(("=", group("quote", r"\Q"),                 # quote
                                group("value", ".*?"),                      # non-quote... (optional)
                                recur("quote"))),                           # quote
        }

    patterns = get_patterns(syntax)



    # Patterns available within certain markup features.

    table_attr_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "anchor", "fontstyle", "larger", "linebreak", "link", "macro",
        "monospace", "regionstart", "smaller", "strike", "sub", "super",
        "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    region_without_table_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionend", "rule",
        ]

    table_row_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    # The region pattern names are specifically used by the common parser
    # functionality.

    region_pattern_names = region_without_table_pattern_names + ["tablerow"]



    # Pattern handlers.

    # The handlers are referenced as plain functions within the class body, so
    # the inherited methods are given local names here.

    end_region = ParserBase.end_region
    parse_section_end = ParserBase.parse_region_end

    # Mapping from pattern names to handlers. The None key selects the handler
    # used where there is no pattern name.
    # NOTE(review): None presumably indicates the end of input - confirm in
    # ParserBase.

    handlers = {
        None                : end_region,
        "anchor"            : parse_anchor,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "linebreak"         : parse_linebreak,
        "link"              : parse_link,
        "macro"             : parse_macro,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "strike"            : parse_strike,
        "strikeend"         : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4