#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.macros import get_macro
from moinformat.parsers.common import ParserBase, get_patterns, \
                                      excl, expect, group, optional, recur, \
                                      repeat
from moinformat.serialisers import serialise
from moinformat.tree.moin import Break, DefItem, DefTerm, FontStyle, Heading, \
                                 Larger, Link, List, ListItem, Macro, \
                                 Monospace, Region, Rule, Smaller, \
                                 Strikethrough, Subscript, Superscript, Table, \
                                 TableAttr, TableAttrs, TableCell, TableRow, \
                                 Text, Underline

join = "".join

class MoinParser(ParserBase):

    "A wiki region parser."

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        # Introduce this class as the default parser for the wiki format.
        # Entries in 'formats' override these defaults.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats, root)

        # Record macro occurrences for later evaluation.

        self.macros = []

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region populated from the text.
        """

        self.items = self.get_items(s)
        self.region = Region([], type="moin")

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Macro evaluation.

    def evaluate_macros(self):

        "Evaluate the macro nodes in the document."

        for node in self.macros:

            # Obtain a class for the named macro, skipping unknown macros.

            macro_cls = get_macro(node.name)
            if not macro_cls:
                continue

            # Instantiate the class and evaluate the macro.

            macro = macro_cls(node, self.region)
            macro.evaluate()



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        "Handle an attribute name within 'attrs'."

        name = self.match_group("name")
        attr = TableAttr(name)

        # Only record a quote and value if the value pattern was found with no
        # intervening text.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.match_group("quote")
            attr.value = self.match_group("value")

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed to
        the created item together with the matched padding.
        """

        pad = self.match_group("pad")
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        "Handle a definition term within 'region'."

        pad = self.match_group("pad")
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)

        # A separator (as opposed to a term ending) introduces an item on the
        # same line.

        if self.matching_pattern() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.match_group("pad")
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        "Handle emphasis and strong styles."

        # The number of quote characters determines the styles involved.

        n = len(self.match_group("style"))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # With the span no longer active, return any unused quotes to the
            # input and leave the span.
            # NOTE(review): StopIteration is presumably handled by the
            # ParserBase region parsing loop - confirm.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        # Doubled markers yield a span that is not populated with content.

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.match_group("value")
        align = "left" if value == "(" else "right" if value == ")" else "center"
        attrs.append(TableAttr("halign", align, True))

    def parse_heading(self, region):

        "Handle a heading."

        start_extra = self.match_group("extra")
        level = len(self.match_group("level"))
        start_pad = self.match_group("pad")
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        "Handle the end of a heading."

        # Only an ending marker of the same level as the start ends the heading.

        level = len(self.match_group("level"))
        if heading.level == level:
            heading.end_pad = self.match_group("pad")
            heading.end_extra = self.match_group("extra")
            raise StopIteration

    def parse_list(self, item):

        "Create a list, starting with 'item'. Return the populated list."

        list_node = List([item], item.indent, item.marker, item.num)
        self.parse_region_details(list_node, self.list_pattern_names, True)
        return list_node

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.match_group("indent"))
        marker = self.match_group("marker")
        num = self.match_group("num")
        space = self.match_group("pad")

        last = region.node(-1)

        new_list = not isinstance(last, (List, ListItem))
        same_indent = not new_list and indent == last.indent
        new_marker = not new_list and last.marker != marker and same_indent
        new_num = not new_list and num is not None and last.num != num and same_indent

        # If the marker or number changes at the same indent, or if the indent
        # is smaller, queue the item and end the list.

        # Note that Moin format does not seek to support item renumbering,
        # instead starting new lists on number changes.

        if not new_list and (new_marker or new_num or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space, num)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item, adding a trailing
        # block for new elements.

        if new_list:
            item = self.parse_list(item)
            self.add_node(region, item)
            self.new_block(region)

        # Add a nested list to the last item.

        elif indent > last.indent:
            item = self.parse_list(item)
            self.add_node(last, item)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.match_group("rule"))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.match_group("indent"))
        level = len(self.match_group("level"))

        section = self.parse_region(level, indent, "inline")

        # If the section is inline, treat it like any other inline element.

        if section.type == "inline":
            region.append_inline(section)

        # Otherwise, add it as a new block element.

        else:
            self.add_node(region, section)
            if region.allow_blocks:
                self.new_block(region)

    def parse_section_end(self, region):

        "Handle the end of a new section within 'region'."

        level = self.match_group("level")
        feature = self.match_group("feature")
        region.extra = self.match_group("extra")

        # A marker not ending this region is retained as plain text.

        if region.have_end(level):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        "Handle the start of table attributes within 'cell'."

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            # Add the attributes as a node, also recording their presence.

            cell.append(attrs)
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        "Handle the start of a table row within 'region'."

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.matching_pattern() == "tableend":
                trailing = self.match_group("extra")

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell) + trailing))

                    self.new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                # NOTE(review): the cell is necessarily empty at this point
                # given the test above, so this appears never to append.

                if not cell.empty():
                    row.append(cell)
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.match_group("value")
        align = "top" if value == "^" else "bottom"
        attrs.append(TableAttr("valign", align, True))



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region, creating a span of the given 'cls' within
        'region' and parsing its content using the inline patterns, with the
        pattern having 'pattern_name' replaced by its ending pattern.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        "Handle monospace text within 'region'."

        # Only the ending pattern is recognised within monospace text.

        span = Monospace([])
        self.parse_region_details(span, ["monospaceend"])
        region.append_inline(span)

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_strike(self, region):

        "Handle strikethrough text within 'region'."

        self.parse_inline(region, Strikethrough, "strike")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Complete inline pattern handlers.

    def parse_link(self, region):

        "Handle a link within 'region'."

        target = self.match_group("target")
        text = self.match_group("text")

        # Without any link text, the (false) text value itself is passed on.

        link = Link(text and [Text(text)], target)
        region.append_inline(link)

    def parse_macro(self, region):

        "Handle a macro within 'region'."

        name = self.match_group("name")
        args = self.match_group("args")

        # Obtain the raw arguments. Moin usually leaves it to the macro to
        # interpret the individual arguments.

        arglist = args.split(",") if args else []
        macro = Macro(name, arglist)
        region.append_inline(macro)

        # Record the macro for later processing.

        self.root.macros.append(macro)



    # Table attribute handlers.

    def parse_table_attr(self, attrs, pattern_name):

        "Handle a table attribute, adding it to 'attrs' using 'pattern_name'."

        attrs.append(TableAttr(pattern_name, self.match_group("value"), True))

    # In the following handlers, 'cell' is passed on to parse_table_attr as the
    # attribute collection.

    def parse_colour(self, cell):

        "Handle a colour attribute."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    # Pattern fragments employing backslashes are written as raw strings so
    # that the backslashes reach the pattern compiler unchanged.

    syntax = {
        # Page regions:

        "regionstart"   : join((group("indent", r"\N*"),                    # ws... (optional)
                                group("level", repeat("[{]", 3)))),         # {{{...

        "regionend"     : join((r"\N*",                                     # ws... (optional)
                                group("feature", join((
                                    group("level", repeat("[}]", 3)),       # }}}...
                                    group("extra", r"\n"),
                                    "?"))))),                               # nl (optional)

        "header"        : join(("#!",                                       # #!
                                group("args", ".*?"), "\n")),               # text-excl-nl

        # Region contents:

        # Line-oriented patterns support features which require their own
        # separate lines.

        "break"         : r"^(\s*?)\n",                                     # blank line

        "defterm"       : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(".+?::"))),                          # text ::

        "defterm_empty" : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(r"::\s+"))),                         # ::
                                                                            # ws... (optional)

        "heading"       : join(("^",
                                group("extra", r"\N*"),                     # ws... (optional)
                                group("level", "=+"),                       # =...
                                group("pad", r"\s+"),                       # ws...
                                expect(join((r".*?\N+",                     # text
                                             recur("level"),                # =...
                                             r"\N*$"))))),                  # ws... (optional)

        "listitem"      : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\*"),                     # list-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "listitem_num"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\d+\."),                  # decimal-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_alpha": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[aA]\."),                 # alpha-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_roman": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[iI]\."),                 # roman-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_dot"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\."),                     # dot-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "tablerow"      : r"^\|\|",                                         # ||

        # Region contents:

        # Inline patterns are for markup features that appear within blocks.
        # The patterns below start inline spans that can contain other markup
        # features.

        "fontstyle"     : group("style", repeat("'", 2, 6)),                # ''...
        "larger"        : r"~\+",                                           # ~+
        "monospace"     : r"`",                                             # `
        "rule"          : group("rule", "-----*"),                          # ----...
        "smaller"       : r"~-",                                            # ~-
        "strike"        : r"--\(",                                          # --(
        "sub"           : r",,",                                            # ,,
        "super"         : r"\^",                                            # ^
        "underline"     : r"__",                                            # __

        # Complete inline patterns are for markup features that do not support
        # arbitrary content within them:

        "link"          : join((r"\[\[",                                    # [[
                                group("target", ".*?"),                     # target
                                optional(join((r"\|", group("text", ".*?")))), # | text (optional)
                                "]]")),                                     # ]]

        "macro"         : join(("<<",                                       # <<
                                group("name", r"\w+?"),                     # digit-letter...
                                optional(join((r"\(",                       # ( (optional)
                                               group("args", ".*?"),        # not-)...
                                               r"\)"))),                    # ) (optional)
                                ">>")),                                     # >>

        # Ending patterns for inline features:

        "largerend"     : r"\+~",                                           # +~
        "monospaceend"  : r"`",                                             # `
        "smallerend"    : r"-~",                                            # -~
        "strikeend"     : r"\)--",                                          # )--
        "subend"        : r",,",                                            # ,,
        "superend"      : r"\^",                                            # ^
        "underlineend"  : r"__",                                            # __

        # Heading contents:

        "headingend"    : join((group("pad", r"\N+"),                       # ws...
                                group("level", "=+"),                       # =...
                                group("extra", r"\N*\n"))),                 # ws (optional) nl

        # List contents:

        "deftermend"    : join(("::", group("pad", r"\s*?\n"))),            # ::
                                                                            # ws... (optional)
                                                                            # nl

        "deftermsep"    : join(("::", group("pad", r"\s+"))),               # ::
                                                                            # ws... (optional)

        "listitemend"   : r"^",                                             # next line

        # Table contents:

        "tableattrs"    : r"<",                                             # <
        "tablecell"     : r"\|\|",                                          # ||

        "tableend"      : join((group("extra", r"\s*?"),                    # ws... (optional)
                                "^")),                                      # next line

        # Table attributes:

        "tableattrsend" : r">",                                             # >
        "halign"        : group("value", "[(:)]"),                          # halign-marker
        "valign"        : group("value", "[v^]"),                           # valign-marker
        "colour"        : group("value", join((r"\#",                       # #
                                               repeat("[0-9A-F]", 6, 6)))), # nnnnnn

        "colspan"       : join(("-",                                        # -
                                group("value", r"\d+"))),                   # n...

        "rowspan"       : join((r"\|",                                      # |
                                group("value", r"\d+"))),                   # n...

        "width"         : group("value", r"\d+%"),                          # n... %

        "attrname"      : join((excl(r"[-\d]"),                             # not-dash-or-digit
                                group("name", r"[-\w]+"))),                 # dash-digit-letter...

        "attrvalue"     : join(("=", group("quote", r"\Q"),                 # quote
                                group("value", ".*?"),                      # non-quote... (optional)
                                recur("quote"))),                           # quote
        }

    patterns = get_patterns(syntax)



    # Patterns available within certain markup features.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "fontstyle", "larger", "link", "macro", "monospace", "regionstart",
        "smaller", "strike", "sub", "super", "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    region_without_table_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionend", "rule",
        ]

    region_pattern_names = region_without_table_pattern_names + ["tablerow"]

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names with the given 'name' replaced
        by the name of its ending pattern.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Pattern handlers.

    end_region = ParserBase.end_region

    # Mapping from pattern names to the handlers defined above.

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "link"              : parse_link,
        "macro"             : parse_macro,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "strike"            : parse_strike,
        "strikeend"         : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4