#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsers.common import ParserBase, get_patterns, \
                                      excl, expect, group, optional, recur, \
                                      repeat
from moinformat.serialisers import serialise
from moinformat.tree.moin import Break, DefItem, DefTerm, FontStyle, Heading, \
                                 Larger, Link, List, ListItem, Macro, \
                                 Monospace, Region, Rule, Smaller, \
                                 Strikethrough, Subscript, Superscript, Table, \
                                 TableAttr, TableAttrs, TableCell, TableRow, \
                                 Text, Underline

join = "".join

class MoinParser(ParserBase):

    """
    A wiki region parser.

    The class combines three things: handler methods (one per markup
    feature), the 'syntax' table of regular expression fragments compiled by
    get_patterns, and the 'handlers' table mapping each pattern name to the
    method invoked when that pattern matches.
    """

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        # Introduce this class as the default parser for the wiki format.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats, root)

        # Record macro occurrences for later evaluation.

        self.macros = []

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region populated from the text.
        """

        self.items = self.get_items(s)
        self.region = Region([], type="moin")

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        "Handle an attribute name within 'attrs'."

        name = self.match_group("name")
        attr = TableAttr(name)

        # Only accept a value where it immediately follows the name, this
        # being indicated by no text preceding the value pattern.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.match_group("quote")
            attr.value = self.match_group("value")

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed
        on to the created item together with the matched padding.
        """

        pad = self.match_group("pad")
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        "Handle a definition term within 'region'."

        pad = self.match_group("pad")
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)

        # A separator (as opposed to a line-ending terminator) introduces the
        # definition item on the same line.

        if self.matching_pattern() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.match_group("pad")
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles within 'region'.

        The number of quote characters matched determines the styles involved:
        two for emphasis, three for strong, five for both, with four and six
        being treated as "doubled" markers producing an empty span.
        """

        n = len(self.match_group("style"))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Close whichever active styles the marker can terminate,
            # consuming the corresponding number of quote characters.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # With the span no longer active, return any unused quote
            # characters to the input and leave the span.
            # NOTE(review): StopIteration is presumably handled by the
            # region parsing loop in ParserBase - confirm.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.match_group("value")

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attrs.append(TableAttr("halign", alignment, True))

    def parse_heading(self, region):

        "Handle a heading within 'region'."

        start_extra = self.match_group("extra")
        level = len(self.match_group("level"))
        start_pad = self.match_group("pad")
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading. Only an end marker of the same level as
        the 'heading' start marker terminates the heading.
        """

        level = len(self.match_group("level"))
        if heading.level == level:
            heading.end_pad = self.match_group("pad")
            heading.end_extra = self.match_group("extra")
            raise StopIteration

    def parse_list(self, item):

        "Create a list, starting with 'item', returning the list."

        # Avoid the name 'list' here so as not to shadow the built-in type.

        list_node = List([item], item.indent, item.marker, item.num)
        self.parse_region_details(list_node, self.list_pattern_names, True)
        return list_node

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.match_group("indent"))
        marker = self.match_group("marker")
        num = self.match_group("num")
        space = self.match_group("pad")

        last = region.node(-1)

        new_list = not isinstance(last, (List, ListItem))
        same_indent = not new_list and indent == last.indent
        new_marker = not new_list and last.marker != marker and same_indent
        new_num = not new_list and num is not None and last.num != num and same_indent

        # If the marker or number changes at the same indent, or if the indent
        # is smaller, queue the item and end the list.

        # Note that Moin format does not seek to support item renumbering,
        # instead starting new lists on number changes.

        if not new_list and (new_marker or new_num or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space, num)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item, adding a trailing
        # block for new elements.

        if new_list:
            item = self.parse_list(item)
            self.add_node(region, item)
            self.new_block(region)

        # Add a nested list to the last item.

        elif indent > last.indent:
            item = self.parse_list(item)
            self.add_node(last, item)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.match_group("rule"))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.match_group("indent"))
        level = len(self.match_group("level"))

        section = self.parse_region(level, indent, "inline")

        # If the section is inline, treat it like any other inline element.

        if section.type == "inline":
            region.append_inline(section)

        # Otherwise, add it as a new block element.

        else:
            self.add_node(region, section)
            if region.allow_blocks:
                self.new_block(region)

    def parse_section_end(self, region):

        "Handle the end of a new section within 'region'."

        level = self.match_group("level")
        feature = self.match_group("feature")
        region.extra = self.match_group("extra")

        # An end marker not accepted by the region is retained as plain text.

        if region.have_end(level):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        "Handle the start of table attributes within 'cell'."

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            # Add the attributes as a node, also recording their presence.

            cell.append(attrs)
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        "Handle the start of a table row within 'region'."

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.matching_pattern() == "tableend":
                trailing = self.match_group("extra")

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell) + trailing))

                    self.new_block(region)
                    return

                # Otherwise, the row has cells and the final cell is empty
                # (the alternative having returned above), so the final cell
                # is discarded and only the trailing text is recorded.

                row.trailing = trailing
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.match_group("value")
        alignment = "top" if value == "^" else "bottom"
        attrs.append(TableAttr("valign", alignment, True))



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region within 'region', creating a span of the given
        'cls' that is terminated by the ending pattern corresponding to
        'pattern_name'.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle a larger text span within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        """
        Handle a monospace span within 'region'. Only the monospace ending
        pattern is recognised inside the span.
        """

        span = Monospace([])
        self.parse_region_details(span, ["monospaceend"])
        region.append_inline(span)

    def parse_smaller(self, region):

        "Handle a smaller text span within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_strike(self, region):

        "Handle a strikethrough span within 'region'."

        self.parse_inline(region, Strikethrough, "strike")

    def parse_sub(self, region):

        "Handle a subscript span within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle a superscript span within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle an underline span within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Complete inline pattern handlers.

    def parse_link(self, region):

        "Handle a link within 'region'."

        target = self.match_group("target")
        text = self.match_group("text")

        # Where no link text is given, the absent or empty text value is
        # itself passed on instead of a list of nodes.

        link = Link(text and [Text(text)], target)
        region.append_inline(link)

    def parse_macro(self, region):

        "Handle a macro within 'region', recording it with the root parser."

        name = self.match_group("name")
        args = self.match_group("args")

        # Obtain the raw arguments. Moin usually leaves it to the macro to
        # interpret the individual arguments.

        arglist = args.split(",") if args else []
        macro = Macro(name, arglist)
        region.append_inline(macro)

        # Record the macro for later processing.

        self.root.macros.append(macro)



    # Table attribute handlers.

    def parse_table_attr(self, attrs, pattern_name):

        """
        Handle a table attribute, adding to 'attrs' an attribute named
        'pattern_name' having the matched value.
        """

        attrs.append(TableAttr(pattern_name, self.match_group("value"), True))

    def parse_colour(self, cell):

        "Handle a colour attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    # All fragments containing backslashes are written as raw strings so that
    # the regular expression escapes (and the \N and \Q notations handed to
    # get_patterns) are never interpreted as Python string escapes.

    syntax = {
        # Page regions:

        "regionstart"   : join((group("indent", r"\N*"),                    # ws... (optional)
                                group("level", repeat("[{]", 3)))),         # {{{...

        "regionend"     : join((r"\N*",                                     # ws... (optional)
                                group("feature", join((
                                    group("level", repeat("[}]", 3)),       # }}}...
                                    group("extra", r"\n"),
                                    "?"))))),                               # nl (optional)

        "header"        : join(("#!",                                       # #!
                                group("args", ".*?"), "\n")),               # text-excl-nl

        # Region contents:

        # Line-oriented patterns support features which require their own
        # separate lines.

        "break"         : r"^(\s*?)\n",                                     # blank line

        "defterm"       : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(".+?::"))),                          # text ::

        "defterm_empty" : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(r"::\s+"))),                         # ::
                                                                            # ws... (optional)

        "heading"       : join(("^",
                                group("extra", r"\N*"),                     # ws... (optional)
                                group("level", "=+"),                       # =...
                                group("pad", r"\s+"),                       # ws...
                                expect(join((r".*?\N+",                     # text
                                             recur("level"),                # =...
                                             r"\N*$"))))),                  # ws... (optional)

        "listitem"      : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\*"),                     # list-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "listitem_num"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\d+\."),                  # decimal-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_alpha": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[aA]\."),                 # alpha-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_roman": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[iI]\."),                 # roman-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_dot"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\."),                     # dot-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "tablerow"      : r"^\|\|",                                         # ||

        # Region contents:

        # Inline patterns are for markup features that appear within blocks.
        # The patterns below start inline spans that can contain other markup
        # features.

        "fontstyle"     : group("style", repeat("'", 2, 6)),                # ''...
        "larger"        : r"~\+",                                           # ~+
        "monospace"     : r"`",                                             # `
        "rule"          : group("rule", "-----*"),                          # ----...
        "smaller"       : r"~-",                                            # ~-
        "strike"        : r"--\(",                                          # --(
        "sub"           : r",,",                                            # ,,
        "super"         : r"\^",                                            # ^
        "underline"     : r"__",                                            # __

        # Complete inline patterns are for markup features that do not support
        # arbitrary content within them:

        "link"          : join((r"\[\[",                                    # [[
                                group("target", ".*?"),                     # target
                                optional(join((r"\|", group("text", ".*?")))), # | text (optional)
                                "]]")),                                     # ]]

        "macro"         : join(("<<",                                       # <<
                                group("name", r"\w+?"),                     # digit-letter...
                                optional(join((r"\(",                       # ( (optional)
                                               group("args", ".*?"),        # not-)...
                                               r"\)"))),                    # ) (optional)
                                ">>")),                                     # >>

        # Ending patterns for inline features:

        "largerend"     : r"\+~",                                           # +~
        "monospaceend"  : r"`",                                             # `
        "smallerend"    : r"-~",                                            # -~
        "strikeend"     : r"\)--",                                          # )--
        "subend"        : r",,",                                            # ,,
        "superend"      : r"\^",                                            # ^
        "underlineend"  : r"__",                                            # __

        # Heading contents:

        "headingend"    : join((group("pad", r"\N+"),                       # ws...
                                group("level", "=+"),                       # =...
                                group("extra", r"\N*\n"))),                 # ws (optional) nl

        # List contents:

        "deftermend"    : join(("::", group("pad", r"\s*?\n"))),            # ::
                                                                            # ws... (optional)
                                                                            # nl

        "deftermsep"    : join(("::", group("pad", r"\s+"))),               # ::
                                                                            # ws... (optional)

        "listitemend"   : r"^",                                             # next line

        # Table contents:

        "tableattrs"    : r"<",                                             # <
        "tablecell"     : r"\|\|",                                          # ||

        "tableend"      : join((group("extra", r"\s*?"),                    # ws... (optional)
                                "^")),                                      # next line

        # Table attributes:

        "tableattrsend" : r">",                                             # >
        "halign"        : group("value", "[(:)]"),                          # halign-marker
        "valign"        : group("value", "[v^]"),                           # valign-marker
        "colour"        : group("value", join((r"\#",                       # #
                                               repeat("[0-9A-F]", 6, 6)))), # nnnnnn

        "colspan"       : join(("-",                                        # -
                                group("value", r"\d+"))),                   # n...

        "rowspan"       : join((r"\|",                                      # |
                                group("value", r"\d+"))),                   # n...

        "width"         : group("value", r"\d+%"),                          # n... %

        "attrname"      : join((excl(r"[-\d]"),                             # not-dash-or-digit
                                group("name", r"[-\w]+"))),                 # dash-digit-letter...

        "attrvalue"     : join(("=", group("quote", r"\Q"),                 # quote
                                group("value", ".*?"),                      # non-quote... (optional)
                                recur("quote"))),                           # quote
        }

    patterns = get_patterns(syntax)



    # Patterns available within certain markup features.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "fontstyle", "larger", "link", "macro", "monospace", "regionstart",
        "smaller", "strike", "sub", "super", "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    region_without_table_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionend", "rule",
        ]

    region_pattern_names = region_without_table_pattern_names + ["tablerow"]

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names with the starting pattern
        having the given 'name' replaced by its corresponding ending pattern,
        so that a span of that kind is closed rather than nested. A ValueError
        is raised if 'name' is not an inline pattern name.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Pattern handlers.

    end_region = ParserBase.end_region

    # The None entry applies the region-ending handler where no pattern name
    # is involved.
    # NOTE(review): presumably this covers the end of input - confirm against
    # ParserBase.

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "link"              : parse_link,
        "macro"             : parse_macro,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "strike"            : parse_strike,
        "strikeend"         : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4