#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsers.common import ParserBase, get_patterns, \
                                      excl, expect, group, optional, recur, repeat
from moinformat.serialisers import serialise
from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, Link, List, ListItem, Macro, Monospace, \
                            Region, Rule, Smaller, Strikethrough, Subscript, \
                            Superscript, Table, TableAttr, TableAttrs, \
                            TableCell, TableRow, Text, Underline

join = "".join

class MoinParser(ParserBase):

    """
    A wiki region parser.

    Each handler method receives the node currently being populated and
    consults the most recent pattern match through helper methods such as
    'match_group' and 'matching_pattern' (not defined in this class;
    presumably provided by ParserBase). Several handlers raise StopIteration
    to signal the end of the node being populated.
    """

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        # Introduce this class as the default parser for the wiki format.
        # Entries in 'formats' override these defaults.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats)

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region populated from the text.
        """

        self.items = self.get_items(s)
        self.region = Region([], type="moin")

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        "Handle an attribute name within 'attrs'."

        name = self.match_group("name")
        attr = TableAttr(name)

        # Only accept a value if it immediately follows the name: any text
        # found before the value pattern leaves the attribute without a value.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.match_group("quote")
            attr.value = self.match_group("value")

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed
        on to the created item.
        """

        pad = self.match_group("pad")
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        "Handle a definition term within 'region'."

        pad = self.match_group("pad")
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)

        # A separator (as opposed to a term ending) introduces an item on the
        # same line as the term.

        if self.matching_pattern() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.match_group("pad")
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles within 'region', where the matched
        run of quote characters may close active styles, open new ones, or
        both. StopIteration is raised when 'region' is a font style span that
        is no longer active after closing.
        """

        n = len(self.match_group("style"))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Consume the quotes that close each active style.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # End the span, returning any unused quotes to the input.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            # Remain in the span if all quotes were consumed.

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        # Doubled markers (four or six quotes) produce a span that is not
        # populated with any content.

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.match_group("value")

        # The pattern only admits "(", ":" and ")".

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attr = TableAttr("halign", alignment, True)
        attrs.append(attr)

    def parse_heading(self, region):

        "Handle a heading within 'region'."

        start_extra = self.match_group("extra")
        level = len(self.match_group("level"))
        start_pad = self.match_group("pad")
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading, raising StopIteration if the matched
        ending has the same level as 'heading'. Endings of other levels are
        ignored here.
        """

        level = len(self.match_group("level"))
        if heading.level == level:
            heading.end_pad = self.match_group("pad")
            heading.end_extra = self.match_group("extra")
            raise StopIteration

    def parse_list(self, item):

        "Create a list, starting with 'item', and return it."

        # NOTE: The local name avoids shadowing the 'list' built-in.

        new_list = List([item], item.indent, item.marker, item.num)
        self.parse_region_details(new_list, self.list_pattern_names, True)
        return new_list

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.match_group("indent"))
        marker = self.match_group("marker")
        num = self.match_group("num")
        space = self.match_group("pad")

        last = region.node(-1)

        new_list = not isinstance(last, (List, ListItem))
        same_indent = not new_list and indent == last.indent
        new_marker = not new_list and last.marker != marker and same_indent
        new_num = not new_list and num is not None and last.num != num and same_indent

        # If the marker or number changes at the same indent, or if the indent
        # is smaller, queue the item and end the list.

        # Note that Moin format does not seek to support item renumbering,
        # instead starting new lists on number changes.

        if not new_list and (new_marker or new_num or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space, num)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item, adding a trailing
        # block for new elements.

        if new_list:
            item = self.parse_list(item)
            self.add_node(region, item)
            self.new_block(region)

        # Add a nested list to the last item.

        elif indent > last.indent:
            item = self.parse_list(item)
            self.add_node(last, item)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.match_group("rule"))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.match_group("indent"))
        level = len(self.match_group("level"))

        section = self.parse_region(level, indent, "inline")

        # If the section is inline, treat it like any other inline element.

        if section.type == "inline":
            region.append_inline(section)

        # Otherwise, add it as a new block element.

        else:
            self.add_node(region, section)
            if region.allow_blocks:
                self.new_block(region)

    def parse_section_end(self, region):

        """
        Handle the end of a new section within 'region', raising StopIteration
        if the matched marker ends the region. Otherwise, the matched text is
        added to the region as inline text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")

        # NOTE: The extra text is recorded before the marker is tested, and so
        # NOTE: it is also set for markers that do not end the region.

        region.extra = self.match_group("extra")

        if region.have_end(level):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        """
        Handle the start of table attributes within 'cell'. Valid attributes
        are added to the cell as a node and recorded as 'cell.attrs'. Invalid
        attributes are added to the cell in serialised form as text.
        """

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            # Add the attributes as a node, also recording their presence.

            cell.append(attrs)
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        "Handle the start of a table row within 'region'."

        # Identify any active table. A table is continued only if it is
        # followed by nothing more than an empty block.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.matching_pattern() == "tableend":
                trailing = self.match_group("extra")

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell) + trailing))

                    self.new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                if not cell.empty():
                    row.append(cell)
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.match_group("value")

        # The pattern only admits "^" and "v".

        attr = TableAttr("valign", "top" if value == "^" else "bottom", True)
        attrs.append(attr)



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region, creating a span of the given 'cls', populating
        it using the inline patterns with 'pattern_name' replaced by its
        corresponding ending pattern, and adding the span to 'region'.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        """
        Handle monospaced text within 'region'. Only the ending pattern is
        sought when populating the span, not the other inline patterns.
        """

        span = Monospace([])
        self.parse_region_details(span, ["monospaceend"])
        region.append_inline(span)

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_strike(self, region):

        "Handle struck-through text within 'region'."

        self.parse_inline(region, Strikethrough, "strike")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Complete inline pattern handlers.

    def parse_link(self, region):

        "Handle a link within 'region'."

        target = self.match_group("target")
        text = self.match_group("text")

        # Without any link text, the false value obtained for the text is
        # passed on in place of a list of nodes.

        link = Link(text and [Text(text)], target)
        region.append_inline(link)

    def parse_macro(self, region):

        "Handle a macro within 'region'."

        name = self.match_group("name")
        args = self.match_group("args")

        # Obtain the raw arguments. Moin usually leaves it to the macro to
        # interpret the individual arguments.

        arglist = args.split(",") if args else []
        macro = Macro(name, arglist)
        region.append_inline(macro)



    # Table attribute handlers.

    def parse_table_attr(self, attrs, pattern_name):

        """
        Handle a table attribute, adding an attribute named 'pattern_name' to
        'attrs' with the matched value.
        """

        attrs.append(TableAttr(pattern_name, self.match_group("value"), True))

    # NOTE: The following handlers pass 'cell' on as the attribute collection
    # NOTE: of parse_table_attr. Since their patterns are only listed in
    # NOTE: table_pattern_names, the argument is presumably the table
    # NOTE: attributes node and not a table cell: confirm against the caller.

    def parse_colour(self, cell):

        "Handle a colour attribute."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    # NOTE: The \N and \Q sequences are not standard regular expression
    # NOTE: syntax and are presumably translated by get_patterns. All patterns
    # NOTE: containing backslash sequences are raw strings, except for the
    # NOTE: actual newline character in the header pattern.

    syntax = {
        # Page regions:

        "regionstart"   : join((group("indent", r"\N*"),                    # ws... (optional)
                                group("level", repeat("[{]", 3)))),         # {{{...

        "regionend"     : join((r"\N*",                                     # ws... (optional)
                                group("feature", join((
                                    group("level", repeat("[}]", 3)),       # }}}...
                                    group("extra", r"\n"),
                                    "?"))))),                               # nl (optional)

        "header"        : join(("#!",                                       # #!
                                group("args", ".*?"), "\n")),               # text-excl-nl

        # Region contents:

        # Line-oriented patterns support features which require their own
        # separate lines.

        "break"         : r"^(\s*?)\n",                                     # blank line

        "defterm"       : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(".+?::"))),                          # text ::

        "defterm_empty" : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(r"::\s+"))),                         # ::
                                                                            # ws...

        "heading"       : join(("^",
                                group("extra", r"\N*"),                     # ws... (optional)
                                group("level", "=+"),                       # =...
                                group("pad", r"\s+"),                       # ws...
                                expect(join((r".*?\N+",                     # text
                                             recur("level"),                # =...
                                             r"\N*$"))))),                  # ws... (optional)

        "listitem"      : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\*"),                     # list-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "listitem_num"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\d+\."),                  # decimal-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_alpha": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[aA]\."),                 # alpha-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_roman": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[iI]\."),                 # roman-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_dot"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\."),                     # dot-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "tablerow"      : r"^\|\|",                                         # ||

        # Region contents:

        # Inline patterns are for markup features that appear within blocks.
        # The patterns below start inline spans that can contain other markup
        # features.

        "fontstyle"     : group("style", repeat("'", 2, 6)),                # ''...
        "larger"        : r"~\+",                                           # ~+
        "monospace"     : r"`",                                             # `
        "rule"          : group("rule", "-----*"),                          # ----...
        "smaller"       : r"~-",                                            # ~-
        "strike"        : r"--\(",                                          # --(
        "sub"           : r",,",                                            # ,,
        "super"         : r"\^",                                            # ^
        "underline"     : r"__",                                            # __

        # Complete inline patterns are for markup features that do not support
        # arbitrary content within them:

        "link"          : join((r"\[\[",                                    # [[
                                group("target", ".*?"),                     # target
                                optional(join((r"\|", group("text", ".*?")))), # | text (optional)
                                "]]")),                                     # ]]

        "macro"         : join(("<<",                                       # <<
                                group("name", r"\w+?"),                     # digit-letter...
                                optional(join((r"\(",                       # ( (optional)
                                               group("args", ".*?"),        # not-)...
                                               r"\)"))),                    # ) (optional)
                                ">>")),                                     # >>

        # Ending patterns for inline features:

        "largerend"     : r"\+~",                                           # +~
        "monospaceend"  : r"`",                                             # `
        "smallerend"    : r"-~",                                            # -~
        "strikeend"     : r"\)--",                                          # )--
        "subend"        : r",,",                                            # ,,
        "superend"      : r"\^",                                            # ^
        "underlineend"  : r"__",                                            # __

        # Heading contents:

        "headingend"    : join((group("pad", r"\N+"),                       # ws...
                                group("level", "=+"),                       # =...
                                group("extra", r"\N*\n"))),                 # ws (optional) nl

        # List contents:

        "deftermend"    : join(("::", group("pad", r"\s*?\n"))),            # ::
                                                                            # ws... (optional)
                                                                            # nl

        "deftermsep"    : join(("::", group("pad", r"\s+"))),               # ::
                                                                            # ws...

        "listitemend"   : r"^",                                             # next line

        # Table contents:

        "tableattrs"    : r"<",                                             # <
        "tablecell"     : r"\|\|",                                          # ||

        "tableend"      : join((group("extra", r"\s*?"),                    # ws... (optional)
                                "^")),                                      # next line

        # Table attributes:

        "tableattrsend" : r">",                                             # >
        "halign"        : group("value", "[(:)]"),                          # halign-marker
        "valign"        : group("value", "[v^]"),                           # valign-marker
        "colour"        : group("value", join((r"\#",                       # #
                                               repeat("[0-9A-F]", 6, 6)))), # nnnnnn

        "colspan"       : join(("-",                                        # -
                                group("value", r"\d+"))),                   # n...

        "rowspan"       : join((r"\|",                                      # |
                                group("value", r"\d+"))),                   # n...

        "width"         : group("value", r"\d+%"),                          # n... %

        "attrname"      : join((excl(r"[-\d]"),                             # not-dash-or-digit
                                group("name", r"[-\w]+"))),                 # dash-digit-letter...

        "attrvalue"     : join(("=", group("quote", r"\Q"),                 # quote
                                group("value", ".*?"),                      # non-quote... (optional)
                                recur("quote"))),                           # quote
        }

    patterns = get_patterns(syntax)



    # Patterns available within certain markup features.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "fontstyle", "larger", "link", "macro", "monospace", "regionstart",
        "smaller", "strike", "sub", "super", "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    region_without_table_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionend", "rule",
        ]

    region_pattern_names = region_without_table_pattern_names + ["tablerow"]

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names with the pattern having the
        given 'name' replaced by its ending pattern (the name suffixed with
        "end"). A ValueError is raised by the list lookup if 'name' is not an
        inline pattern name.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Pattern handlers.

    end_region = ParserBase.end_region

    # A mapping from pattern names to the methods handling their matches, with
    # None mapping to the handler ending the current region.

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "link"              : parse_link,
        "macro"             : parse_macro,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "strike"            : parse_strike,
        "strikeend"         : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4