#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsers.common import ParserBase, get_patterns, \
                                      excl, expect, group, optional, recur, \
                                      repeat
from moinformat.serialisers import serialise
from moinformat.tree.moin import Break, DefItem, DefTerm, FontStyle, Heading, \
                                 Larger, Link, List, ListItem, Macro, \
                                 Monospace, Region, Rule, Smaller, \
                                 Strikethrough, Subscript, Superscript, Table, \
                                 TableAttr, TableAttrs, TableCell, TableRow, \
                                 Text, Underline

join = "".join

class MoinParser(ParserBase):

    """
    A wiki region parser.

    The class combines three things: handler methods building document tree
    nodes, the 'syntax' table of regular expression fragments, and the
    'handlers' table mapping each pattern name to the method handling it.

    Several handlers raise StopIteration to finish the region being parsed.
    NOTE(review): this is presumably intercepted by the ParserBase machinery
    (defined elsewhere) - confirm before changing the exception type.
    """

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. Entries in 'formats' override the defaults
        registered here for the same names.
        """

        # Introduce this class as the default parser for the wiki format.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats)

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.

        The top-level region is stored in the 'region' attribute and returned.
        """

        self.items = self.get_items(s)
        self.region = Region([], type="moin")

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        """
        Handle an attribute name within 'attrs', appending a TableAttr for it.

        A quote and value are only recorded where an "attrvalue" match
        immediately follows the name, with no intervening text.
        """

        name = self.match_group("name")
        attr = TableAttr(name)

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.match_group("quote")
            attr.value = self.match_group("value")

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region', starting a new block."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed on
        to the created DefItem. The item's content is parsed until the
        "listitemend" pattern is encountered.
        """

        pad = self.match_group("pad")
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        """
        Handle a definition term within 'region'. Where the term is ended by a
        separator (as opposed to the end of the line), the definition item
        following the separator is parsed as well.
        """

        pad = self.match_group("pad")
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)
        if self.matching_pattern() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        """
        Handle an empty definition term within 'region'. The padding preceding
        the separator is retained as extra text for the definition item.
        """

        extra = self.match_group("pad")
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles.

        The matched run of two to six quotes is interpreted by its length:
        lengths 2, 4 and 5 involve emphasis; lengths 3, 5 and 6 involve strong.
        Within an existing FontStyle 'region' the run first closes any
        applicable open styles, with any quotes left over opening a new span.
        """

        n = len(self.match_group("style"))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Each closed style consumes its share of the quote run.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # Where the region is no longer active, return any unused quotes
            # to the input and finish the region.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            # Where the region remains active and nothing is left, continue.

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        # Doubled markers (4 or 6 quotes) yield a span without any content.

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        """
        Handle horizontal alignment within 'attrs': "(" indicates left, ")"
        indicates right, and any other matched marker indicates centre
        alignment.
        """

        value = self.match_group("value")

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attrs.append(TableAttr("halign", alignment, True))

    def parse_heading(self, region):

        """
        Handle a heading within 'region', parsing inline content up to the
        heading end marker and starting a new block afterwards.
        """

        start_extra = self.match_group("extra")
        level = len(self.match_group("level"))
        start_pad = self.match_group("pad")
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading. The 'heading' is only finished where the
        matched marker has the same level as the heading's start marker;
        otherwise, nothing is done here.
        """

        level = len(self.match_group("level"))
        if heading.level == level:
            heading.end_pad = self.match_group("pad")
            heading.end_extra = self.match_group("extra")
            raise StopIteration

    def parse_list(self, item):

        "Create a list, starting with 'item', returning the populated list."

        # Avoid shadowing the list built-in with the local name.

        item_list = List([item], item.indent, item.marker, item.num)
        self.parse_region_details(item_list, self.list_pattern_names, True)
        return item_list

    def parse_listitem(self, region):

        """
        Handle a list item marker within 'region'.

        Depending on the last node in 'region', the item starts a new list,
        starts a nested list within the last item, joins the current list, or
        causes the current list to be ended.
        """

        indent = len(self.match_group("indent"))
        marker = self.match_group("marker")
        num = self.match_group("num")
        space = self.match_group("pad")

        last = region.node(-1)

        new_list = not isinstance(last, (List, ListItem))
        same_indent = not new_list and indent == last.indent
        new_marker = not new_list and last.marker != marker and same_indent
        new_num = not new_list and num is not None and last.num != num and same_indent

        # If the marker or number changes at the same indent, or if the indent
        # is smaller, queue the item and end the list.

        # Note that Moin format does not seek to support item renumbering,
        # instead starting new lists on number changes.

        if not new_list and (new_marker or new_num or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space, num)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item, adding a trailing
        # block for new elements.

        if new_list:
            item = self.parse_list(item)
            self.add_node(region, item)
            self.new_block(region)

        # Add a nested list to the last item.

        elif indent > last.indent:
            item = self.parse_list(item)
            self.add_node(last, item)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        """
        Handle a horizontal rule within 'region', recording the length of the
        matched marker and starting a new block afterwards.
        """

        length = len(self.match_group("rule"))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.match_group("indent"))
        level = len(self.match_group("level"))

        section = self.parse_region(level, indent, "inline")

        # If the section is inline, treat it like any other inline element.

        if section.type == "inline":
            region.append_inline(section)

        # Otherwise, add it as a new block element.

        else:
            self.add_node(region, section)
            if region.allow_blocks:
                self.new_block(region)

    def parse_section_end(self, region):

        """
        Handle the end of a new section within 'region'. Where the matched
        marker does not end the region, the matched text is added to the region
        as plain text instead.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")

        # The trailing text is recorded whether or not the region ends here.

        region.extra = self.match_group("extra")

        if region.have_end(level):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        """
        Handle the start of table attributes within 'cell'. Valid attributes
        are added to the cell as a node and recorded in its 'attrs' attribute;
        invalid attributes are serialised and added as plain text.
        """

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            # Add the attributes as a node, also recording their presence.

            cell.append(attrs)
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        """
        Handle the start of a table row within 'region'.

        The row is added to the table immediately preceding the current block,
        provided that block is empty; otherwise, a new table is started. A row
        left unfinished at the end of its line is converted back into text.
        """

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        # NOTE(review): the loop only terminates upon a "tableend" match. It
        # looks like input ending within a row, without any such match, could
        # cause empty cells to be appended indefinitely - confirm against the
        # behaviour of parse_region_details at the end of the input.

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.matching_pattern() == "tableend":
                trailing = self.match_group("extra")

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell) + trailing))

                    self.new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                    # NOTE(review): this branch is only reached when the cell
                    # is empty, so this test appears never to succeed.

                    if not cell.empty():
                        row.append(cell)
                    break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        """
        Handle vertical alignment within 'attrs': "^" indicates top alignment
        and any other matched marker indicates bottom alignment.
        """

        value = self.match_group("value")

        if value == "^":
            alignment = "top"
        else:
            alignment = "bottom"

        attrs.append(TableAttr("valign", alignment, True))



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region within 'region', creating a span using 'cls'
        and parsing its content using the inline patterns, with the starting
        pattern having the given 'pattern_name' replaced by its ending pattern.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        """
        Handle monospaced text within 'region'. Only the ending marker is
        recognised within such text, and so other markup is not interpreted.
        """

        span = Monospace([])
        self.parse_region_details(span, ["monospaceend"])
        region.append_inline(span)

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_strike(self, region):

        "Handle struck-through text within 'region'."

        self.parse_inline(region, Strikethrough, "strike")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Complete inline pattern handlers.

    def parse_link(self, region):

        """
        Handle a link within 'region'. Any link text becomes the content of the
        link; without such text, the absent or empty value is passed on as the
        content instead.
        """

        target = self.match_group("target")
        text = self.match_group("text")
        link = Link(text and [Text(text)], target)
        region.append_inline(link)

    def parse_macro(self, region):

        "Handle a macro within 'region'."

        name = self.match_group("name")
        args = self.match_group("args")

        # Obtain the raw arguments. Moin usually leaves it to the macro to
        # interpret the individual arguments.

        if args:
            arglist = args.split(",")
        else:
            arglist = []

        macro = Macro(name, arglist)
        region.append_inline(macro)



    # Table attribute handlers.

    # NOTE(review): the 'cell' parameter of the specific handlers below is
    # presumably the attribute collection being populated by parse_table_attrs,
    # not a table cell. The name is retained for compatibility.

    def parse_table_attr(self, attrs, pattern_name):

        """
        Handle a table attribute, appending to 'attrs' an attribute named
        'pattern_name' having the matched value.
        """

        attrs.append(TableAttr(pattern_name, self.match_group("value"), True))

    def parse_colour(self, cell):

        "Handle a colour attribute."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    # Fragments employing regular expression escapes are written as raw strings
    # so that Python does not report invalid string escape sequences.

    # NOTE(review): \N and \Q are not standard regular expression escapes and
    # are presumably expanded by get_patterns - confirm in
    # moinformat.parsers.common.

    syntax = {
        # Page regions:

        "regionstart"   : join((group("indent", r"\N*"),                # ws... (optional)
                                group("level", repeat("[{]", 3)))),     # {{{...

        "regionend"     : join((r"\N*",                                 # ws... (optional)
                                group("feature", join((
                                    group("level", repeat("[}]", 3)),   # }}}...
                                    group("extra", r"\n"),
                                    "?"))))),                           # nl (optional)

        "header"        : join(("#!",                                   # #!
                                group("args", ".*?"), "\n")),           # text-excl-nl

        # Region contents:

        # Line-oriented patterns support features which require their own
        # separate lines.

        "break"         : r"^(\s*?)\n",                                 # blank line

        "defterm"       : join(("^",
                                group("pad", r"\N+"),                   # ws...
                                expect(".+?::"))),                      # text ::

        "defterm_empty" : join(("^",
                                group("pad", r"\N+"),                   # ws...
                                expect(r"::\s+"))),                     # ::
                                                                        # ws...

        "heading"       : join(("^",
                                group("extra", r"\N*"),                 # ws... (optional)
                                group("level", "=+"),                   # =...
                                group("pad", r"\s+"),                   # ws...
                                expect(join((r".*?\N+",                 # text
                                             recur("level"),            # =...
                                             r"\N*$"))))),              # ws... (optional)

        "listitem"      : join(("^",
                                group("indent", r"\N+"),                # ws...
                                group("marker", r"\*"),                 # list-marker
                                group("pad", r"\s*"))),                 # ws... (optional)

        "listitem_num"  : join(("^",
                                group("indent", r"\N+"),                # ws...
                                group("marker", r"\d+\."),              # decimal-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                 # ws...

        "listitem_alpha": join(("^",
                                group("indent", r"\N+"),                # ws...
                                group("marker", r"[aA]\."),             # alpha-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                 # ws...

        "listitem_roman": join(("^",
                                group("indent", r"\N+"),                # ws...
                                group("marker", r"[iI]\."),             # roman-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                 # ws...

        "listitem_dot"  : join(("^",
                                group("indent", r"\N+"),                # ws...
                                group("marker", r"\."),                 # dot-marker
                                group("pad", r"\s*"))),                 # ws... (optional)

        "tablerow"      : r"^\|\|",                                     # ||

        # Region contents:

        # Inline patterns are for markup features that appear within blocks.
        # The patterns below start inline spans that can contain other markup
        # features.

        "fontstyle"     : group("style", repeat("'", 2, 6)),            # ''...
        "larger"        : r"~\+",                                       # ~+
        "monospace"     : r"`",                                         # `
        "rule"          : group("rule", "-----*"),                      # ----...
        "smaller"       : r"~-",                                        # ~-
        "strike"        : r"--\(",                                      # --(
        "sub"           : r",,",                                        # ,,
        "super"         : r"\^",                                        # ^
        "underline"     : r"__",                                        # __

        # Complete inline patterns are for markup features that do not support
        # arbitrary content within them:

        "link"          : join((r"\[\[",                                # [[
                                group("target", ".*?"),                 # target
                                optional(join((r"\|", group("text", ".*?")))), # | text (optional)
                                "]]")),                                 # ]]

        "macro"         : join(("<<",                                   # <<
                                group("name", r"\w+?"),                 # digit-letter...
                                optional(join((r"\(",                   # ( (optional)
                                               group("args", ".*?"),    # not-)...
                                               r"\)"))),                # ) (optional)
                                ">>")),                                 # >>

        # Ending patterns for inline features:

        "largerend"     : r"\+~",                                       # +~
        "monospaceend"  : r"`",                                         # `
        "smallerend"    : r"-~",                                        # -~
        "strikeend"     : r"\)--",                                      # )--
        "subend"        : r",,",                                        # ,,
        "superend"      : r"\^",                                        # ^
        "underlineend"  : r"__",                                        # __

        # Heading contents:

        "headingend"    : join((group("pad", r"\N+"),                   # ws...
                                group("level", "=+"),                   # =...
                                group("extra", r"\N*\n"))),             # ws (optional) nl

        # List contents:

        "deftermend"    : join(("::", group("pad", r"\s*?\n"))),        # ::
                                                                        # ws... (optional)
                                                                        # nl

        "deftermsep"    : join(("::", group("pad", r"\s+"))),           # ::
                                                                        # ws...

        "listitemend"   : r"^",                                         # next line

        # Table contents:

        "tableattrs"    : r"<",                                         # <
        "tablecell"     : r"\|\|",                                      # ||

        "tableend"      : join((group("extra", r"\s*?"),                # ws... (optional)
                                "^")),                                  # next line

        # Table attributes:

        "tableattrsend" : r">",                                         # >
        "halign"        : group("value", "[(:)]"),                      # halign-marker
        "valign"        : group("value", "[v^]"),                       # valign-marker
        "colour"        : group("value", join((r"\#",                   # #
                                               repeat("[0-9A-F]", 6, 6)))), # nnnnnn

        "colspan"       : join(("-",                                    # -
                                group("value", r"\d+"))),               # n...

        "rowspan"       : join((r"\|",                                  # |
                                group("value", r"\d+"))),               # n...

        "width"         : group("value", r"\d+%"),                      # n... %

        "attrname"      : join((excl(r"[-\d]"),                         # not-dash-or-digit
                                group("name", r"[-\w]+"))),             # dash-digit-letter...

        "attrvalue"     : join(("=", group("quote", r"\Q"),             # quote
                                group("value", ".*?"),                  # non-quote... (optional)
                                recur("quote"))),                       # quote
        }

    patterns = get_patterns(syntax)



    # Patterns available within certain markup features.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "fontstyle", "larger", "link", "macro", "monospace", "regionstart",
        "smaller", "strike", "sub", "super", "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    region_without_table_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionend", "rule",
        ]

    region_pattern_names = region_without_table_pattern_names + ["tablerow"]

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names with the starting pattern
        'name' replaced by the corresponding ending pattern, this having the
        name with "end" appended. A ValueError is raised by the list lookup
        where 'name' is not an inline pattern name.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Pattern handlers.

    end_region = ParserBase.end_region

    # The None entry handles the absence of any matching pattern.

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "link"              : parse_link,
        "macro"             : parse_macro,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "strike"            : parse_strike,
        "strikeend"         : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4