#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.serialisers import serialise
from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
                            Subscript, Superscript, TableAttr, TableAttrs, \
                            TableCell, TableRow, Text, Underline
import re

# Regular expressions.
#
# Each entry maps a pattern name to its expression. The same names are used as
# keys in the 'handlers' mapping at the end of this module, and the capture
# groups are read by number in the handlers via TokenStream.read_match.

syntax = {
    # Page regions:

    # {{{...
    "regionstart"   : r"((^\s*)([{]{3,}))",
    # }}}...
    "regionend"     : r"^\s*([}]{3,})",
    # #! char-excl-nl
    "header"        : r"#!(.*?)\n",

    # Region contents:
    # Line-oriented patterns:

    # blank line
    "break"         : r"^(\s*?)\n",
    # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
    # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
    # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
    # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
    # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
    # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
    # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
    # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",
    # ||
    "tablerow"      : r"^\|\|",

    # Region contents:
    # Inline patterns:

    "fontstyle"     : r"('{2,6})",
    "larger"        : r"~\+",
    "monospace"     : r"`",
    # ----...
    "rule"          : r"(-----*)",
    "smaller"       : r"~-",
    "sub"           : r",,",
    "super"         : r"\^",
    "underline"     : r"__",

    # Inline contents:

    "largerend"     : r"\+~",
    "monospaceend"  : r"`",
    "smallerend"    : r"-~",
    "subend"        : r",,",
    "superend"      : r"\^",
    "underlineend"  : r"__",

    # Heading contents:

    # ws... =... [ws...] nl
    "headingend"    : r"(\s+)(=+)(\s*\n)",

    # List contents:

    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    # next line
    "listitemend"   : r"^",

    # Table contents:

    "tableattrs"    : r"<",
    "tablecell"     : r"\|\|",
    # [ws...] next line
    "tableend"      : r"(\s*?)^",

    # Table attributes:

    "tableattrsend" : r">",
    "halign"        : r"([(:)])",
    "valign"        : r"([v^])",
    "colour"        : r"(\#[0-9A-F]{6})",
    "colspan"       : r"-(\d+)",
    "rowspan"       : r"\|(\d+)",
    "width"         : r"(\d+%)",
    # not-dash-or-digit dash-or-word-char...
    "attrname"      : r"((?![-\d])[-\w]+)",
    "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
    }

# Define pattern details.

# Patterns searched for while inside table attributes (between "<" and ">").

table_pattern_names = ["attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend", "valign", "width"]

# Patterns that open inline spans.

inline_pattern_names = ["fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline"]

def inline_patterns_for(name):

    """
    Return a copy of the inline pattern names in which the opening pattern
    'name' is replaced by its closing counterpart, named by appending "end" to
    'name'. A ValueError is raised by the list lookup if 'name' is not an
    inline pattern name.
    """

    names = inline_pattern_names[:]
    names[names.index(name)] = "%send" % name
    return names

# Define patterns for the regular expressions.
#
# MULTILINE makes "^" match just after each newline as well as at the start of
# the text, which the line-oriented patterns above depend upon.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records the current position ('pos') in the string 's', together
    with the pattern name ('matching') and match object ('match') found by the
    most recent search, both being None if that search found nothing.
    """

    def __init__(self, s):

        "Initialise the stream with the string 's', positioned at its start."

        self.s = s
        self.pos = 0
        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        Where a match is found, the position in the stream is not changed:
        read_match must be used to move past the matched text. Where no match
        is found and the remaining text is returned, that text is consumed, so
        that callers resuming after a nested parsing operation do not obtain
        and add the same text again.
        """

        first = None

        # Discard any previous result so that read_match can never act upon a
        # match from an earlier search.

        self.matching = None
        self.match = None

        # Find the first matching pattern. The comparison is strict, so where
        # matches start at the same position, the pattern listed earliest in
        # 'pattern_names' is chosen.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                rest = self.s[self.pos:]
                self.pos = len(self.s)
                return rest
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the current pattern does not define the
        indicated group. If there is no current match, the position is moved to
        the end of the string and None is returned.
        """

        if self.match:
            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parsing utilities.

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text found before each match is added to 'region' as inline text, and each
    match is passed to the handler registered for its pattern name, or added as
    text if no handler is registered. Parsing stops at the end of the input or
    when a handler raises StopIteration, after which the region is normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_inline(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_inline(Text(feature))

    # StopIteration is the signal used by handlers to end the current region.

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region', breaking out of the parsing loop."

    raise StopIteration

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.add(block)



# Parser functions for different page features.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region produced for the text.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    The region type is only set if a header is found at the current position.
    """

    # An empty string means a header starts right here; None means that no
    # header exists at all; other text means that a header occurs only later.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, inline_pattern_names + [
        "break", "heading",
        "defterm", "defterm_empty",
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        "regionstart", "regionend",
        "rule",
        "tablerow",
        ])

def parse_region_opaque(items, region):

    """
    Parse the data provided by 'items' to populate an opaque 'region', where
    only the end of the region is recognised.
    """

    parse_region_details(items, region, ["regionend"])

def parse_attrname(items, attrs):

    """
    Handle an attribute name within 'attrs', also obtaining the quoting and
    value of the attribute if a value immediately follows the name.
    """

    name = items.read_match()
    attr = TableAttr(name)

    # Only accept a value starting directly after the name.

    preceding = items.read_until(["attrvalue"], False)
    if preceding == "":
        attr.quote = items.read_match(1)
        attr.value = items.read_match(2)

    attrs.append(attr)

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    region.add(Break())
    new_block(region)

def parse_defitem(items, region, extra=""):

    """
    Handle a definition item within 'region', with 'extra' being passed on to
    the item. The current match provides the padding before the item, and the
    item extends to the start of the next line.
    """

    pad = items.read_match(1)
    item = DefItem([], pad, extra)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_defterm(items, region):

    """
    Handle a definition term within 'region', followed by a definition item if
    the term was ended by a separator as opposed to the end of the line.
    """

    pad = items.read_match(1)
    term = DefTerm([], pad)
    parse_region_details(items, term, ["deftermend", "deftermsep"])
    region.add(term)
    if items.matching == "deftermsep":
        parse_defitem(items, region)

def parse_defterm_empty(items, region):

    """
    Handle an empty definition term within 'region': leading space followed
    directly by the separator and a definition item.
    """

    extra = items.read_match(1)

    # Move to the separator, which the defterm_empty pattern expects to follow.

    parse_region_details(items, region, ["deftermsep"])
    parse_defitem(items, region, extra)

def parse_fontstyle(items, region):

    """
    Handle emphasis and strong styles, where the number of quote characters in
    the current match selects emphasis (2), strong (3) or both (5), with 4 and
    6 being treated as immediately closed spans.
    """

    n = len(items.read_match(1))

    # Handle endings.

    if isinstance(region, FontStyle):
        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        active = True

        # Deduct the quotes used to close each style.

        if region.emphasis and emphasis:
            active = region.close_emphasis()
            n -= 2
        if region.strong and strong:
            active = region.close_strong()
            n -= 3

        # With the span closed, give back any unused quotes so that they are
        # found again by the enclosing region.

        if not active:
            if n:
                items.rewind(n)
            raise StopIteration

        # With all quotes used but the span remaining open, continue within it.

        elif not n:
            return

    # Handle new styles.

    emphasis = n in (2, 4, 5)
    strong = n in (3, 5, 6)
    double = n in (4, 6)

    span = FontStyle([], emphasis, strong)
    if not double:
        parse_region_details(items, span, inline_pattern_names)
    region.append_inline(span)

def parse_halign(items, attrs):

    "Handle horizontal alignment within 'attrs'."

    value = items.read_match()

    # The pattern only admits "(", ":" and ")".

    if value == "(":
        alignment = "left"
    elif value == ")":
        alignment = "right"
    else:
        alignment = "center"

    attrs.append(TableAttr("halign", alignment, True))

def parse_heading(items, region):

    """
    Handle a heading within 'region', using the current match to obtain the
    leading space, the level (the number of markers) and the padding.
    """

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"] + inline_pattern_names)
    region.add(heading)
    new_block(region)

def parse_heading_end(items, heading):

    """
    Handle the end of a heading, ending the parsing of 'heading' if the end
    marker has the same level as the heading.
    """

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

    # NOTE(review): an end marker of a different level has been consumed at
    # this point but is not added to the heading as text - confirm whether this
    # is intended.

def parse_listitem(items, region):

    """
    Handle a list item marker within 'region', using the current match to
    obtain the indent, marker and spacing. The item extends to the start of the
    next line.
    """

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_rule(items, region):

    "Handle a horizontal rule within 'region', recording the length of the rule."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.add(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.add(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region', ending the parsing of the
    region if the marker is accepted by the region, adding the marker as text
    otherwise.
    """

    # NOTE(review): only the marker itself is captured by the pattern, so any
    # whitespace matched before the marker is not preserved - confirm intent.

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_inline(Text(feature))

def parse_table_attrs(items, cell):

    """
    Handle the start of table attributes within 'cell', setting the attributes
    on the cell if they are valid, adding them to the cell as text otherwise.
    """

    attrs = TableAttrs([])
    parse_region_details(items, attrs, table_pattern_names)

    # Test the validity of the attributes.

    last = None

    for node in attrs.nodes:

        # Text separator nodes must be whitespace.

        if isinstance(node, Text):
            if node.s.strip():
                break

        # Named attributes must be preceded by space if not the first.

        elif last and not node.concise and not isinstance(last, Text):
            break

        last = node

    # All nodes were valid: preserve the collection.

    else:
        cell.attrs = attrs
        return

    # Invalid nodes were found: serialise the attributes as text.

    cell.append_inline(Text(serialise(attrs)))

def parse_table_row(items, region):

    """
    Handle the start of a table row within 'region'.

    Cells are collected until the end of the row, this being indicated by the
    start of the next line or by the end of the input. A row whose final cell
    is started but not finished is added to 'region' as text instead.
    """

    row = TableRow([])

    while True:
        cell = TableCell([])
        parse_region_details(items, cell, ["tableattrs", "tablecell", "tableend"])

        # Handle the end of the row. Input ending without a newline never
        # produces a tableend match, so the absence of any match must also end
        # the row: otherwise, empty cells would be appended without end.

        row_ended = items.matching == "tableend"
        input_ended = items.matching is None

        if row_ended or input_ended:
            if row_ended:
                trailing = items.read_match()
            else:
                trailing = ""

            # If the cell was started but not finished, convert the row into text.

            if not row.nodes or not cell.empty():
                for node in row.nodes:
                    region.append_inline(Text(serialise(node)))
                region.append_inline(Text(serialise(cell)))
                region.append_inline(Text(trailing))

                new_block(region)
                return

            # Append the final cell, if not empty.

            else:
                row.trailing = trailing

            if not cell.empty():
                row.append(cell)
            break

        # A cell separator has been found.

        row.append(cell)

    region.add(row)
    new_block(region)

def parse_valign(items, attrs):

    "Handle vertical alignment within 'attrs'."

    value = items.read_match()

    # The pattern only admits "v" and "^".

    if value == "^":
        alignment = "top"
    else:
        alignment = "bottom"

    attrs.append(TableAttr("valign", alignment, True))



# Inline formatting handlers.

def parse_inline(items, region, cls, pattern_name):

    """
    Handle an inline region, creating a span using 'cls' that is populated
    until the end marker corresponding to 'pattern_name' is found, and adding
    the span to 'region'.
    """

    span = cls([])
    parse_region_details(items, span, inline_patterns_for(pattern_name))
    region.append_inline(span)

def _inline_handler(cls, pattern_name):

    "Return a handler creating 'cls' spans for the inline 'pattern_name'."

    def handler(items, region):
        return parse_inline(items, region, cls, pattern_name)

    return handler

parse_larger = _inline_handler(Larger, "larger")
parse_monospace = _inline_handler(Monospace, "monospace")
parse_smaller = _inline_handler(Smaller, "smaller")
parse_sub = _inline_handler(Subscript, "sub")
parse_super = _inline_handler(Superscript, "super")
parse_underline = _inline_handler(Underline, "underline")

# Table attribute handlers.

def parse_table_attr(items, attrs, pattern_name):

    """
    Handle a table attribute, adding an attribute named 'pattern_name' to
    'attrs' with the value taken from the current match.
    """

    value = items.read_match()
    attrs.append(TableAttr(pattern_name, value, True))

def _table_attr_handler(pattern_name):

    "Return a handler adding table attributes named 'pattern_name'."

    def handler(items, cell):
        return parse_table_attr(items, cell, pattern_name)

    return handler

parse_colour = _table_attr_handler("colour")
parse_colspan = _table_attr_handler("colspan")
parse_rowspan = _table_attr_handler("rowspan")
parse_width = _table_attr_handler("width")



# Pattern handlers.
#
# Each handler is called with the token stream and the node being populated.
# Patterns without an entry here have their matched text added as plain text.

handlers = {
    None : end_region,
    "attrname" : parse_attrname,
    "break" : parse_break,
    "colour" : parse_colour,
    "colspan" : parse_colspan,
    "defterm" : parse_defterm,
    "defterm_empty" : parse_defterm_empty,
    "deftermend" : end_region,
    "deftermsep" : end_region,
    "fontstyle" : parse_fontstyle,
    "halign" : parse_halign,
    "heading" : parse_heading,
    "headingend" : parse_heading_end,
    "larger" : parse_larger,
    "largerend" : end_region,
    "listitemend" : end_region,
    "listitem" : parse_listitem,
    "listitem_alpha" : parse_listitem,
    "listitem_dot" : parse_listitem,
    "listitem_num" : parse_listitem,
    "listitem_roman" : parse_listitem,
    "monospace" : parse_monospace,
    "monospaceend" : end_region,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    "rowspan" : parse_rowspan,
    "rule" : parse_rule,
    "smaller" : parse_smaller,
    "smallerend" : end_region,
    "sub" : parse_sub,
    "subend" : end_region,
    "super" : parse_super,
    "superend" : end_region,
    "tableattrs" : parse_table_attrs,
    "tableattrsend" : end_region,
    "tablerow" : parse_table_row,
    "tablecell" : end_region,
    "tableend" : end_region,
    "underline" : parse_underline,
    "underlineend" : end_region,
    "valign" : parse_valign,
    "width" : parse_width,
    }



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4