#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsing import ParserBase, TokenStream, new_block
from moinformat.serialisers import serialise
from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
                            Subscript, Superscript, TableAttr, TableAttrs, \
                            TableCell, TableRow, Text, Underline

import re

# Regular expressions.
# Each entry maps a pattern name to its regular expression source. The names are
# the keys used in the 'handlers' table of the Parser class and in the pattern
# name lists passed to parse_region_details.

syntax = {
    # Page regions:

    # {{{...
    "regionstart"   : r"((^\s*)([{]{3,}))",
    # }}}...
    "regionend"     : r"^\s*([}]{3,})",
    # #! char-excl-nl
    "header"        : r"#!(.*?)\n",

    # Region contents:
    # Line-oriented patterns:

    # blank line
    "break"         : r"^(\s*?)\n",
    # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
    # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
    # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
    # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
    # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
    # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
    # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
    # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",
    # ||
    "tablerow"      : r"^\|\|",

    # Region contents:
    # Inline patterns:

    "fontstyle"     : r"('{2,6})",
    "larger"        : r"~\+",
    "monospace"     : r"`",
    # ----...
    "rule"          : r"(-----*)",
    "smaller"       : r"~-",
    "sub"           : r",,",
    "super"         : r"\^",
    "underline"     : r"__",

    # Inline contents:

    "largerend"     : r"\+~",
    "monospaceend"  : r"`",
    "smallerend"    : r"-~",
    "subend"        : r",,",
    "superend"      : r"\^",
    "underlineend"  : r"__",

    # Heading contents:

    # ws... =... [ws...] nl
    "headingend"    : r"(\s+)(=+)(\s*\n)",

    # List contents:

    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    # next line
    "listitemend"   : r"^",

    # Table contents:

    "tableattrs"    : r"<",
    "tablecell"     : r"\|\|",
    # [ws...] next line
    "tableend"      : r"(\s*?)^",

    # Table attributes:

    "tableattrsend" : r">",
    "halign"        : r"([(:)])",
    "valign"        : r"([v^])",
    "colour"        : r"(\#[0-9A-F]{6})",
    "colspan"       : r"-(\d+)",
    "rowspan"       : r"\|(\d+)",
    "width"         : r"(\d+%)",
    # not-dash-or-digit dash-or-word-char...
    "attrname"      : r"((?![-\d])[-\w]+)",
    "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
    }

# Define pattern details.

# Patterns searched for while reading the contents of a table attribute list.

table_pattern_names = ["attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend", "valign", "width"]

# Patterns that open an inline span.

inline_pattern_names = ["fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline"]

def inline_patterns_for(name):

    """
    Return a copy of the inline pattern names with the given 'name' replaced by
    the name of its corresponding end pattern (the name suffixed with "end").

    A ValueError is raised by the list lookup if 'name' is not one of the inline
    pattern names.
    """

    names = inline_pattern_names[:]
    names[names.index(name)] = "%send" % name
    return names

# Define patterns for the regular expressions.
# All patterns are compiled with multiline semantics so that "^" matches at the
# start of each line.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



class Parser(ParserBase):

    "A wiki region parser."

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.

        The "wiki" region type is mapped to this parser by default. Entries in
        'formats' are merged over the defaults, so a supplied "wiki" entry
        replaces the default. The supplied mapping itself is not modified.
        """

        # Build the defaults under a separate name: rebinding 'formats' here
        # would discard the mapping supplied by the caller.

        default_formats = {"wiki" : self}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats)

    def get_items(self, s):

        "Return a sequence of token items for 's'."

        return TokenStream(s, patterns)

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.

        Return the top-level region populated from the text.
        """

        items = self.get_items(s)
        region = Region([])

        # Parse page header.

        self.parse_region_header(items, region)

        # Handle pages directly with this parser.
        # Otherwise, test the type and find an appropriate parser.

        if not region.type:
            self.parse_region_content(items, region)
        else:
            self.parse_region_type(items, region)

        return region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a wiki 'region'."

        new_block(region)

        # Top-level content may contain any inline pattern plus all of the
        # line-oriented features.

        self.parse_region_details(items, region, inline_pattern_names + [
            "break", "heading",
            "defterm", "defterm_empty",
            "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
            "listitem_roman",
            "regionstart", "regionend",
            "rule",
            "tablerow",
            ])

    # Parser methods supporting different page features.

    def parse_attrname(self, items, attrs):

        "Handle an attribute name within 'attrs'."

        name = items.read_match()
        attr = TableAttr(name)

        # Only accept a value if it follows the name immediately, with no
        # intervening text.

        preceding = items.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = items.read_match(1)
            attr.value = items.read_match(2)

        attrs.append(attr)

    def parse_break(self, items, region):

        "Handle a paragraph break within 'region'."

        region.add(Break())
        new_block(region)

    def parse_defitem(self, items, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed to
        the created item along with the padding from the current match.
        """

        pad = items.read_match(1)
        item = DefItem([], pad, extra)
        self.parse_region_details(items, item, ["listitemend"])
        region.add(item)
        new_block(region)

    def parse_defterm(self, items, region):

        "Handle a definition term within 'region'."

        pad = items.read_match(1)
        term = DefTerm([], pad)
        self.parse_region_details(items, term, ["deftermend", "deftermsep"])
        region.add(term)

        # A separator (as opposed to a term ending) introduces an item on the
        # same line.

        if items.matching == "deftermsep":
            self.parse_defitem(items, region)

    def parse_defterm_empty(self, items, region):

        "Handle an empty definition term within 'region'."

        extra = items.read_match(1)
        self.parse_region_details(items, region, ["deftermsep"])
        self.parse_defitem(items, region, extra)

    def parse_fontstyle(self, items, region):

        """
        Handle emphasis and strong styles.

        The matched marker is a run of two to six quote characters. Two quotes
        affect emphasis, three affect strong, five affect both, and four or six
        denote a style that is opened and immediately closed.

        StopIteration is raised when the marker completely closes the current
        FontStyle 'region'.
        """

        n = len(items.read_match(1))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Consume the quotes accounting for each style that is closed.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # The span is finished: hand back any unconsumed quotes and leave
            # the span.

            if not active:
                if n:
                    items.rewind(n)
                raise StopIteration

            # The span remains active and the marker has been fully consumed.

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        # A doubled marker yields an empty span: no content is parsed for it.

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(items, span, inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, items, attrs):

        """
        Handle horizontal alignment within 'attrs'. The marker "(" indicates
        left alignment, ")" indicates right alignment, and anything else matched
        by the pattern indicates centred alignment.
        """

        value = items.read_match()

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attrs.append(TableAttr("halign", alignment, True))

    def parse_heading(self, items, region):

        "Handle a heading."

        start_extra = items.read_match(1)
        level = len(items.read_match(2))
        start_pad = items.read_match(3)
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(items, heading, ["headingend"] + inline_pattern_names)
        region.add(heading)
        new_block(region)

    def parse_heading_end(self, items, heading):

        """
        Handle the end of a heading. StopIteration is raised only if the ending
        marker has the same level as the 'heading'.
        """

        level = len(items.read_match(2))
        if heading.level == level:
            heading.end_pad = items.read_match(1)
            heading.end_extra = items.read_match(3)
            raise StopIteration

    def parse_listitem(self, items, region):

        "Handle a list item marker within 'region'."

        indent = len(items.read_match(1))
        marker = items.read_match(2)
        space = items.read_match(3)
        item = ListItem([], indent, marker, space)
        self.parse_region_details(items, item, ["listitemend"])
        region.add(item)
        new_block(region)

    def parse_rule(self, items, region):

        "Handle a horizontal rule within 'region'."

        length = len(items.read_match(1))
        rule = Rule(length)
        region.add(rule)
        new_block(region)

    def parse_section(self, items, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(items.read_match(2))
        level = len(items.read_match(3))
        region.add(self.parse_region(items, level, indent))
        new_block(region)

    def parse_section_end(self, items, region):

        """
        Handle the end of a new section within 'region'. StopIteration is raised
        if the matched marker ends the region; otherwise, the marker is retained
        as plain text.
        """

        feature = items.read_match()
        if region.have_end(feature):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, items, cell):

        """
        Handle the start of table attributes within 'cell'. Valid attributes are
        stored on the cell; invalid attributes are serialised and added to the
        cell as plain text.
        """

        attrs = TableAttrs([])
        self.parse_region_details(items, attrs, table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, items, region):

        """
        Handle the start of a table row within 'region'. A complete row is added
        to the region; an incomplete row is converted back into plain text.
        """

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(items, cell, ["tableattrs", "tablecell", "tableend"])

            # Handle the end of the row.

            if items.matching == "tableend":
                trailing = items.read_match()

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell)))
                    region.append_inline(Text(trailing))

                    new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                    if not cell.empty():
                        row.append(cell)
                    break

            # A cell separator has been found.

            row.append(cell)

        region.add(row)
        new_block(region)

    def parse_valign(self, items, attrs):

        """
        Handle vertical alignment within 'attrs'. The marker "^" indicates top
        alignment and anything else matched by the pattern indicates bottom
        alignment.
        """

        value = items.read_match()

        if value == "^":
            alignment = "top"
        else:
            alignment = "bottom"

        attrs.append(TableAttr("valign", alignment, True))



    # Inline formatting handlers.

    def parse_inline(self, items, region, cls, pattern_name):

        """
        Handle an inline region, creating a span of the given 'cls' whose
        content ends at the end pattern corresponding to 'pattern_name', and
        adding the span to 'region'.
        """

        span = cls([])
        self.parse_region_details(items, span, inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, items, region):

        "Handle larger text within 'region'."

        self.parse_inline(items, region, Larger, "larger")

    def parse_monospace(self, items, region):

        "Handle monospaced text within 'region'."

        self.parse_inline(items, region, Monospace, "monospace")

    def parse_smaller(self, items, region):

        "Handle smaller text within 'region'."

        self.parse_inline(items, region, Smaller, "smaller")

    def parse_sub(self, items, region):

        "Handle subscript text within 'region'."

        self.parse_inline(items, region, Subscript, "sub")

    def parse_super(self, items, region):

        "Handle superscript text within 'region'."

        self.parse_inline(items, region, Superscript, "super")

    def parse_underline(self, items, region):

        "Handle underlined text within 'region'."

        self.parse_inline(items, region, Underline, "underline")



    # Table attribute handlers.

    def parse_table_attr(self, items, attrs, pattern_name):

        """
        Handle a table attribute, adding an attribute named 'pattern_name' to
        'attrs' with the matched text as its value.
        """

        value = items.read_match()
        attrs.append(TableAttr(pattern_name, value, True))

    def parse_colour(self, items, cell):

        "Handle a colour attribute."

        self.parse_table_attr(items, cell, "colour")

    def parse_colspan(self, items, cell):

        "Handle a column span attribute."

        self.parse_table_attr(items, cell, "colspan")

    def parse_rowspan(self, items, cell):

        "Handle a row span attribute."

        self.parse_table_attr(items, cell, "rowspan")

    def parse_width(self, items, cell):

        "Handle a width attribute."

        self.parse_table_attr(items, cell, "width")



    # Pattern handlers.
    # This table maps pattern names to the functions handling them. The None
    # entry is presumably used when no pattern matches before the input ends;
    # confirm against ParserBase.

    end_region = ParserBase.end_region

    handlers = {
        None : end_region,
        "attrname" : parse_attrname,
        "break" : parse_break,
        "colour" : parse_colour,
        "colspan" : parse_colspan,
        "defterm" : parse_defterm,
        "defterm_empty" : parse_defterm_empty,
        "deftermend" : end_region,
        "deftermsep" : end_region,
        "fontstyle" : parse_fontstyle,
        "halign" : parse_halign,
        "heading" : parse_heading,
        "headingend" : parse_heading_end,
        "larger" : parse_larger,
        "largerend" : end_region,
        "listitemend" : end_region,
        "listitem" : parse_listitem,
        "listitem_alpha" : parse_listitem,
        "listitem_dot" : parse_listitem,
        "listitem_num" : parse_listitem,
        "listitem_roman" : parse_listitem,
        "monospace" : parse_monospace,
        "monospaceend" : end_region,
        "regionstart" : parse_section,
        "regionend" : parse_section_end,
        "rowspan" : parse_rowspan,
        "rule" : parse_rule,
        "smaller" : parse_smaller,
        "smallerend" : end_region,
        "sub" : parse_sub,
        "subend" : end_region,
        "super" : parse_super,
        "superend" : end_region,
        "tableattrs" : parse_table_attrs,
        "tableattrsend" : end_region,
        "tablerow" : parse_table_row,
        "tablecell" : end_region,
        "tableend" : end_region,
        "underline" : parse_underline,
        "underlineend" : end_region,
        "valign" : parse_valign,
        "width" : parse_width,
        }



# Top-level functions.

def parse(s, formats=None):

    """
    Parse the page text 's' using a new parser initialised with any given
    'formats' mapping, returning the resulting region.
    """

    return Parser(formats).parse(s)

# vim: tabstop=4 expandtab shiftwidth=4