#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsing import ParserBase, TokenStream, get_patterns, new_block
from moinformat.serialisers import serialise
from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
                            Subscript, Superscript, TableAttr, TableAttrs, \
                            TableCell, TableRow, Text, Underline

# Regular expressions.
# Each entry maps a pattern name to regular expression source text. The names
# are used as keys of Parser.handlers and in the pattern name lists below.

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",                         # {{{...
    "regionend"     : r"^\s*([}]{3,})",                             # }}}...
    "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:
                      # blank line
    "break"         : r"^(\s*?)\n",
                      # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
                      # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
                      # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
                      # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
                      # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
                      # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
                      # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
                      # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",
                      # ||
    "tablerow"      : r"^\|\|",

    # Region contents:
    # Inline patterns:
    "fontstyle"     : r"('{2,6})",
    "larger"        : r"~\+",
    "monospace"     : r"`",
    "rule"          : r"(-----*)",                                  # ----...
    "smaller"       : r"~-",
    "sub"           : r",,",
    "super"         : r"\^",
    "underline"     : r"__",

    # Inline contents:
    "largerend"     : r"\+~",
    "monospaceend"  : r"`",
    "smallerend"    : r"-~",
    "subend"        : r",,",
    "superend"      : r"\^",
    "underlineend"  : r"__",

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",                          # ws... =... [ws...] nl

    # List contents:
    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    "listitemend"   : r"^",                                         # next line

    # Table contents:
    "tableattrs"    : r"<",
    "tablecell"     : r"\|\|",
    "tableend"      : r"(\s*?)^",                                   # [ws...] next line

    # Table attributes:
    "tableattrsend" : r">",
    "halign"        : r"([(:)])",
    "valign"        : r"([v^])",
    "colour"        : r"(\#[0-9A-F]{6})",
    "colspan"       : r"-(\d+)",
    "rowspan"       : r"\|(\d+)",
    "width"         : r"(\d+%)",
    "attrname"      : r"((?![-\d])[-\w]+)",                         # not-dash-or-digit dash-or-word-char...
    "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
    }



class Parser(ParserBase):

    "A wiki region parser."

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.

        This parser is registered for the "wiki" region type by default; an
        entry for "wiki" in 'formats' replaces that default.
        """

        # Build the defaults under a separate name: rebinding 'formats' here
        # would discard the mapping supplied by the caller.

        default_formats = {"wiki" : self}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats)

    # Pattern details.

    patterns = get_patterns(syntax)

    # Names of the patterns applied when parsing table attributes.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    # Names of the patterns that start inline formatting.

    inline_pattern_names = [
        "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
        ]

    # Names of the patterns applied to general region content: the inline
    # patterns plus the line-oriented and region delimiter patterns.

    region_pattern_names = inline_pattern_names + [
        "break", "heading", "defterm", "defterm_empty", "listitem",
        "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman",
        "regionstart", "regionend", "rule", "tablerow",
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names in which 'name' has been
        replaced by the name of its end pattern (for example, "larger" by
        "largerend"). A ValueError is raised by the list lookup if 'name' is
        not an inline pattern name.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names

    # Principal parser methods.

    def get_items(self, s):

        "Return a sequence of token items for 's'."

        return TokenStream(s, self.patterns)

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region populated from the text.
        """

        items = self.get_items(s)
        region = Region([])

        # Parse page header.

        self.parse_region_header(items, region)

        # Handle pages directly with this parser.
        # Otherwise, test the type and find an appropriate parser.

        if not region.type:
            self.parse_region_content(items, region)
        else:
            self.parse_region_type(items, region)

        return region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a wiki 'region'."

        new_block(region)
        self.parse_region_details(items, region, self.region_pattern_names)

    # Parser methods supporting different page features.

    def parse_attrname(self, items, attrs):

        "Handle an attribute name within 'attrs'."

        name = items.read_match()
        attr = TableAttr(name)

        # Only accept a value if it immediately follows the name, this being
        # indicated by no text having been read before the value pattern.

        preceding = items.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = items.read_match(1)
            attr.value = items.read_match(2)

        attrs.append(attr)

    def parse_break(self, items, region):

        "Handle a paragraph break within 'region'."

        region.add(Break())
        new_block(region)

    def parse_defitem(self, items, region, extra=""):

        "Handle a definition item within 'region'."

        pad = items.read_match(1)
        item = DefItem([], pad, extra)
        self.parse_region_details(items, item, ["listitemend"])
        region.add(item)
        new_block(region)

    def parse_defterm(self, items, region):

        "Handle a definition term within 'region'."

        pad = items.read_match(1)
        term = DefTerm([], pad)
        self.parse_region_details(items, term, ["deftermend", "deftermsep"])
        region.add(term)

        # A separator (as opposed to a term ending) introduces a definition
        # item.

        if items.matching == "deftermsep":
            self.parse_defitem(items, region)

    def parse_defterm_empty(self, items, region):

        "Handle an empty definition term within 'region'."

        extra = items.read_match(1)
        self.parse_region_details(items, region, ["deftermsep"])
        self.parse_defitem(items, region, extra)

    def parse_fontstyle(self, items, region):

        "Handle emphasis and strong styles."

        # The number of quote characters determines the styles involved.

        n = len(items.read_match(1))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Consume the quotes needed to close each open style.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # Where the span is no longer active, hand back any remaining
            # quotes and end the span.

            if not active:
                if n:
                    items.rewind(n)
                raise StopIteration

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        # Doubled markers (four or six quotes) yield a span without content.

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(items, span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, items, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = items.read_match()

        # "(" selects left, ")" selects right, anything else selects center.

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attr = TableAttr("halign", alignment, True)
        attrs.append(attr)

    def parse_heading(self, items, region):

        "Handle a heading."

        start_extra = items.read_match(1)
        level = len(items.read_match(2))
        start_pad = items.read_match(3)
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(items, heading, ["headingend"] + self.inline_pattern_names)
        region.add(heading)
        new_block(region)

    def parse_heading_end(self, items, heading):

        "Handle the end of a heading."

        # Only a marker of the same level as the heading ends the heading.

        level = len(items.read_match(2))
        if heading.level == level:
            heading.end_pad = items.read_match(1)
            heading.end_extra = items.read_match(3)
            raise StopIteration

    def parse_listitem(self, items, region):

        "Handle a list item marker within 'region'."

        indent = len(items.read_match(1))
        marker = items.read_match(2)
        space = items.read_match(3)
        item = ListItem([], indent, marker, space)
        self.parse_region_details(items, item, ["listitemend"])
        region.add(item)
        new_block(region)

    def parse_rule(self, items, region):

        "Handle a horizontal rule within 'region'."

        length = len(items.read_match(1))
        rule = Rule(length)
        region.add(rule)
        new_block(region)

    def parse_section(self, items, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(items.read_match(2))
        level = len(items.read_match(3))
        region.add(self.parse_region(items, level, indent))
        new_block(region)

    def parse_section_end(self, items, region):

        "Handle the end of a new section within 'region'."

        # A marker not ending this region is retained as plain text.

        feature = items.read_match()
        if region.have_end(feature):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, items, cell):

        "Handle the start of table attributes within 'cell'."

        attrs = TableAttrs([])
        self.parse_region_details(items, attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, items, region):

        "Handle the start of a table row within 'region'."

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(items, cell, ["tableattrs", "tablecell", "tableend"])

            # Handle the end of the row.

            if items.matching == "tableend":
                trailing = items.read_match()

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell)))
                    region.append_inline(Text(trailing))

                    new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                # NOTE(review): this point is only reached when the test above
                # found the cell to be empty, so this append appears never to
                # occur; retained to keep the behaviour unchanged.

                if not cell.empty():
                    row.append(cell)
                break

            # A cell separator has been found.

            row.append(cell)

        region.add(row)
        new_block(region)

    def parse_valign(self, items, attrs):

        "Handle vertical alignment within 'attrs'."

        value = items.read_match()

        # "^" selects top, anything else selects bottom.

        if value == "^":
            alignment = "top"
        else:
            alignment = "bottom"

        attr = TableAttr("valign", alignment, True)
        attrs.append(attr)



    # Inline formatting handlers.

    def parse_inline(self, items, region, cls, pattern_name):

        """
        Handle an inline region, creating a 'cls' node populated using the
        inline patterns appropriate for 'pattern_name' and appending it to
        'region'.
        """

        span = cls([])
        self.parse_region_details(items, span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, items, region):

        "Handle larger text within 'region'."

        self.parse_inline(items, region, Larger, "larger")

    def parse_monospace(self, items, region):

        "Handle monospaced text within 'region'."

        self.parse_inline(items, region, Monospace, "monospace")

    def parse_smaller(self, items, region):

        "Handle smaller text within 'region'."

        self.parse_inline(items, region, Smaller, "smaller")

    def parse_sub(self, items, region):

        "Handle subscript text within 'region'."

        self.parse_inline(items, region, Subscript, "sub")

    def parse_super(self, items, region):

        "Handle superscript text within 'region'."

        self.parse_inline(items, region, Superscript, "super")

    def parse_underline(self, items, region):

        "Handle underlined text within 'region'."

        self.parse_inline(items, region, Underline, "underline")



    # Table attribute handlers.
    # NOTE(review): the second parameter of the specific handlers is named
    # 'cell' but is passed on as the 'attrs' collection to be appended to;
    # presumably these are invoked with the attributes collection - confirm
    # against ParserBase.

    def parse_table_attr(self, items, attrs, pattern_name):

        """
        Handle a table attribute, appending an attribute named 'pattern_name'
        with the matched text as its value to 'attrs'.
        """

        value = items.read_match()
        attrs.append(TableAttr(pattern_name, value, True))

    def parse_colour(self, items, cell):

        "Handle a colour attribute."

        self.parse_table_attr(items, cell, "colour")

    def parse_colspan(self, items, cell):

        "Handle a column span attribute."

        self.parse_table_attr(items, cell, "colspan")

    def parse_rowspan(self, items, cell):

        "Handle a row span attribute."

        self.parse_table_attr(items, cell, "rowspan")

    def parse_width(self, items, cell):

        "Handle a width attribute."

        self.parse_table_attr(items, cell, "width")



    # Pattern handlers.
    # This maps pattern names (and None) to the functions handling them.

    end_region = ParserBase.end_region

    handlers = {
        None : end_region,
        "attrname" : parse_attrname,
        "break" : parse_break,
        "colour" : parse_colour,
        "colspan" : parse_colspan,
        "defterm" : parse_defterm,
        "defterm_empty" : parse_defterm_empty,
        "deftermend" : end_region,
        "deftermsep" : end_region,
        "fontstyle" : parse_fontstyle,
        "halign" : parse_halign,
        "heading" : parse_heading,
        "headingend" : parse_heading_end,
        "larger" : parse_larger,
        "largerend" : end_region,
        "listitemend" : end_region,
        "listitem" : parse_listitem,
        "listitem_alpha" : parse_listitem,
        "listitem_dot" : parse_listitem,
        "listitem_num" : parse_listitem,
        "listitem_roman" : parse_listitem,
        "monospace" : parse_monospace,
        "monospaceend" : end_region,
        "regionstart" : parse_section,
        "regionend" : parse_section_end,
        "rowspan" : parse_rowspan,
        "rule" : parse_rule,
        "smaller" : parse_smaller,
        "smallerend" : end_region,
        "sub" : parse_sub,
        "subend" : end_region,
        "super" : parse_super,
        "superend" : end_region,
        "tableattrs" : parse_table_attrs,
        "tableattrsend" : end_region,
        "tablerow" : parse_table_row,
        "tablecell" : end_region,
        "tableend" : end_region,
        "underline" : parse_underline,
        "underlineend" : end_region,
        "valign" : parse_valign,
        "width" : parse_width,
        }



# Top-level functions.

def parse(s, formats=None):

    """
    Parse the page text 's' using a new Parser initialised with any given
    'formats' mapping, returning the result of the parser's parse method.
    """

    parser = Parser(formats)
    return parser.parse(s)

# vim: tabstop=4 expandtab shiftwidth=4