#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
                            Subscript, Superscript, TableCell, TableRow, Text, \
                            Underline
import re

# Regular expressions.
# Each entry is compiled below with re.UNICODE | re.MULTILINE, so "^" matches
# at the start of the text and immediately after each newline.

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",     # {{{...
    "regionend"     : r"^\s*([}]{3,})",         # }}}...
    "header"        : r"#!(.*?)\n",             # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:

    # blank line
    "break"         : r"^(\s*?)\n",
    # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
    # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
    # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
    # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
    # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
    # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
    # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
    # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",
    # ||
    "tablerow"      : r"^\|\|",

    # Region contents:
    # Inline patterns:
    "fontstyle"     : r"('{2,6})",
    "larger"        : r"~\+",
    "monospace"     : r"`",
    "rule"          : r"(-----*)",              # ----...
    "smaller"       : r"~-",
    "sub"           : r",,",
    "super"         : r"\^",
    "underline"     : r"__",

    # Inline contents:
    "largerend"     : r"\+~",
    "monospaceend"  : r"`",
    "smallerend"    : r"-~",
    "subend"        : r",,",
    "superend"      : r"\^",
    "underlineend"  : r"__",

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",      # ws... =... [ws...] nl

    # List contents:
    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    "listitemend"   : r"^",                     # next line

    # Table contents:
    "tablecell"     : r"\|\|",
    "tableend"      : r"(\s*?)^",               # [ws...] next line
    }

# Define inline pattern details.

inline_pattern_names = ["fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline"]

def inline_patterns_for(name):

    """
    Return a copy of the inline pattern names in which 'name' is replaced by
    the name of its closing pattern ('name' followed by "end"). A ValueError
    is raised by the list lookup if 'name' is not an inline pattern name.
    """

    names = inline_pattern_names[:]
    names[names.index(name)] = "%send" % name
    return names

# Define patterns for the regular expressions.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream for the string 's', starting at position 0."

        self.s = s
        self.pos = 0

        # The most recent match object and the name of the pattern producing
        # it, both None when the most recent search found nothing.

        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length'."

        # Never move before the start of the string.

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        Where a match is found, the position is left unchanged until
        'read_match' is called. Where the remaining text is returned, it is
        consumed: the position moves to the end of the string so that the same
        text cannot be returned again to an enclosing parsing loop.
        """

        first = None

        # Discard the result of any previous search so that a failed search
        # cannot leave a stale match for 'read_match' to act upon.

        self.matching = None
        self.match = None

        # Find the first matching pattern. Where several patterns match at
        # the same position, the earliest name in 'pattern_names' wins.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is not None:
            return self.s[self.pos:first]

        if not remaining:
            return None

        # Nothing matched: hand over the rest of the input exactly once.

        text = self.s[self.pos:]
        self.pos = len(self.s)
        return text

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the pattern does not define 'group'.
        Without a current match, the position moves to the end of the string
        and None is returned.
        """

        if self.match:
            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region obtained from 'parse_region'.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body. The region itself decides, via 'is_transparent',
    # whether its content is wiki markup or opaque text.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # A header only counts if it starts exactly at the current position,
    # meaning that no text precedes the match. None means no header.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, inline_pattern_names + [
        "break", "heading",
        "defterm", "defterm_empty",
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        "regionstart", "regionend",
        "rule",
        "tablerow",
        ])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Only the end of the region is significant within opaque content.

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text between features is appended to 'region' and each feature is passed
    to its handler. Parsing stops at the end of the input or when a handler
    raises StopIteration, after which 'region' is normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_inline(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_inline(Text(feature))

    # Handlers raise StopIteration to signal the end of this region.

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    region.add(Break())
    new_block(region)

def parse_defitem(items, region, extra=""):

    """
    Handle a definition item within 'region'. The padding is taken from group
    1 of the current match and 'extra' is passed on to the item.
    """

    pad = items.read_match(1)
    item = DefItem([], pad, extra)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_defterm(items, region):

    "Handle a definition term within 'region'."

    pad = items.read_match(1)
    term = DefTerm([], pad)
    parse_region_details(items, term, ["deftermend", "deftermsep"])
    region.add(term)

    # A separator followed by whitespace introduces the definition itself.

    if items.matching == "deftermsep":
        parse_defitem(items, region)

def parse_defterm_empty(items, region):

    "Handle an empty definition term within 'region'."

    extra = items.read_match(1)

    # Consume the separator, which the pattern's lookahead has guaranteed.

    parse_region_details(items, region, ["deftermsep"])
    parse_defitem(items, region, extra)

def parse_fontstyle(items, region):

    "Handle emphasis and strong styles."

    # The number of quote characters determines the styles involved.

    n = len(items.read_match(1))

    # Handle endings.

    if isinstance(region, FontStyle):
        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        active = True

        if region.emphasis and emphasis:
            active = region.close_emphasis()
            n -= 2
        if region.strong and strong:
            active = region.close_strong()
            n -= 3

        # With the span no longer active, return any unused quotes to the
        # stream for the enclosing region and end this span.

        if not active:
            if n:
                items.rewind(n)
            raise StopIteration

        # With all quotes used up, there is no new style to open.

        elif not n:
            return

    # Handle new styles.

    emphasis = n in (2, 4, 5)
    strong = n in (3, 5, 6)
    double = n in (4, 6)

    span = FontStyle([], emphasis, strong)

    # Four or six quotes open and close immediately, leaving an empty span.

    if not double:
        parse_region_details(items, span, inline_pattern_names)
    region.append_inline(span)

def parse_heading(items, region):

    "Handle a heading."

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"] + inline_pattern_names)
    region.add(heading)
    new_block(region)

def parse_heading_end(items, heading):

    "Handle the end of a heading."

    # Only a marker of the same level as the heading's start terminates it.

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_rule(items, region):

    "Handle a horizontal rule within 'region'."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.add(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.add(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    "Handle the end of a new section within 'region'."

    # A marker not ending this region is retained as ordinary text.

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_inline(Text(feature))

def parse_table_row(items, region):

    """
    Handle the start of a table row within 'region'.

    Cells are collected until the row ends at a line boundary or at the end of
    the input. A row with no completed cells, or with content after the final
    cell separator, is added to 'region' as plain text instead.
    """

    row = TableRow([])

    while True:
        cell = TableCell([])
        parse_region_details(items, cell, ["tablecell", "tableend"])

        # Another separator completes the cell: keep it and continue.

        if items.matching == "tablecell":
            row.append(cell)
            continue

        # Handle the end of the row. Without any match, the input has been
        # exhausted, which also ends the row and prevents further cells from
        # being requested indefinitely.

        if items.matching == "tableend":
            trailing = items.read_match()
        else:
            trailing = ""

        # If the cell was started but not finished, convert the row into text.

        if not row.nodes or not cell.empty():
            region.append_inline(Text("||"))

            # Convert all cells.

            for node in row.nodes:
                region.append_inline_many(node.nodes)
                region.append_inline(Text("||"))

            region.append_inline_many(cell.nodes)
            region.append_inline(Text(trailing))

            new_block(region)
            return

        # The final cell is necessarily empty here and is not retained.

        row.trailing = trailing
        break

    region.add(row)
    new_block(region)

# Inline formatting handlers.

def parse_inline(items, region, cls, pattern_name):

    """
    Handle an inline region, creating a 'cls' span populated until the end
    pattern corresponding to 'pattern_name' and appending it to 'region'.
    """

    span = cls([])
    parse_region_details(items, span, inline_patterns_for(pattern_name))
    region.append_inline(span)

def parse_larger(items, region):

    "Handle larger text within 'region'."

    parse_inline(items, region, Larger, "larger")

def parse_monospace(items, region):

    "Handle monospaced text within 'region'."

    parse_inline(items, region, Monospace, "monospace")

def parse_smaller(items, region):

    "Handle smaller text within 'region'."

    parse_inline(items, region, Smaller, "smaller")

def parse_sub(items, region):

    "Handle subscript text within 'region'."

    parse_inline(items, region, Subscript, "sub")

def parse_super(items, region):

    "Handle superscript text within 'region'."

    parse_inline(items, region, Superscript, "super")

def parse_underline(items, region):

    "Handle underlined text within 'region'."

    parse_inline(items, region, Underline, "underline")

# Pattern handlers.

# Mapping from the name of a matched pattern to the function handling it.
# Each handler is called with the token stream and the region being populated.

handlers = {

    # No pattern.
    None                : end_region,

    # Block-level features.
    "break"             : parse_break,
    "defterm"           : parse_defterm,
    "defterm_empty"     : parse_defterm_empty,
    "heading"           : parse_heading,
    "listitem"          : parse_listitem,
    "listitem_alpha"    : parse_listitem,
    "listitem_dot"      : parse_listitem,
    "listitem_num"      : parse_listitem,
    "listitem_roman"    : parse_listitem,
    "regionstart"       : parse_section,
    "rule"              : parse_rule,
    "tablerow"          : parse_table_row,

    # Inline features.
    "fontstyle"         : parse_fontstyle,
    "larger"            : parse_larger,
    "monospace"         : parse_monospace,
    "smaller"           : parse_smaller,
    "sub"               : parse_sub,
    "super"             : parse_super,
    "underline"         : parse_underline,

    # Endings needing inspection before the enclosing node is finished.
    "headingend"        : parse_heading_end,
    "regionend"         : parse_section_end,

    # Endings finishing the enclosing node unconditionally.
    "deftermend"        : end_region,
    "deftermsep"        : end_region,
    "largerend"         : end_region,
    "listitemend"       : end_region,
    "monospaceend"      : end_region,
    "smallerend"        : end_region,
    "subend"            : end_region,
    "superend"          : end_region,
    "tablecell"         : end_region,
    "tableend"          : end_region,
    "underlineend"      : end_region,
    }

def new_block(region):

    "Add a new, empty block to 'region'."

    region.add(Block([]))



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4