#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \
                            ListItem, Monospace, Region, Rule, Subscript, \
                            Superscript, Text, Underline
import re

# Regular expressions.
#
# Each entry maps a pattern name to its regular expression source. The names
# are also the keys of the 'handlers' table defined near the end of the module.
# By convention, group 1 holds the pertinent matched data (see
# TokenStream.read_match); patterns without groups yield an empty string.

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",     # {{{...
    "regionend"     : r"^\s*([}]{3,})",         # }}}...
    "header"        : r"#!(.*?)\n",             # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:
    # blank line
    "break"         : r"^(\s*?)\n",
    # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
    # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
    # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
    # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
    # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
    # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
    # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
    # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",

    # Region contents:
    # Inline patterns:
    "fontstyle"     : r"('{2,6})",
    "monospace"     : r"`",
    "rule"          : r"(-----*)",              # ----...
    "sub"           : r",,",
    "super"         : r"\^",
    "underline"     : r"__",

    # Inline contents:
    "monospaceend"  : r"`",
    "subend"        : r",,",
    "superend"      : r"\^",
    "underlineend"  : r"__",

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",      # ws... =... [ws...] nl

    # List contents:
    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    "listitemend"   : r"^",                     # next line
    }

# Define patterns for the regular expressions.
# All patterns are compiled once, with multi-line anchoring so that "^" matches
# at the start of every line.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)

# Names of the patterns that open inline spans.

inline_pattern_names = ["fontstyle", "monospace", "sub", "super", "underline"]

def inline_patterns_for(name):

    """
    Return a copy of the inline pattern names in which 'name' is replaced by
    its corresponding end pattern (the same name with an "end" suffix), for use
    when parsing the contents of an inline span opened by 'name'.

    A ValueError is raised if 'name' is not one of the inline pattern names.
    """

    names = inline_pattern_names[:]
    names[names.index(name)] = "%send" % name
    return names



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream to read from the start of the string 's'."

        self.s = s

        # Current reading position within the string.
        self.pos = 0

        # The match object and pattern name found by the latest read_until
        # call, or None if that call found nothing.
        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length'."

        # Never rewind beyond the start of the string.

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The stream position is not changed by this method: read_match is used
        to advance beyond a match (or to the end of the input).
        """

        first = None
        self.matching = None

        # Forget any earlier match so that a failed search cannot leave a
        # stale match behind for read_match to act upon.

        self.match = None

        # Find the first matching pattern. Where several patterns match at the
        # same position, the one given earliest in 'pattern_names' is chosen.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group. Where there
        is no current match, the position is moved to the end of the input and
        None is returned.
        """

        if self.match:
            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.
#
# Handlers signal the end of the region being populated by raising
# StopIteration, which is caught by parse_region_details.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # A header is only recognised immediately at the current position, this
    # being indicated by empty preceding text. None means no header at all.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, inline_pattern_names + [
        "break", "heading",
        "defterm", "defterm_empty",
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        "regionstart", "regionend",
        "rule",
        ])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Only the end of the region is of interest: everything else is text.

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text found between features is appended to 'region' as inline text, and
    each feature is passed to its handler. Parsing stops at the end of the
    input or when a handler raises StopIteration. The region is normalised
    before returning.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_inline(Text(preceding))

            # End of input. Advance the stream to the end so that the text
            # just added is not read again, and duplicated, by the enclosing
            # regions when they resume their own parsing.

            if not items.matching:
                items.read_match()
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_inline(Text(feature))

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    region.add(Break())
    new_block(region)

def parse_defitem(items, region, extra=""):

    """
    Handle a definition item within 'region'. The current match provides the
    padding before the item, and 'extra' is passed on to the item.
    """

    pad = items.read_match(1)
    item = DefItem([], pad, extra)

    # The item extends until the start of the next line.

    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_defterm(items, region):

    "Handle a definition term within 'region'."

    pad = items.read_match(1)
    term = DefTerm([], pad)
    parse_region_details(items, term, ["deftermend", "deftermsep"])
    region.add(term)

    # A separator, as opposed to the end of the line, introduces an item.

    if items.matching == "deftermsep":
        parse_defitem(items, region)

def parse_defterm_empty(items, region):

    "Handle an empty definition term within 'region'."

    extra = items.read_match(1)

    # Advance to the separator whose match provides the item's padding.

    parse_region_details(items, region, ["deftermsep"])
    parse_defitem(items, region, extra)

def parse_fontstyle(items, region):

    """
    Handle emphasis and strong styles, with the number of quote characters in
    the current match selecting the styles involved.
    """

    n = len(items.read_match(1))

    # Handle endings.

    if isinstance(region, FontStyle):
        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        active = True

        # Close the styles in effect, accounting for the quotes employed.
        # NOTE(review): 'active' presumably indicates whether the span retains
        # any style after closing - confirm against FontStyle.

        if region.emphasis and emphasis:
            active = region.close_emphasis()
            n -= 2
        if region.strong and strong:
            active = region.close_strong()
            n -= 3

        # End the span, returning any unused quotes to the stream.

        if not active:
            if n:
                items.rewind(n)
            raise StopIteration

        # Continue within the span if all of the quotes were used.

        elif not n:
            return

    # Handle new styles.

    emphasis = n in (2, 4, 5)
    strong = n in (3, 5, 6)
    double = n in (4, 6)

    span = FontStyle([], emphasis, strong)

    # Four or six quotes produce a span that is left without any content.

    if not double:
        parse_region_details(items, span, inline_pattern_names)
    region.append_inline(span)

def parse_heading(items, region):

    "Handle a heading."

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"] + inline_pattern_names)
    region.add(heading)
    new_block(region)

def parse_heading_end(items, heading):

    "Handle the end of a heading."

    level = len(items.read_match(2))

    # Only an end marker of the same level as the heading ends the heading.

    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)

    # The item extends until the start of the next line.

    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_monospace(items, region):

    "Handle monospace."

    span = Monospace([])
    parse_region_details(items, span, inline_patterns_for("monospace"))
    region.append_inline(span)

def parse_rule(items, region):

    "Handle a horizontal rule within 'region'."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.add(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.add(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    "Handle the end of a new section within 'region'."

    feature = items.read_match()

    # A marker not ending this region is retained as text.

    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_inline(Text(feature))

def parse_sub(items, region):

    "Handle subscript."

    span = Subscript([])
    parse_region_details(items, span, inline_patterns_for("sub"))
    region.append_inline(span)

def parse_super(items, region):

    "Handle superscript."

    span = Superscript([])
    parse_region_details(items, span, inline_patterns_for("super"))
    region.append_inline(span)

def parse_underline(items, region):

    "Handle underline."

    span = Underline([])
    parse_region_details(items, span, inline_patterns_for("underline"))
    region.append_inline(span)

# Pattern handlers.
# Each handler is called with the token stream and the region being populated.

handlers = {
    None                : end_region,
    "break"             : parse_break,
    "defterm"           : parse_defterm,
    "defterm_empty"     : parse_defterm_empty,
    "deftermend"        : end_region,
    "deftermsep"        : end_region,
    "fontstyle"         : parse_fontstyle,
    "heading"           : parse_heading,
    "headingend"        : parse_heading_end,
    "listitemend"       : end_region,
    "listitem"          : parse_listitem,
    "listitem_alpha"    : parse_listitem,
    "listitem_dot"      : parse_listitem,
    "listitem_num"      : parse_listitem,
    "listitem_roman"    : parse_listitem,
    "monospace"         : parse_monospace,
    "monospaceend"      : end_region,
    "regionstart"       : parse_section,
    "regionend"         : parse_section_end,
    "rule"              : parse_rule,
    "sub"               : parse_sub,
    "subend"            : end_region,
    "super"             : parse_super,
    "superend"          : end_region,
    "underline"         : parse_underline,
    "underlineend"      : end_region,
    }

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.add(block)



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4