#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
                            Subscript, Superscript, Text, Underline
import re

# Regular expressions.
#
# Each entry maps a pattern name to regular expression source. The names are
# also the keys of the 'handlers' table defined towards the end of this module.
# By convention, group 1 of a pattern holds the text returned by a plain
# read_match() call, with further groups exposing parts of the match.

syntax = {
    # Page regions:

    # {{{...
    "regionstart"   : r"((^\s*)([{]{3,}))",
    # }}}...
    "regionend"     : r"^\s*([}]{3,})",
    # #! char-excl-nl
    "header"        : r"#!(.*?)\n",

    # Region contents:
    # Line-oriented patterns:

    # blank line
    "break"         : r"^(\s*?)\n",
    # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
    # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
    # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
    # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
    # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
    # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
    # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
    # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",

    # Region contents:
    # Inline patterns:

    "fontstyle"     : r"('{2,6})",
    "larger"        : r"~\+",
    "monospace"     : r"`",
    # ----...
    "rule"          : r"(-----*)",
    "smaller"       : r"~-",
    "sub"           : r",,",
    "super"         : r"\^",
    "underline"     : r"__",

    # Inline contents:

    "largerend"     : r"\+~",
    "monospaceend"  : r"`",
    "smallerend"    : r"-~",
    "subend"        : r",,",
    "superend"      : r"\^",
    "underlineend"  : r"__",

    # Heading contents:

    # ws... =... [ws...] nl
    "headingend"    : r"(\s+)(=+)(\s*\n)",

    # List contents:

    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    # next line
    "listitemend"   : r"^",
    }

# Define inline pattern details.

inline_pattern_names = ["fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline"]

def inline_patterns_for(name):

    """
    Return a copy of the inline pattern names in which 'name' has been replaced
    by the name of its ending pattern (the same name with "end" appended). The
    result is the set of patterns to search for inside an inline region opened
    by 'name'. ValueError is raised if 'name' is not an inline pattern name.
    """

    names = inline_pattern_names[:]
    names[names.index(name)] = "%send" % name
    return names

# Define patterns for the regular expressions.
# MULTILINE is required because many patterns anchor on "^" at line starts.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream for the string 's', positioned at its start."

        self.s = s
        self.pos = 0

        # The most recent match object and the name of the pattern yielding it.

        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The stream position is not changed: read_match must be called to
        consume the match. Where several patterns match at the same position,
        the pattern appearing earliest in 'pattern_names' is chosen.
        """

        first = None

        # Discard any previous result so that a failed search cannot leave a
        # stale match behind for a later read_match call to consume.

        self.matching = None
        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the pattern does not define 'group'.
        Without a current match, the position is moved to the end of the string
        and None is returned.
        """

        if self.match:
            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region populated from the text.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    A header is only recognised where it occurs immediately at the current
    position in the stream.
    """

    # An empty string indicates a header with no preceding text, whereas None
    # means no header at all.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, inline_pattern_names + [
        "break", "heading",
        "defterm", "defterm_empty",
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        "regionstart", "regionend",
        "rule",
        ])

def parse_region_opaque(items, region):

    """
    Parse the data provided by 'items' to populate an opaque 'region'. Only
    region end markers are searched for, with all other content kept as text.
    """

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'. Parsing
    continues until the input is exhausted or until a handler raises
    StopIteration to indicate the end of 'region', after which the region is
    normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_inline(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_inline(Text(feature))

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region' by raising StopIteration."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    region.add(Break())
    new_block(region)

def parse_defitem(items, region, extra=""):

    """
    Handle a definition item within 'region'. The current match provides the
    padding preceding the item, with 'extra' supplying any additional text
    recorded with the item.
    """

    pad = items.read_match(1)
    item = DefItem([], pad, extra)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_defterm(items, region):

    "Handle a definition term within 'region'."

    pad = items.read_match(1)
    term = DefTerm([], pad)
    parse_region_details(items, term, ["deftermend", "deftermsep"])
    region.add(term)

    # A separator introduces a definition item on the same line as the term.

    if items.matching == "deftermsep":
        parse_defitem(items, region)

def parse_defterm_empty(items, region):

    "Handle an empty definition term within 'region'."

    extra = items.read_match(1)

    # The pattern for an empty term guarantees that a separator follows.

    parse_region_details(items, region, ["deftermsep"])
    parse_defitem(items, region, extra)

def parse_fontstyle(items, region):

    """
    Handle emphasis and strong styles. The number of quote characters matched
    selects the styles involved: 2, 4 or 5 affect emphasis; 3, 5 or 6 affect
    strong. StopIteration is raised to end 'region' where it is a font style
    span that the marker closes.
    """

    n = len(items.read_match(1))

    # Handle endings.

    if isinstance(region, FontStyle):
        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        active = True

        # Deduct the quote characters used to close each open style.

        if region.emphasis and emphasis:
            active = region.close_emphasis()
            n -= 2
        if region.strong and strong:
            active = region.close_strong()
            n -= 3

        # Leave the span, returning any unused quote characters to the stream
        # for the enclosing region to interpret.

        if not active:
            if n:
                items.rewind(n)
            raise StopIteration

        elif not n:
            return

    # Handle new styles.

    emphasis = n in (2, 4, 5)
    strong = n in (3, 5, 6)
    double = n in (4, 6)

    # Doubled markers open and close a style at once, producing an empty span.

    span = FontStyle([], emphasis, strong)
    if not double:
        parse_region_details(items, span, inline_pattern_names)
    region.append_inline(span)

def parse_heading(items, region):

    "Handle a heading within 'region'."

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"] + inline_pattern_names)
    region.add(heading)
    new_block(region)

def parse_heading_end(items, heading):

    """
    Handle the end of a heading. StopIteration is raised to end 'heading' where
    the marker has the same level as the heading. Otherwise, the marker is
    retained as text within the heading.
    """

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

    # A marker of a different level does not end this heading. Keep the whole
    # matched text instead of silently discarding it, in the same way that
    # unmatched section end markers are kept.

    heading.append_inline(Text(items.read_match(0)))

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_rule(items, region):

    "Handle a horizontal rule within 'region'."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.add(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.
    # The match details must be read before parsing the section since doing so
    # replaces the current match.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.add(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region'. StopIteration is raised to
    end 'region' where the marker is accepted by the region as its end.
    Otherwise, the marker is retained as text.
    """

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_inline(Text(feature))

# Inline formatting handlers.

def parse_inline(items, region, cls, pattern_name):

    """
    Handle an inline region within 'region', creating a span of the given 'cls'
    whose contents are parsed until the ending pattern for 'pattern_name' is
    found.
    """

    span = cls([])
    parse_region_details(items, span, inline_patterns_for(pattern_name))
    region.append_inline(span)

def parse_larger(items, region):

    "Handle larger text within 'region'."

    parse_inline(items, region, Larger, "larger")

def parse_monospace(items, region):

    "Handle monospaced text within 'region'."

    parse_inline(items, region, Monospace, "monospace")

def parse_smaller(items, region):

    "Handle smaller text within 'region'."

    parse_inline(items, region, Smaller, "smaller")

def parse_sub(items, region):

    "Handle subscript text within 'region'."

    parse_inline(items, region, Subscript, "sub")

def parse_super(items, region):

    "Handle superscript text within 'region'."

    parse_inline(items, region, Superscript, "super")

def parse_underline(items, region):

    "Handle underlined text within 'region'."

    parse_inline(items, region, Underline, "underline")

# Pattern handlers.
# Each handler is called with the token stream and the region being populated.

handlers = {
    None : end_region,
    "break" : parse_break,
    "defterm" : parse_defterm,
    "defterm_empty" : parse_defterm_empty,
    "deftermend" : end_region,
    "deftermsep" : end_region,
    "fontstyle" : parse_fontstyle,
    "heading" : parse_heading,
    "headingend" : parse_heading_end,
    "larger" : parse_larger,
    "largerend" : end_region,
    "listitemend" : end_region,
    "listitem" : parse_listitem,
    "listitem_alpha" : parse_listitem,
    "listitem_dot" : parse_listitem,
    "listitem_num" : parse_listitem,
    "listitem_roman" : parse_listitem,
    "monospace" : parse_monospace,
    "monospaceend" : end_region,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    "rule" : parse_rule,
    "smaller" : parse_smaller,
    "smallerend" : end_region,
    "sub" : parse_sub,
    "subend" : end_region,
    "super" : parse_super,
    "superend" : end_region,
    "underline" : parse_underline,
    "underlineend" : end_region,
    }

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.add(block)



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4