#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018, 2019, 2021 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]'                  # ['"]
dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option
dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern for 's' (a negative lookahead)."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (a positive lookahead)."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern for 's' using a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits.

    A limit given as None is left empty in the resulting quantifier. An
    explicit limit of zero is preserved instead of being treated as absent,
    so that a 'max' of zero is not silently turned into "no upper limit".
    """

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \E with a pattern for matching all characters including newlines
    \N with a pattern for matching whitespace excluding newlines
    \P with a pattern for matching all characters within a paragraph
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a new mapping from pattern names to the updated pattern strings.
    """

    # NOTE: the docstring above is a raw string because it contains literal
    # backslash sequences; in particular, \N in a non-raw string is rejected
    # as a malformed escape by Python 3.

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)
        value = value.replace(r"\P", dotparagraph)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.
        # The null group only participates in a match when this pattern is the
        # alternative that matched, which is how TokenStream.read_until
        # identifies the matching pattern.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order of 'keys'.
    """

    combined = "|".join(d[key] for key in keys)
    return re.compile(combined, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Span of the current match, set whenever a match is recorded.

        self.start = None
        self.end = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next read_until call yields it again
        instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        If a match has been queued, it is reinstated without searching and the
        previously identified matching pattern name is retained.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None
            self.match = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None

        # Return the preceding text, requiring the match to be retrieved.

        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern. Return a
        mapping from group names, with the pattern name prefix removed, to the
        matched values.
        """

        prefix = "%s_" % self.matching
        skip = len(prefix)

        d = {}
        for key, value in self.match.groupdict().items():
            if key.startswith(prefix):
                d[key[skip:]] = value
        return d

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without a current match, None
        is returned.
        """

        self.update_pos()

        if not self.match:
            return None

        if group is None:
            return self.s[self.start:self.end]
        else:
            return self.groups.get(group)

    def match_groups(self, groups=None):

        """
        Return the match 'groups' as a list of values, or the mapping of all
        groups if unspecified, updating the position in the stream. Without a
        current match, an empty list is returned.
        """

        self.update_pos()

        if not self.match:
            return []

        if groups is None:
            return self.groups
        else:
            return [self.groups.get(group) for group in groups]

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without a match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here refer to 'patterns' and 'handlers' attributes that are not
    defined in this class and are presumably provided by subclasses.
    """

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        A new parser shares this parser's metadata and parsers, and refers to
        this parser's root or, failing that, to this parser as its root.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:
            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns having the given
        'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        """
        Return all groups of the matching pattern, or an empty list if there is
        no current match.
        """

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Only parse directives if the region is transparent.

        if region.transparent:
            self.parse_region_directives(region)

        # Parse inline and opaque regions. The transparency is tested again
        # here, after any directives have been handled.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type. Where no
        parser is found for the type, the region is made opaque and the "moin"
        parser is used instead.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # An empty preceding string indicates a header at the current position.

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        try:
            while True:
                preceding = self.read_until(["directive"], False)

                # With an immediately-appearing directive, handle its details.

                if preceding == "":
                    handler = self.handlers.get(self.matching_pattern())
                    if handler:
                        handler(self, region)
                    else:
                        break

                # Otherwise, with no immediate directive (or none at all), stop.

                else:
                    break

        # Handle a premature end of region.

        except StopIteration:
            pass

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing stops at the end of the input or when a handler raises
        StopIteration, after which the region is normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    self.match_group()
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def append_inline(self, region, node):

        "Append to 'region' the given inline 'node'."

        region.append_inline(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. Where the current
        region does not end at the matched level, the matched feature is added
        to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4