#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018, 2019, 2021, 2023 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

import re

from moinformat.tree.moin import Block, Region, Text

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]'                  # ['"]
dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option
dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's'."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the quantifier, leaving that end unbounded.
    """

    # Test explicitly for None so that a limit of zero is preserved: treating
    # zero as absent would turn a 'max' of 0 into an unbounded repetition.

    return "%s{%s,%s}" % (s, "" if min is None else min,
                             "" if max is None else max)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \E with a pattern for matching all characters including newlines
    \N with a pattern for matching whitespace excluding newlines
    \P with a pattern for matching all characters within a paragraph
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a mapping from each pattern name to its expanded pattern string.
    """

    # NOTE: The docstring above is a raw string because it contains backslash
    # NOTE: sequences (notably \N) that are not valid string escapes.

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)
        value = value.replace(r"\P", dotparagraph)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.
        # The null group lets a combined expression report which pattern
        # matched (see TokenStream.read_until).

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order given by
    'keys'. A KeyError is raised if a key is absent from 'd'.
    """

    subset = [d[key] for key in keys]
    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Span of the current match, set by read_until upon a match.

        self.start = None
        self.end = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until provides it
        again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        # Reuse any queued match, retaining the existing pattern name and span.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None
            self.match = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None

        # Return the preceding text, requiring the match to be retrieved.

        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern, returning
        a mapping from group names without the pattern name prefix to values.
        """

        prefix = "%s_" % self.matching
        length = len(prefix)

        return {key[length:]: value
                for key, value in self.match.groupdict().items()
                if key.startswith(prefix)}

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without a current match, the
        position is moved to the end of the string and None is returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups' as a list of values, or a mapping of all
        groups if unspecified, updating the position in the stream. Without a
        current match, an empty list is returned.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without a match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    NOTE: The 'patterns' and 'handlers' attributes used by these methods are
    NOTE: not defined here and are presumably provided by subclasses.
    """

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def update_metadata(self, metadata):

        "Update the parser using 'metadata'. This implementation does nothing."

        pass

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:

            # Propagate the document-level parser, which is this parser if no
            # root has been indicated.

            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns having the given
        'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the groups of the current match: a mapping of all groups if
        'groups' is omitted or None, or a list of values for the given 'groups'
        otherwise.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the parsed top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Only parse directives if the region is transparent.

        if region.transparent:
            self.parse_region_directives(region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type, making the region opaque
        # and employing the "moin" parser if no specific parser is available.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # Only a header appearing immediately, with no preceding text, applies.

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        try:
            while True:
                preceding = self.read_until(["directive"], False)

                # With an immediately-appearing directive, handle its details.

                if preceding == "":
                    handler = self.handlers.get(self.matching_pattern())
                    if handler:
                        handler(self, region)
                    else:
                        break

                # Otherwise, with no immediate directive (or none at all), stop.

                else:
                    break

        # Handle a premature end of region.

        except StopIteration:
            pass

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input. Retrieving the absent match moves the stream
                # position to the end of the string.

                if not self.matching_pattern():
                    self.match_group()
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region using StopIteration.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def append_inline(self, region, node):

        "Append to 'region' the given inline 'node'."

        region.append_inline(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the matched end
        marker ends the current region, StopIteration is raised to leave the
        parsing loop; otherwise, the marker is added to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4