#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]'                  # ['"]
dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option
dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern (negative lookahead) for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (lookahead) without consuming it."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional, non-capturing pattern for 's'."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits.

    A limit given as None is omitted from the pattern, leaving that end of the
    range open. A limit of zero is retained explicitly, so that a 'max' of zero
    is not mistaken for an absent upper limit.
    """

    # Test against None explicitly: the "x and y or z" idiom would discard a
    # legitimate limit of zero.

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    # The docstring is a raw string because it mentions \N: in Python 3 a bare
    # \N in a non-raw string literal is a malformed escape and a syntax error.

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \E with a pattern for matching all characters including newlines
    \N with a pattern for matching whitespace excluding newlines
    \P with a pattern for matching all characters within a paragraph
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a mapping from pattern names to the expanded pattern strings, each
    of which ends with an empty group called group_<name> identifying it.
    """

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)
        value = value.replace(r"\P", dotparagraph)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order of 'keys'.
    """

    subset = [d[key] for key in keys]
    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Span of the most recent match (set by read_until).

        self.start = None
        self.end = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until yields it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        # Reuse any queued match, retaining the existing pattern name and span.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern, returning
        a mapping from unqualified group names to matched values.
        """

        prefix = "%s_" % self.matching

        d = {}
        for key, value in self.match.groupdict().items():
            if key.startswith(prefix):
                d[key[len(prefix):]] = value
        return d

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. All groups are returned as a mapping; selected
        'groups' are returned as a list of values; an empty list is returned if
        there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream to the end of the current match or, if
        there is no match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    "Common parsing methods."

    # NOTE: the methods below also read self.patterns and self.handlers, which
    # NOTE: are not defined here and are presumably provided by subclasses.

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:
            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining this parser's patterns having the
        given 'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the match 'groups' of the matching pattern, or all groups if
        unspecified.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Only parse directives if the region is transparent.

        if region.transparent:
            self.parse_region_directives(region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type, treating the region as
        # opaque and falling back to the "moin" parser if none is found.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
            parser = self.get_parser("moin")

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # Only a header appearing immediately at the current position counts.

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        try:
            while True:
                preceding = self.read_until(["directive"], False)

                # With an immediately-appearing directive, handle its details.

                if preceding == "":
                    handler = self.handlers.get(self.matching_pattern())
                    if handler:
                        handler(self, region)
                    else:
                        break

                # Otherwise, with no immediate directive (or none at all), stop.

                else:
                    break

        # Handle a premature end of region.

        except StopIteration:
            pass

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region by raising StopIteration.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the current
        region does not end at the matched level, add the matched feature to
        'node' as text instead.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4