#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]'                  # ['"]
dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option
dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern for 's' (a negative lookahead)."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (a positive lookahead)."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern for 's' using a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that end of the range
    open. A limit of zero is emitted explicitly.
    """

    # Test for None explicitly: zero is a legitimate limit and must not be
    # confused with an omitted one (an empty upper limit means "unbounded").

    lower = "" if min is None else min
    upper = "" if max is None else max

    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    # NOTE: The docstring is a raw string because it contains backslash
    # NOTE: sequences such as \N that Python 3 would otherwise reject as
    # NOTE: malformed escapes. The text is unchanged under Python 2.

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \E with a pattern for matching all characters including newlines
    \N with a pattern for matching whitespace excluding newlines
    \P with a pattern for matching all characters within a paragraph
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a mapping from each pattern name to its expanded pattern string.
    """

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)
        value = value.replace(r"\P", dotparagraph)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.
        # The null group only participates in a match when this pattern's
        # alternative matched, allowing the pattern to be identified later.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order of 'keys'.
    A KeyError is raised if a key is not present in 'd'.
    """

    subset = [d[key] for key in keys]

    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Span of the most recent match, defined up front so that the
        # attributes always exist.

        self.start = self.end = pos

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to 'read_until' yields it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        # A queued match is delivered again, retaining the pattern name and
        # span recorded when it was originally found.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern, returning
        a mapping from group names, without the pattern name prefix, to values.
        """

        prefix = "%s_" % self.matching
        length = len(prefix)

        return dict((key[length:], value)
                    for key, value in self.match.groupdict().items()
                    if key.startswith(prefix))

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Where no match is available,
        None is returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Given 'groups', a list of the corresponding
        values is returned. Otherwise, the mapping of all groups is returned.
        Where no match is available, an empty list is returned.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without any match, to the end of the string.
        """

        if self.match:
            self.pos = self.match.end()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here refer to 'patterns' and 'handlers' attributes that are not
    defined in this class. NOTE(review): presumably these are provided by
    subclasses as mappings from pattern names to pattern strings and to handler
    callables respectively; confirm against the concrete parsers.
    """

    # Names of the patterns used when parsing transparent regions.

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:

            # Propagate the document-level parser, which is this parser if no
            # root has been recorded.

            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns having the given
        'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must be passed on: callers requesting None for
        # the absence of a match rely on it to distinguish an immediate match
        # (empty preceding text) from exhausted input.

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the match 'groups' as a list of values, or all groups of the
        matching pattern if 'groups' is omitted or None.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Only parse directives if the region is transparent.

        if region.transparent:
            self.parse_region_directives(region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type. Without one, the region
        # is made opaque and the "moin" parser is used instead.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # An empty string indicates a header appearing immediately at the
        # current position. None means no header was found at all.

        if self.read_until(["header"], False) == "":
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        # Handlers signal the end of the region by raising StopIteration.

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. Where the matched
        marker does not end the current region, it is added to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4