#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"     # whitespace excluding newlines
quotes = "['" '"]'              # ['"]
dotall = r"(.|\n)"              # any character including newlines

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching (negative lookahead) pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a (positive lookahead) pattern expecting 's'."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional, non-capturing pattern for 's'."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that end of the range
    open.
    """

    # Test explicitly for None: a limit of zero is a legitimate value and must
    # not be confused with an absent limit (a zero 'max' would otherwise yield
    # an unbounded repetition).

    return "%s{%s,%s}" % (s, "" if min is None else min,
                             "" if max is None else max)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \E with a pattern for matching all characters including newlines
    \N with a pattern for matching whitespace excluding newlines
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a mapping from pattern names to the updated pattern strings, each
    ending with an empty group named "group_<name>" identifying the pattern.
    """

    # NOTE: The docstring above is a raw string because "\N" is otherwise
    # NOTE: interpreted as a malformed named character escape by Python 3.

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.
        # The null group is appended after the prefixing above and therefore
        # retains its plain "group_" prefix.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order given by
    'keys'. A KeyError is raised if a key is not present in 'd'.
    """

    return re.compile("|".join([d[key] for key in keys]),
                      re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to 'read_until' yields it
        again instead of performing a new search.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        # Reuse any queued match, retaining the details recorded for it.

        if self.queued:
            self.match = self.queued
            self.queued = None

        # Otherwise, search from the current position.

        else:
            self.matching = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None

        # Otherwise, return the text before the match.

        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern,
        returning a mapping from group names, with the pattern name prefix
        removed, to group values.
        """

        prefix = "%s_" % self.matching
        length = len(prefix)

        return dict([(key[length:], value)
                     for key, value in self.match.groupdict().items()
                     if key.startswith(prefix)])

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without any match, None is
        returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups' as a list of values, or a mapping of all
        groups if unspecified, updating the position in the stream. Without any
        match, an empty list is returned.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of any match or
        otherwise to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods. The 'patterns' and 'handlers' attributes used by
    these methods are not defined here and are presumably provided by
    subclasses.
    """

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:

            # Propagate the document-level parser, this being the current
            # parser if no root has been indicated.

            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns having the given
        'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must reach the token stream: callers testing for
        # an immediate match compare the result with the empty string and rely
        # on None being returned when nothing matched.

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        """
        Return the groups of the matching pattern as a mapping from group names
        to values, or an empty list if there is no match.
        """

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's', returning the top-level region. Pages consist of
        regions delimited by markers.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers and directives, then parse according to region
        # type.

        self.parse_region_header(region)
        self.parse_region_directives(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        # Without a specific parser, mark the region as not transparent and
        # fall back on the "moin" parser.

        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")

            # The type is the first space-delimited word of the arguments.

            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        # Handlers end the region by raising StopIteration.

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the current
        region accepts the end marker, StopIteration is raised to leave the
        parsing loop. Otherwise, the marker is added to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4