#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern (negative lookahead) for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (positive lookahead)."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern wrapping 's' in a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits.

    A limit given as None is omitted from the pattern, leaving that end of the
    range open. A limit of zero is emitted explicitly: testing for None rather
    than truth prevents a 'max' of zero from silently producing an unbounded
    repetition.
    """

    return "%s{%s,%s}" % (s, "" if min is None else min,
                             "" if max is None else max)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \N with a pattern for matching whitespace excluding newlines
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a mapping from pattern names to the rewritten pattern strings, each
    having an empty "group_<name>" group appended so that the pattern
    responsible for a match can be identified.
    """

    # NOTE: The docstring above is a raw string because \N would otherwise be
    # NOTE: read as a malformed named character escape in Unicode literals.

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order given by
    'keys', and a missing key raises KeyError.
    """

    subset = [d[key] for key in keys]
    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', not moving before the start of the
        string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to 'read_until' presents
        it again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        # Present any queued match again, retaining the details recorded when
        # it was first found.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern,
        returning a mapping from group names without the pattern name prefix
        to the matched values.
        """

        prefix = "%s_" % self.matching

        d = {}
        for key, value in self.match.groupdict().items():
            if key.startswith(prefix):
                d[key[len(prefix):]] = value
        return d

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without a match, None is
        returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. With 'groups' specified, a list of values in the
        same order is returned; otherwise, the mapping of all groups is
        returned. Without a match, an empty list is returned.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                l = []
                for group in groups:
                    l.append(self.groups.get(group))
                return l
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without a match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The 'patterns' and 'handlers' attributes used by these methods are not
    defined here and are presumably provided by subclasses.
    """

    region_pattern_names = None

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        self.formats = formats
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining this parser's patterns having
        the given 'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must reach the token stream: callers testing for
        # an immediate match compare the result with the empty string and rely
        # on None being returned where nothing matches.

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the match 'groups' of the matching pattern, or all groups if
        unspecified.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region produced for the page.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers and directives, then parse according to region
        # type.

        self.parse_region_header(region)
        self.parse_region_directives(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        # Without a specific parser, treat the region as opaque and employ the
        # "moin" parser to find the end of the region.

        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Handlers may raise StopIteration to end the parsing of the region.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. Where the marker
        ends the current region, break out of the parsing loop; otherwise, add
        the marker text to 'node'.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4