#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

# Fragments substituted for the \N, \Q, \E and \P placeholders by get_patterns.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]'                  # ['"]
dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option
dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern for 's' (a negative lookahead)."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (a lookahead consuming no input)."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern for 's' using a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the quantifier, leaving that bound open.
    """

    # Test explicitly against None so that a limit of zero is retained. The
    # "x and y or z" idiom discards zero, which would turn a maximum of zero
    # into an unbounded repetition.

    lower = "" if min is None else min
    upper = "" if max is None else max

    return "%s{%s,%s}" % (s, lower, upper)

def get_pattern(s):

    "Return a compiled regular expression for the given pattern 's'."

    return re.compile(s, re.UNICODE | re.MULTILINE)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping,
    returning a mapping from the same names to compiled patterns. In each
    pattern, the following placeholders are replaced:

    \N with a pattern matching whitespace excluding newlines
    \Q with a pattern matching a quote character
    \E with a pattern behaving like dot with the DOTALL option
    \P with a pattern matching everything within a paragraph
    """

    # NOTE: The docstring above is a raw string because \N in a normal string
    # NOTE: literal is a malformed named-character escape in Python 3.

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)
        value = value.replace(r"\P", dotparagraph)
        patterns[name] = get_pattern(value)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised for any
    key not present in 'd'.
    """

    subset = {}
    for key in keys:
        subset[key] = d[key]
    return subset



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None

        # The start of the current match is recorded in 'start' by read_until.
        # The 'match_start' attribute is retained for compatibility.

        self.start = None
        self.match_start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to read_until yields it again
        instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns'. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.
        """

        # A queued match is reinstated along with the pattern name and start
        # position recorded when it was originally found.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.start = None
            self.match_start = None
            self.matching = None

            # Find the first matching pattern: the one starting earliest in the
            # string, with earlier patterns in the mapping winning any ties.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start, end = match.span()
                    if self.matching is None or start < self.start:
                        self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given, it is a sequence of
        group numbers or names, and a tuple of the corresponding values is
        returned, with IndexError raised for any group not defined by the
        pattern. An empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:

                # Select each group individually: the argument of the groups
                # method is a default value and not a selection of groups.

                return tuple([self.match.group(group) for group in groups])
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods of this class refer to 'patterns' and 'handlers' attributes that
    are not defined here and which are presumably provided by subclasses:
    'patterns' mapping pattern names to compiled patterns, 'handlers' mapping
    pattern names to callables accepting the parser and a node.
    """

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser returned shares the metadata and parsers of this parser and
        has this parser's root (or this parser itself) as its root.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:
            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass on 'remaining' so that callers testing for an immediate match can
        # distinguish an empty preceding text from the absence of any match.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number
        or name.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern, or None if nothing matched."

        return self.items.matching

    def match_groups(self, groups=None):

        "Return the match 'groups', or all groups if unspecified."

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        The top-level region is returned.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Only parse directives if the region is transparent.

        if region.transparent:
            self.parse_region_directives(region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. The region is returned.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type, treating the region as
        # opaque and using the "moin" parser if no parser exists for the type.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")

        # NOTE: A "moin" parser must be configured for the fallback to work.

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")

            # The type is the first word of the arguments. Nothing can be
            # extracted if the arguments group took no part in the match.

            if region.args is not None:
                region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        try:
            while True:
                preceding = self.read_until(["directive"], False)

                # With an immediately-appearing directive, handle its details.

                if preceding == "":
                    handler = self.handlers.get(self.matching_pattern())
                    if handler:
                        handler(self, region)
                    else:
                        break

                # Otherwise, with no immediate directive (or none at all), stop.

                else:
                    break

        # Handle a premature end of region.

        except StopIteration:
            pass

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region by raising StopIteration.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the marker ends
        the current region, StopIteration is raised to leave the parsing loop.
        Otherwise, the marker is added to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4