#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's'."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that end of the range
    open.
    """

    # Test explicitly for None: a limit of zero is a genuine limit and must not
    # be confused with an omitted one (an empty upper limit means "unbounded").

    return "%s{%s,%s}" % (s, "" if min is None else min,
                             "" if max is None else max)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \N with a pattern for matching whitespace excluding newlines
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a mapping from pattern names to the rewritten pattern strings.
    """

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.
        # The null group only participates when this pattern matched, allowing
        # the matching pattern to be identified in a combined expression.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order given by
    'keys'.
    """

    subset = [d[key] for key in keys]
    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Span of the most recent match, defined up front so that the
        # attributes always exist.

        self.start = self.end = pos

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to 'read_until' reuses it
        instead of searching the string again.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        # A queued match is reused together with the existing pattern name and
        # span details.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.
        # NOTE: the previous match object is left in place when nothing
        # matches; only the pattern name and groups are reset.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern,
        returning a mapping from unqualified group names to matched values.
        """

        prefix = "%s_" % self.matching

        d = {}
        for key, value in self.match.groupdict().items():
            if key.startswith(prefix):
                d[key[len(prefix):]] = value
        return d

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Return None if there is no
        match or if the indicated group is not available.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. All groups are returned as a mapping; specified
        groups are returned as a list of values in the order requested. Return
        an empty list if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The 'patterns' and 'handlers' attributes used by these methods are not
    defined here and are presumably provided by subclasses.
    """

    region_pattern_names = None

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        self.formats = formats
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:

            # New parsers share the formats and refer to the document-level
            # parser, which is this parser if no root was given.

            return cls(self.formats, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns having the given
        'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass on 'remaining' so that callers can distinguish the absence of a
        # match (None) from a match with no preceding text (an empty string).

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return the groups of the matching pattern."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region produced for the page.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type. Where no
        parser is found for the type, the region is marked as not transparent
        and the parser for the "moin" format is used instead.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # A header is only present if it occurs immediately at the current
        # position, leaving no preceding text.

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        # Handlers end the parsing of a region by raising StopIteration.

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the end marker
        does not end the current region, it is added to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4