#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]

def excl(s):

    "Return a non-matching pattern (negative lookahead) for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (positive lookahead)."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern wrapping 's' in a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the quantifier.
    """

    # Test explicitly for None: zero is a legitimate limit and must not be
    # discarded, which the "x and x or ''" idiom would do for a 'max' of zero.

    return "%s{%s,%s}" % (s, "" if min is None else min,
                             "" if max is None else max)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \N with a pattern for matching whitespace excluding newlines
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a mapping from each pattern name to its updated expression.
    """

    # NOTE: The docstring above is a raw string because "\N" and "\Q" are not
    # NOTE: valid escape sequences in an ordinary string literal.

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.
        # The null group matches the empty string, so it has a non-None value
        # exactly when this alternative is the one that matched.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order given by
    'keys'. A KeyError is raised if a key is not present in 'd'.
    """

    alternatives = [d[key] for key in keys]
    return re.compile("|".join(alternatives), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Span of the most recent match, defined before any match is made so
        # that the attributes always exist.

        self.start = self.end = pos

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until yields it
        again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        If a match has been queued, it is reinstated without searching and the
        details of the matching pattern are retained.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            # Discard details of any earlier match so that a failed search
            # cannot leave a stale match to be consumed by match_group,
            # match_groups or update_pos.

            self.matching = None
            self.match = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern, returning
        a mapping from group names without the pattern name prefix to values.
        """

        prefix = "%s_" % self.matching

        return {key[len(prefix):]: value
                for key, value in self.match.groupdict().items()
                if key.startswith(prefix)}

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without a current match, None
        is returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. All groups are returned as a mapping, whereas
        the indicated 'groups' are returned as a list of values in the order
        given. Without a current match, an empty list is returned.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no current match.
        """

        if self.match:
            self.pos = self.match.end()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    This class reads 'patterns' (a mapping from pattern names to pattern
    strings) and 'handlers' (a mapping from pattern names to callables invoked
    with the parser and a region) but does not define them. They are presumably
    provided by subclasses.
    """

    # Names of the patterns used when parsing region content. If this remains
    # unset, parse_region_content only adds an empty block to the region.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        The parser is instantiated with this parser's formats.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining this parser's patterns having the
        given 'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Forward 'remaining' so that callers are able to distinguish the
        # absence of a match (None) from a match without preceding text ("").

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups of the matching pattern if
        unspecified.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Handle potentially inline regions.

        if region.type == "inline":
            self.parse_region_inline(region)
            return

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # An empty string indicates a header found immediately at the current
        # position, whereas None indicates that no header was found at all.

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.match_group("args")

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    def parse_region_inline(self, region):

        "Parse the data to populate an inline 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing ends at the end of the input, when a handler raises
        StopIteration (see end_region) or, in strict mode, upon encountering
        text or a feature without a handler. The region is then normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group(None)
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop by raising
        StopIteration, which is caught by parse_region_details.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

# vim: tabstop=4 expandtab shiftwidth=4