#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]

def excl(s):

    "Return a non-matching (negative lookahead) pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a (positive lookahead) pattern expecting 's'."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional, non-capturing pattern for 's'."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that end of the range
    open. A limit of zero is emitted explicitly so that a zero 'max' is not
    mistaken for an absent, and therefore unbounded, limit.
    """

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \N with a pattern for matching whitespace excluding newlines
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a new mapping from each pattern name to its updated expression.
    """

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.
        # The null group matches the empty string, making it non-None only
        # when this alternative is the one that matched.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order of 'keys'.
    """

    subset = [d[key] for key in keys]
    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's', starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Span of the current match, defined once a pattern has matched.

        self.start = None
        self.end = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to 'read_until' yields it
        again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        If a match has been queued, it is used without searching. Note that
        when nothing matches, 'matching' is reset to None but 'match' is left
        unchanged, so callers should test 'matching' before using the match.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            return None

        # Otherwise, return the text preceding the match.

        self.groups = self.filter_groups()
        return self.s[self.pos:self.start]

    def filter_groups(self):

        "Filter groups from the current match for the matching pattern."

        prefix = "%s_" % self.matching
        return {key: value
                for key, value in self.match.groupdict().items()
                if key.startswith(prefix)}

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without any match, None is
        returned.
        """

        self.update_pos()

        if not self.match:
            return None

        if group is None:
            return self.s[self.start:self.end]

        return self.groups.get("%s_%s" % (self.matching, group))

    def match_groups(self, groups=None):

        """
        Return the match 'groups' as a list of values, or the mapping of all
        groups if unspecified, updating the position in the stream. Without any
        match, an empty list is returned.
        """

        self.update_pos()

        if not self.match:
            return []

        if groups is None:
            return self.groups

        return [self.groups.get("%s_%s" % (self.matching, group))
                for group in groups]

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without any match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    This class reads 'self.patterns' (a mapping from pattern names to pattern
    strings) and 'self.handlers' (a mapping from pattern names to callables
    invoked with the parser and a region) without defining them; presumably
    they are provided by subclasses.
    """

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns having the given
        'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass 'remaining' on to the stream so that a false value really does
        # yield None when nothing matches.

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the values of the given 'groups' in the match, or all groups of
        the matching pattern if 'groups' is omitted or None.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Handle potentially inline regions.

        if region.type == "inline":
            self.parse_region_inline(region)
            return

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The header
        is only recognised where it occurs immediately at the current position,
        this being indicated by an empty string of preceding text.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.match_group("args")

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    def parse_region_inline(self, region):

        "Parse the data to populate an inline 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until text is
        encountered in strict mode, or until a handler ends the region by
        raising StopIteration (see 'end_region').
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group(None)
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

# vim: tabstop=4 expandtab shiftwidth=4