#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]

def excl(s):

    "Return a non-matching pattern for 's' (a negative lookahead)."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (a positive lookahead)."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern wrapping 's' in a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that end of the range
    open. An explicit limit of zero is preserved.
    """

    # Test against None explicitly: zero is a meaningful limit and must not be
    # treated as an absent one.

    lower = "" if min is None else min
    upper = "" if max is None else max

    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \N with a pattern for matching whitespace excluding newlines
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a new mapping from pattern names to the updated pattern strings.
    """

    # NOTE: The docstring above is a raw string because it mentions \N, which
    # NOTE: is otherwise a malformed escape sequence in Python 3 text strings.

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.
        # The null group only participates when this pattern matches, which
        # is how the tokeniser identifies the matching alternative.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order of 'keys'.
    """

    subset = [d[key] for key in keys]

    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Span of the current match, set when a pattern matches.

        self.start = None
        self.end = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to 'read_until' presents
        it again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        # Reuse any queued match, retaining the existing pattern name and span.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern,
        returning a mapping with the pattern name prefix removed from each
        group name.
        """

        prefix = "%s_" % self.matching
        length = len(prefix)

        return {
            key[length:]: value
            for key, value in self.match.groupdict().items()
            if key.startswith(prefix)
            }

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without any match, None is
        returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. All groups are returned as a mapping, whereas
        specified groups are returned as a list of values in the order given.
        Without any match, an empty list is returned.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The 'patterns' and 'handlers' attributes used by these methods are not
    defined here and are expected to be provided elsewhere, presumably by
    subclasses.
    """

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        # Entries in the mapping are called with the mapping itself to obtain a
        # new parser.

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns of this parser
        having the given 'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass on 'remaining' so that callers can distinguish the absence of a
        # match from a match with no preceding text.

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        "Return the match 'groups', or all groups if unspecified."

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        # Without a specific parser, mark the region as not transparent and
        # employ the parser registered for the "moin" format.

        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # A header is only recognised if found immediately at the current
        # position, this being indicated by empty preceding text.

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.match_group("args")

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        # Handlers may raise StopIteration (see 'end_region') to end parsing.

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

# vim: tabstop=4 expandtab shiftwidth=4