#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]

def excl(s):

    "Return a non-matching pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's'."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the quantifier, leaving that end unbounded.
    """

    # Test explicitly for None: a limit of zero is a genuine limit and must
    # appear in the quantifier instead of being treated as absent.

    lower = "" if min is None else min
    upper = "" if max is None else max

    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    # The docstring is a raw string because it mentions backslash sequences
    # that are not valid string escapes.

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \N with a pattern for matching whitespace excluding newlines
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a mapping from pattern names to the updated pattern strings.
    """

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order of 'keys'.
    """

    subset = [d[key] for key in keys]
    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # The span of the current match, set when a pattern matches.

        self.start = None
        self.end = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next read operation provides it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        # Reuse any queued match, leaving the matching pattern name and span
        # from that match in place.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            # NOTE: Without a match, the previous match object is retained.

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern. Return a
        mapping from group names, with the pattern name prefix removed, to the
        matched values.
        """

        prefix = "%s_" % self.matching

        d = {}
        for key, value in self.match.groupdict().items():
            if key.startswith(prefix):
                d[key[len(prefix):]] = value
        return d

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without any match, None is
        returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Specified groups are returned as a list of
        values; all groups are returned as a mapping. Without any match, an
        empty list is returned.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without any match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The 'patterns' and 'handlers' attributes used by these methods are not
    defined here and are presumably provided by subclasses.
    """

    region_pattern_names = None

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        self.formats = formats
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:

            # New parsers share the formats and refer to the root parser, this
            # being the current parser if it has no root of its own.

            return cls(self.formats, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns having the given
        'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass on 'remaining' so that callers can distinguish the absence of a
        # match (None) from a match preceded by no text (an empty string).

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the match 'groups' as a list of values, or a mapping of all
        groups in the match if 'groups' is omitted or None.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region produced for the page.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the populated region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type. Where no
        parser is found for the type, the region is marked as not transparent
        and the "moin" parser is used instead.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False

        # NOTE: Without a configured "moin" parser, no parser is available here.

        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The region
        type is only changed if a header is found immediately at the current
        position.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.match_group("args")

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers end the region by raising StopIteration (see end_region).

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop by raising
        StopIteration, this being caught by parse_region_details.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

# vim: tabstop=4 expandtab shiftwidth=4