#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

# Substituted for \N in syntax definitions: whitespace excluding newlines.

ws_excl_nl = r"[ \f\r\t\v]"

# Substituted for \Q in syntax definitions: a single or double quote.

quotes = "['" '"]' # ['"]

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern for 's' (a negative lookahead)."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (a positive lookahead)."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern for 's' using a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that end of the range
    open. A limit of zero is emitted explicitly.
    """

    # Test against None explicitly: a limit of zero is false but still valid.

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)

def get_pattern(s):

    "Return a compiled regular expression for the given pattern 's'."

    return re.compile(s, re.UNICODE | re.MULTILINE)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines, and replace \Q with a pattern matching a single or double quote.
    Return a mapping from the names in 'syntax' to compiled patterns.
    """

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        patterns[name] = get_pattern(value)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A key missing from 'd'
    causes a KeyError.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's', starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.match_start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', never moving before the start of the
        string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until yields it
        again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns'. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.

        If a match has been queued, it is reinstated without searching and the
        details of the matching pattern are left unchanged.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.match_start = None
            self.matching = None

            # Find the first matching pattern. Where several patterns match at
            # the same position, the first one encountered is retained.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.match_start:
                        self.match_start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.match_start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given as a sequence of group
        names or numbers, a tuple of the corresponding group values is returned.
        An empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:

                # Select each requested group individually: the argument of
                # the match object's groups method is a default value, not a
                # selection of groups.

                return tuple([self.match.group(g) for g in groups])
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods. The 'patterns' and 'handlers' attributes used by
    these methods are not defined here and must be provided elsewhere,
    presumably by subclasses.
    """

    region_pattern_names = None

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        self.formats = formats
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any new parser shares the formats of this parser and has the root of
        this parser, or this parser itself if it has no root, as its root.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats, self.root or self)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Forward 'remaining' so that callers can distinguish the absence of a
        # match from an empty portion of text.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        "Return the match 'groups', or all groups if unspecified."

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region produced for the text.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type, marking the region as not
        # transparent and falling back to the "moin" parser otherwise.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False

        # NOTE: get_parser returns None if no "moin" parser is configured, in
        # NOTE: which case the following call will fail.

        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # A header is only recognised immediately at the current position,
        # this being indicated by an empty string preceding the match.

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers end the region by raising StopIteration.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the current
        region does not end at the matched level, the matched feature is added
        to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4