#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]

def excl(s):

    "Return a non-matching pattern (negative lookahead) for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (positive lookahead)."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern wrapping 's' in a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that bound open. A limit
    of zero is written explicitly so that it is not mistaken for an open bound.
    """

    # Test against None explicitly: zero is a legitimate limit.

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines, and replace \Q with a pattern matching a quote character.

    Return a mapping from the names in 'syntax' to compiled patterns.
    """

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.match_start = None

        # Start and length of the best match found by read_until.

        self.start = None
        self.length = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to read_until presents it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns'. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.
        """

        # Reuse any queued match, retaining the recorded pattern name and
        # match position.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.match_start = None
            self.matching = None

            # Find the first matching pattern.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start, end = match.span()

                    # Where patterns match at the same place, obtain the longest
                    # match.

                    if self.matching is None or start < self.start or \
                       start == self.start and end - start > self.length:

                        self.start = start
                        self.length = end - start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such 'group', and None
        is returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given, it is a sequence of
        group numbers or names, and a tuple of the corresponding matched text
        is returned. An empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:
                # Select each requested group individually: the argument of
                # match.groups is a default value, not a group selection.

                return tuple(self.match.group(g) for g in groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without a match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here refer to 'patterns' and 'handlers' attributes that are not
    defined in this class; presumably they are provided by subclasses.
    """

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser found is instantiated with this parser's formats.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Forward 'remaining' so that callers can distinguish the absence of a
        # match (None) from a match at the current position (empty string).

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        "Return the 'groups' of the match, or all groups if unspecified."

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region obtained for the page.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        # Without a dedicated parser, the region is made non-transparent and
        # the "moin" parser is used instead.
        # NOTE(review): if no "moin" parser is configured either, 'parser' is
        # None here and the call below raises AttributeError.

        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # An empty string indicates a header at the current position.

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.match_group("args")

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing stops at the end of the input or when a handler raises
        StopIteration (see end_region), after which 'region' is normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature. This also advances the stream position
                # past the match before any handler is invoked.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

# vim: tabstop=4 expandtab shiftwidth=4