#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree import Block, Region, Text
import re

# Pattern management.

# Character class matching any whitespace character except a newline.

ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines.

    Return a new mapping from each name in 'syntax' to a regular expression
    object compiled with the UNICODE and MULTILINE flags.
    """

    # NOTE: The docstring above is a raw string because a bare \N in a normal
    # NOTE: string literal is a malformed escape sequence in Python 3.

    flags = re.UNICODE | re.MULTILINE

    return dict(
        (name, re.compile(value.replace(r"\N", ws_excl_nl), flags))
        for name, value in syntax.items()
        )

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    return dict((key, d[key]) for key in keys)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's', starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details: the current match object, any match queued for the
        # next read, and the start position of the current match. Both 'start'
        # and 'match_start' are maintained with the same value.

        self.match = None
        self.queued = None
        self.start = None
        self.match_start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to 'read_until' yields it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', this being a mapping from
        pattern names to compiled regular expressions. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.

        The name of the matching pattern is recorded in the 'matching' attribute
        (None if nothing matched) and the match object in the 'match' attribute.
        """

        if self.queued is not None:

            # Reuse the queued match together with the pattern name and start
            # position already recorded for it.

            self.match = self.queued
            self.queued = None

        else:

            # Discard all details of any previous match so that a failed search
            # cannot expose a stale match to match_group or update_pos.

            self.match = None
            self.start = self.match_start = None
            self.matching = None

            # Find the first matching pattern: the one starting earliest in the
            # string at or after the current position.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match is None:
                    continue

                start = match.start()
                if self.matching is None or start < self.start:
                    self.start = self.match_start = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            return None

        return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no current match.
        """

        self.update_pos()

        if self.match is None:
            return None

        try:
            return self.match.group(group)
        except IndexError:
            return ""

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given, it is a sequence of
        group numbers or names, and a tuple of the corresponding matched values
        is returned. An IndexError is raised for any group that the pattern
        does not define. An empty list is returned if there is no current
        match.
        """

        self.update_pos()

        if self.match is None:
            return []

        if groups is None:
            return self.match.groups()

        # Select the requested groups explicitly: the argument of the match
        # object's groups method is a default value, not a group selection.

        return tuple(self.match.group(group) for group in groups)

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without a match, to the end of the string.
        """

        if self.match is not None:
            self.pos = self.match.end()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here refer to 'patterns' and 'handlers' attributes that are not
    defined in this class: 'patterns' is used as a mapping from pattern names
    to compiled patterns, and 'handlers' as a mapping from pattern names to
    callables invoked with the parser and the current region. Presumably these
    are provided by subclasses.
    """

    # Names of the patterns used to parse region content, or None if no
    # detailed parsing of content is to be performed by parse_region_content.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        # Entries in the mapping are called with the formats to obtain a parser
        # sharing the same formats.

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Forward 'remaining' so that callers can distinguish the absence of a
        # match (None) from an empty amount of preceding text.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the given 'groups' of the current match, or all groups if
        unspecified.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region, which is also retained as the 'region'
        attribute.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the new region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Handle potentially inline regions.

        if region.type == "inline":
            self.parse_region_inline(region)
            return

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # A header is only recognised immediately at the current position,
        # where no text precedes the match. None means no header.

        if self.read_until(["header"], False) == "":
            region.type = self.match_group()

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    def parse_region_inline(self, region):

        "Parse the data to populate an inline 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until a handler ends
        the region by raising StopIteration (see 'end_region'), or, in strict
        mode, until text or an unhandled feature is encountered. The region is
        normalised before returning.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region using this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop by raising
        StopIteration, which is caught in 'parse_region_details'.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

# vim: tabstop=4 expandtab shiftwidth=4