#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines.

    Return a new mapping from each name in 'syntax' to a regular expression
    object compiled with the UNICODE and MULTILINE flags.
    """

    # NOTE: the docstring above is a raw string because a bare \N in a normal
    # string literal is a malformed named-character escape in Python 3.

    flags = re.UNICODE | re.MULTILINE
    patterns = {}
    for name, value in syntax.items():
        patterns[name] = re.compile(value.replace(r"\N", ws_excl_nl), flags)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.match_start = None

        # Start offset of the current match as maintained by read_until. It is
        # defined here so that the attribute exists before the first search.

        self.start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', never moving before the start of the
        string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to read_until presents it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns'. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.

        If a match has been queued, it is reinstated as the current match and no
        search is performed; the pattern name and start offset recorded for that
        match are retained.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.match_start = None
            self.matching = None

            # Find the first matching pattern, this being the one whose match
            # starts earliest at or after the current position.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.start:
                        self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group. Where there
        is no match at all, the position is moved to the end of the string and
        None is returned.
        """

        if self.match:

            # Continue reading from the end of the match.

            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None

    def match_groups(self):

        "Return the match groups, or an empty list if there is no match."

        if self.match:
            return self.match.groups()
        else:
            return []



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here read 'patterns' (a mapping from pattern names to compiled
    patterns) and 'handlers' (a mapping from pattern names to callables invoked
    with the parser and the region) from the instance. Neither is defined by
    this class, so they are presumably supplied by subclasses.
    """

    # Names of the patterns used when parsing region content, or None to
    # perform no such parsing in parse_region_content.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser is instantiated with this parser's formats mapping.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must reach the token stream: callers such as
        # parse_region_header rely on None to distinguish the absence of a match
        # from a match occurring immediately at the current position.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def read_match(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.read_match(group)

    def read_matching(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return the groups of the current match."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region, also retained as the 'region' attribute.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the new region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Handle potentially inline regions.

        if region.type == "inline":
            self.parse_region_inline(region)
            return

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # An empty string indicates a header found at the current position,
        # whereas None means no header.

        if self.read_until(["header"], False) == "":
            region.type = self.read_match()

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    def parse_region_inline(self, region):

        "Parse the data to populate an inline 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until a handler raises
        StopIteration (see end_region) or, when 'strict' is set, until text or
        an unhandled feature is encountered. The region is normalised before
        returning.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.read_matching():
                    break

                # Obtain any feature.

                feature = self.read_match()
                handler = self.handlers.get(self.read_matching())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers end the region by raising StopIteration.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop. This is
        done by raising StopIteration, which parse_region_details catches.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

# vim: tabstop=4 expandtab shiftwidth=4