#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree import Block, Region, Text
import re

# Pattern management.

# Character class matching any whitespace character except a newline.

ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines.

    Return a mapping from each name in 'syntax' to a compiled regular
    expression using the UNICODE and MULTILINE flags.
    """

    # NOTE: The docstring above is a raw string because a bare \N in an
    # NOTE: ordinary string literal is a malformed escape in Python 3.

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's', starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None

        # The start position of the current match, as used by read_until.

        self.start = None

        # Retained for compatibility: this attribute is never given a value
        # other than None.

        self.match_start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to read_until presents it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns'. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.

        The 'patterns' are given as a mapping from pattern names to compiled
        regular expressions. The name of the pattern yielding the earliest match
        is recorded in the 'matching' attribute, which is None if nothing
        matched.
        """

        # A queued match is presented again without searching: the recorded
        # pattern name and start position still apply to it.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.start = None
            self.match_start = None
            self.matching = None

            # Find the first matching pattern.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start, end = match.span()

                    # Only a strictly earlier match replaces the current one.

                    if self.matching is None or start < self.start:
                        self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group. Where there
        is no match at all, the position is moved to the end of the string and
        None is returned.
        """

        if self.match:

            # Continue reading from the end of the match.

            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None

    def match_groups(self):

        """
        Return the groups of the current match, or an empty list if there is no
        match.
        """

        if self.match:
            return self.match.groups()
        else:
            return []



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods of this class refer to 'patterns' and 'handlers' attributes that
    are not defined here and are presumably provided by subclasses.
    """

    # Names of the patterns used to parse region content, if any.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        # Entries in the formats mapping are instantiated with the mapping
        # itself so that nested regions can find parsers in the same way.

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must reach the token stream: otherwise, a failed
        # match can never be reported as None.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def read_match(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.read_match(group)

    def read_matching(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return the groups of the current match."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region produced for the page.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent'. Return the populated region.
        """

        region = Region([], level, indent)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The region
        type is only set if a header occurs at the current position.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.read_match()

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until a handler ends the
        region by raising StopIteration (see end_region), or, in strict mode,
        until text or an unhandled feature is encountered.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.read_matching():
                    break

                # Obtain any feature.

                feature = self.read_match()
                handler = self.handlers.get(self.read_matching())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region using this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop by raising
        StopIteration, which is caught in parse_region_details.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

# vim: tabstop=4 expandtab shiftwidth=4