#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Region, Text
import re

# Pattern management.

# Whitespace characters excluding newline, substituted for \N in patterns.
ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    # NOTE: raw docstring - a bare \N escape in a non-raw string is a
    # SyntaxError under Python 3.
    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines. Return a mapping from the names in 'syntax' to compiled regular
    expression objects.
    """

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns

def combine_patterns(patterns, syntax):

    "Combine 'patterns' with those defined by the given 'syntax' mapping."

    return combine_dicts([patterns, get_patterns(syntax)])

def combine_dicts(dicts):

    "Combine the given 'dicts', with later entries overriding earlier ones."

    combined = {}
    for d in dicts:
        combined.update(d)
    return combined



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, patterns, pos=0):

        """
        Initialise the stream with the string 's', a mapping of compiled
        'patterns' (as produced by get_patterns), and an optional starting
        position 'pos' within the string.
        """

        self.s = s
        self.patterns = patterns
        self.pos = pos

        # State of the most recent search: the match object and the name of
        # the pattern that produced it.

        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', never moving before the start."

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None
        self.matching = None

        # Find the earliest-starting match amongst the given patterns.

        for pattern_name in pattern_names:
            match = self.patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data. Return None and move
        to the end of the stream if no match is active.
        """

        if self.match:
            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                # The requested group was not defined in the pattern.
                return ""
        else:
            self.pos = len(self.s)
            return None



# Utility functions.

def new_block(region):

    "Start a new block in 'region'."

    region.add(Block([]))



# Parser abstractions.

class ParserBase:

    "Common parsing methods."

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats
        self.replaced_items = None

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        raise NotImplementedError

    def replace_items(self, items):

        "Replace the given 'items' with a sequence employing the same state."

        self.replaced_items = items
        return self.get_items(items.s, items.pos)

    def update_items(self, items):

        "Update the state of the replaced items with that of 'items'."

        self.replaced_items.pos = items.pos

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the resulting top-level region.
        """

        return self.parse_region(self.get_items(s))

    def parse_region(self, items, level=0, indent=0):

        """
        Parse the data provided by 'items' to populate a region with the given
        'level' at the given 'indent'. Return the new region.
        """

        region = Region([], level, indent)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(items, region)
        self.parse_region_type(items, region)

        return region

    def parse_region_type(self, items, region):

        """
        Given data provided by 'items', use configured parsers to parse the
        'region' based on its type.
        """

        # Find an appropriate parser given the type.
        # NOTE: dict.has_key was removed in Python 3; the "in" operator is
        # equivalent and works in both Python 2 and 3.

        if region.type in self.formats:
            self.formats[region.type].parse_region_content(items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(items, region)

    def parse_region_header(self, items, region):

        """
        Parse the region header from the 'items', setting it for the given
        'region'.
        """

        # An empty string means the header pattern matched at the current
        # position; None would mean no header was found at all.

        if items.read_until(["header"], False) == "":
            region.type = items.read_match()

    def parse_region_opaque(self, items, region):

        "Parse the data provided by 'items' to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(items, region, ["regionend"])

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate the given 'region'."

        pass

    # Parsing utilities.

    def parse_region_details(self, items, region, pattern_names):

        """
        Parse 'items' within 'region' searching using 'pattern_names'. Feature
        handlers may raise StopIteration (via end_region) to terminate the
        parsing loop.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = items.read_until(pattern_names)
                if preceding:
                    region.append_inline(Text(preceding))

                # End of input.

                if not items.matching:
                    break

                # Obtain any feature.
                # NOTE: self.handlers is expected to be provided by subclasses
                # - TODO confirm against concrete parser implementations.

                feature = items.read_match()
                handler = self.handlers.get(items.matching)

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, items, region)
                else:
                    region.append_inline(Text(feature))

        except StopIteration:
            pass

        region.normalise()

    def end_region(self, items, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration


# Format mapping initialisation.

def init_formats(formats):

    """
    Convert the given 'formats' mapping from a name-to-class mapping to a
    name-to-instance mapping with each parser instance employing the format
    mapping itself. Return the converted mapping.
    """

    d = {}
    for name, cls in formats.items():
        d[name] = cls(d)
    return d

# vim: tabstop=4 expandtab shiftwidth=4