#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Region, Text
import re

# Pattern management.

def get_patterns(syntax):

    """
    Define patterns for the regular expressions in the 'syntax' mapping.
    Return a new dictionary mapping each name in 'syntax' to its expression
    compiled with the UNICODE and MULTILINE flags.
    """

    patterns = {}
    for name, value in syntax.items():
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns

def combine_patterns(patterns, syntax):

    """
    Combine 'patterns' with those defined by the given 'syntax' mapping.
    Return a new dictionary, leaving 'patterns' unmodified. Where a name
    appears in both, the pattern compiled from 'syntax' takes precedence.
    """

    p = {}
    p.update(patterns)
    p.update(get_patterns(syntax))
    return p

# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, patterns):

        """
        Initialise the stream for the string 's', searching it using the
        given 'patterns' mapping from pattern names to compiled expressions.
        """

        self.s = s
        self.patterns = patterns

        # Current position in the string.

        self.pos = 0

        # The match object and pattern name from the most recent search.

        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start."

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The position in the stream is not changed by this method: use
        read_match to consume the match. Where several patterns match at the
        same position, the one appearing earliest in 'pattern_names' is
        chosen.
        """

        first = None

        # Discard the results of any previous search so that a failed search
        # cannot leave a stale match for read_match to consume.

        self.matching = None
        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = self.patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        Where the pattern does not define the requested group, an empty string
        is returned. Where there is no current match, the position is moved to
        the end of the string and None is returned.
        """

        if self.match:

            # Continue from the end of the whole match, regardless of group.

            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Utility functions.

def new_block(region):

    "Start a new block in 'region'."

    region.add(Block([]))



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    NOTE: parse_region_details consults a 'handlers' attribute that is not
    defined here; it is presumably a mapping from pattern names to handler
    functions supplied by subclasses.
    """

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_items(self, s):

        "Return a sequence of token items for 's'."

        raise NotImplementedError

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        return self.parse_region(self.get_items(s))

    def parse_region(self, items, level=0, indent=0):

        """
        Parse the data provided by 'items' to populate a region with the given
        'level' at the given 'indent'. Return the populated region.
        """

        region = Region([], level, indent)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(items, region)
        self.parse_region_type(items, region)

        return region

    def parse_region_type(self, items, region):

        """
        Given data provided by 'items', use configured parsers to parse the
        'region' based on its type.
        """

        formats = self.formats

        # Find an appropriate parser given the type. A membership test is used
        # instead of has_key, which is unavailable in Python 3, and a parser
        # initialised without any formats treats every region as opaque.

        if formats is not None and region.type in formats:
            formats[region.type].parse_region_content(items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(items, region)

    def parse_region_header(self, items, region):

        """
        Parse the region header from the 'items', setting it for the given 'region'.
        A header is only recognised if it occurs at the current position.
        """

        # An empty string means that a header was found with no preceding text.

        if items.read_until(["header"], False) == "": # None means no header
            region.type = items.read_match()

    def parse_region_opaque(self, items, region):

        "Parse the data provided by 'items' to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(items, region, ["regionend"])

    def parse_region_content(self, items, region):

        """
        Parse the data provided by 'items' to populate the given 'region'.
        This implementation does nothing.
        """

        pass

    # Parsing utilities.

    def parse_region_details(self, items, region, pattern_names):

        """
        Parse 'items' within 'region' searching using 'pattern_names'.

        Text found before each match is added to the region, and each match is
        passed to the handler registered for its pattern name or, without such
        a handler, added to the region as text. Parsing stops at the end of
        the input or when a handler raises StopIteration, as end_region does.
        The region is normalised before returning.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = items.read_until(pattern_names)
                if preceding:
                    region.append_inline(Text(preceding))

                # End of input.

                if not items.matching:
                    break

                # Obtain any feature.

                feature = items.read_match()
                handler = self.handlers.get(items.matching)

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, items, region)
                else:
                    region.append_inline(Text(feature))

        except StopIteration:
            pass

        region.normalise()

    def end_region(self, items, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

# vim: tabstop=4 expandtab shiftwidth=4