#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Region, Text

# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream keeps a current position in the string, together with the most
    recent match object ('match') and the name of the pattern that produced it
    ('matching'). Both are None when the most recent search found nothing.
    """

    def __init__(self, s, patterns):

        """
        Initialise the stream for string 's' using 'patterns', a mapping from
        pattern names to compiled regular expression objects.
        """

        self.s = s
        self.patterns = patterns
        self.pos = 0
        self.match = None
        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length'. The position is never moved before
        the start of the string.
        """

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The position in the stream is not changed: use 'read_match' to consume
        the match. Where several patterns match at the same position, the one
        appearing earliest in 'pattern_names' is chosen.
        """

        first = None

        # Discard the outcome of any previous search so that a failed search
        # cannot leave a stale match for 'read_match' to consume.

        self.matching = None
        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = self.patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()

                # Only a strictly earlier match replaces the current choice.

                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the pattern does not define 'group'.
        Without a current match, the position is moved to the end of the string
        and None is returned.
        """

        if self.match:

            # Continue reading from the end of the matched text.

            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Utility functions.

def new_block(region):

    "Start a new block in 'region' by adding an empty Block to it."

    region.add(Block([]))



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    NOTE: 'parse_region_details' looks up feature handlers in 'self.handlers',
    which is not defined by this class; presumably subclasses provide a mapping
    from pattern names to handler functions.
    """

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_items(self, s):

        """
        Return a sequence of token items for 's'. This must be provided by
        subclasses.
        """

        raise NotImplementedError

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        return self.parse_region(self.get_items(s))

    def parse_region(self, items, level=0, indent=0):

        """
        Parse the data provided by 'items' to populate a region with the given
        'level' at the given 'indent'. Return the populated region.
        """

        region = Region([], level, indent)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(items, region)
        self.parse_region_type(items, region)

        return region

    def parse_region_type(self, items, region):

        """
        Given data provided by 'items', use configured parsers to parse the
        'region' based on its type. Regions whose type has no configured
        parser, including all regions when no formats were given, are treated
        as opaque.
        """

        formats = self.formats

        # Find an appropriate parser given the type.

        if formats is not None and region.type in formats:
            formats[region.type].parse_region_content(items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(items, region)

    def parse_region_header(self, items, region):

        """
        Parse the region header from the 'items', setting it for the given 'region'.
        The type is only set when a header is found at the current position.
        """

        # An empty preceding text means that the header starts immediately.

        if items.read_until(["header"], False) == "": # None means no header
            region.type = items.read_match()

    def parse_region_opaque(self, items, region):

        """
        Parse the data provided by 'items' to populate an opaque 'region'. The
        region is marked as not transparent and only region end markers are
        sought within it.
        """

        region.transparent = False
        self.parse_region_details(items, region, ["regionend"])

    def parse_region_content(self, items, region):

        """
        Parse the data provided by 'items' to populate the given 'region'. This
        implementation does nothing.
        """

        pass

    # Parsing utilities.

    def parse_region_details(self, items, region, pattern_names):

        """
        Parse 'items' within 'region' searching using 'pattern_names'.

        Text found between features is added to the region as inline text. Each
        feature is passed to the handler registered for its pattern name in
        'self.handlers', or added as inline text if there is no such handler.
        Parsing stops at the end of the input or when a handler raises
        StopIteration (see 'end_region'). The region is then normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = items.read_until(pattern_names)
                if preceding:
                    region.append_inline(Text(preceding))

                # End of input.

                if not items.matching:
                    break

                # Obtain any feature.

                feature = items.read_match()
                handler = self.handlers.get(items.matching)

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, items, region)
                else:
                    region.append_inline(Text(feature))

        # A handler signals the end of the region by raising StopIteration.

        except StopIteration:
            pass

        region.normalise()

    def end_region(self, items, region):

        """
        End the parsing of 'region', breaking out of the parsing loop by raising
        StopIteration.
        """

        raise StopIteration

# vim: tabstop=4 expandtab shiftwidth=4