#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, ListItem, Region, Rule, Text
import re

# Regular expressions.
#
# Each entry maps a pattern name to a (regular expression, flags) pair. By
# convention, group 1 of an expression holds the pertinent matched text, with
# any groups nested inside group 1 exposing sections of that text.

syntax = {
    # Page regions:
    "regionstart"   : (r"((^\s*)([{]{3,}))", re.MULTILINE | re.DOTALL),    # {{{...
    "regionend"     : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL),        # }}}...
    "header"        : (r"#!(.*?)\n", 0),                                   # #! char-excl-nl

    # Region contents:
    "break"         : (r"^(\s*?)\n", re.MULTILINE),                        # blank line
    "listitem"      : (r"^((\s+)([*]|\d+[.]))", re.MULTILINE),             # indent (list-item or number-item)
    "rule"          : (r"(-----*)", 0),                                    # ----...

    # List contents:
    "listitemend"   : (r"^", re.MULTILINE),                                # next line
    }

# Define patterns for the regular expressions.
# Patterns are compiled once at import time; re.UNICODE is added to the flags
# of every pattern.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)



# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records the current position 'pos' in the string 's', the most
    recent successful match object in 'match', and the name of the pattern that
    produced it in 'matching'. Both 'match' and 'matching' are None when the
    most recent search found nothing.
    """

    def __init__(self, s):

        "Initialise the stream with the string 's', positioned at its start."

        self.s = s
        self.pos = 0
        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The search starts at the current position, which is not changed by this
        method: call 'read_match' to consume the match. Where several patterns
        match at the same earliest position, the pattern appearing first in
        'pattern_names' is chosen.
        """

        first = None

        # Forget any previous result so that a failed search cannot leave a
        # stale match behind for read_match to consume again.

        self.matching = None
        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()

                # Only a strictly earlier match replaces the current choice.

                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the pattern does not define 'group'.
        Where there is no current match, the position is moved to the end of
        the string and None is returned.

        The position is always set to the end of the whole match, so this
        method may be called repeatedly to obtain different groups of the same
        match.
        """

        if self.match:
            self.pos = self.match.end()
            try:
                return self.match.group(group)

            # Patterns without the requested group (such as "listitemend")
            # yield no text.

            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.

    Return the top-level region produced by 'parse_region' for the whole text.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'.

    Any header is parsed first. The body is then parsed as wiki content if the
    region reports itself as transparent, or as opaque content otherwise. The
    populated region is returned.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.

    A header is only recognised if it occurs immediately at the current
    position, in which case it is consumed and the text following "#!" on that
    line is stored as the region's 'type'.
    """

    # An empty preceding text means the header starts exactly at the current
    # position. None means no header was found at all, and any other text means
    # that the header occurs later and does not belong to this region's start.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    # Content is collected in blocks, so an initial block is always provided.

    new_block(region)
    parse_region_details(items, region, ["break", "listitem", "regionstart", "regionend", "rule"])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Only end markers are significant within opaque content.

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text preceding each recognised feature is appended to 'region', and the
    feature is then passed to its handler from the 'handlers' table. Parsing
    stops at the end of the input or when a handler raises StopIteration to
    signal the end of the region. The region is normalised before returning.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_text(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.
            # Reading the match also advances the stream beyond the feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_text(Text(feature))

    # Handlers raise StopIteration to indicate the end of the region.

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    # Mark any previous block as not being the final one in a sequence.

    block = region.nodes[-1]
    block.final = False
    new_block(region)

def parse_listitem_end(items, region):

    "Handle the end of a list."

    raise StopIteration

def parse_listitem(items, region):

    """
    Handle a list item marker within 'region'.

    The item's content is parsed up to the start of the next line, the item is
    appended to 'region', and a new block is started after it.
    """

    item = ListItem([])
    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_rule(items, region):

    """
    Handle a horizontal rule within 'region'.

    The rule records the number of characters in the matched marker, and a new
    block is started after it.
    """

    length = len(items.read_match(1))
    rule = Rule(length)
    region.append(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.
    # Group 2 of the start marker is the leading whitespace and group 3 is the
    # run of opening braces, their lengths giving the indent and level.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.append(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region'.

    The closing brace marker is offered to the region's 'have_end' method. If
    the region accepts it, StopIteration is raised to end the region.
    Otherwise, the marker is ordinary content and is added to the region as
    text.
    """

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:

        # The end pattern also consumes whitespace before the braces, possibly
        # including blank lines, so the complete matched text is restored and
        # not merely the braces, ensuring that no content is lost.

        region.append_text(Text(items.read_match(0)))

# Pattern handlers.
# Each handler is called with the token stream and the region being populated.

handlers = {
    None : end_region,
    "break" : parse_break,
    "listitemend" : parse_listitem_end,
    "listitem" : parse_listitem,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    "rule" : parse_rule,
    }

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.append(block)



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4