#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Region, Block, ListItem, Text
import re

# Regular expressions.
#
# Each entry maps a pattern name to a (regular expression, flags) pair. By
# convention, group 1 of an expression holds the complete feature text that
# TokenStream.read_match returns by default, with nested groups exposing parts
# of it.

syntax = {
    # Page regions:
    "regionstart"   : (r"((^\s*)([{]{3,}))", re.MULTILINE | re.DOTALL),  # {{{...
    "regionend"     : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL),      # }}}...
    "header"        : (r"#!(.*?)\n", 0),                                 # #! char-excl-nl

    # Region contents:
    "break"         : (r"^(\s*?)\n", re.MULTILINE),                      # blank line
    "listitem"      : (r"^((\s+)([*]|\d+[.]))", re.MULTILINE),           # indent (list-item or number-item)

    # List contents:
    "listitemend"   : (r"^", re.MULTILINE),                              # next line
    }

# Define patterns for the regular expressions.
# Every pattern is compiled once, always with re.UNICODE in addition to its own
# flags.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)



# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    Attributes:

    s           the complete text being tokenised
    pos         the offset in 's' from which the next search starts
    match       the match object from the most recent successful search, or
                None if the most recent search found nothing
    matching    the name of the pattern that produced 'match', or None
    """

    def __init__(self, s):

        "Initialise the stream to read from the start of the string 's'."

        self.s = s
        self.pos = 0
        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The stream position is not changed by this method: only read_match
        consumes input. Where several patterns match at the same offset, the
        one appearing earliest in 'pattern_names' is chosen.
        """

        first = None

        # Forget any previous result. Resetting the match as well as the
        # pattern name ensures that read_match cannot consume a stale match
        # from an earlier search when this search finds nothing.

        self.matching = None
        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()

                # A strict comparison retains the earlier-listed pattern when
                # two patterns match at the same offset.

                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the pattern does not define 'group'. If
        there is no current match, the position is moved to the end of the text
        and None is returned.
        """

        if self.match is None:
            self.pos = len(self.s)
            return None

        # Consume the input up to the end of the match. Calling this method
        # again for the same match (to obtain other groups) leaves the position
        # unchanged.

        self.pos = self.match.end()

        # Patterns without the requested group (such as "listitemend", which
        # defines no groups at all) yield an empty string.

        try:
            return self.match.group(group)
        except IndexError:
            return ""



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the region produced by parse_region for the complete text.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.
    # NOTE(review): is_transparent is defined in moinformat.tree; presumably it
    # depends on the region type set when parsing the header - confirm.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.

    A header is only recognised where it occurs at the current position in the
    stream, this being indicated by there being no text preceding the match.
    Otherwise, the stream position and the region are left unchanged.
    """

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    # Wiki regions start with an initial block to receive text.

    new_block(region)
    parse_region_details(items, region, ["break", "listitem", "regionstart", "regionend"])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Only the end of a region is of interest within opaque regions.

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text found between features is appended to 'region', and each feature is
    passed to the handler registered for its pattern name in 'handlers'.
    Parsing stops at the end of the input or when a handler raises
    StopIteration, after which the region is normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_text(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_text(Text(feature))

    # Handlers signal the end of the current region or list item by raising
    # StopIteration.

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    # Mark any previous block as not being the final one in a sequence.

    block = region.nodes[-1]
    block.final = False
    new_block(region)

def parse_listitem_end(items, region):

    "Handle the end of a list."

    raise StopIteration

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    # Populate the item with content up to the start of the next line, then
    # add it to the region and start a new block after it.

    item = ListItem([])
    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.
    # Group 2 of the "regionstart" pattern holds the whitespace before the
    # marker and group 3 holds the opening braces, so their lengths provide the
    # indent and level respectively.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.append(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    "Handle the end of a new section within 'region'."

    # NOTE(review): have_end is defined in moinformat.tree; presumably it tests
    # whether the closing marker corresponds to the region's level - confirm.
    # Markers not ending the region are retained as text.

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_text(Text(feature))

# Pattern handlers.
# Handlers are looked up using the name of the matching pattern and are called
# with the token stream and the region being populated.

handlers = {
    None : end_region,
    "break" : parse_break,
    "listitemend" : parse_listitem_end,
    "listitem" : parse_listitem,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    }

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.append(block)



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4