#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

try:
    # The cgi module was deprecated in Python 3.8 and removed in Python 3.13.
    # Its replacement, html.escape, escapes quote characters by default,
    # whereas cgi.escape did not, so wrap it to preserve the original
    # behaviour of escape(s) and escape(s, True).

    from html import escape as _escape

    def escape(s, quote=False):

        "Escape HTML metacharacters in 's', also quotes if 'quote' is set."

        return _escape(s, quote)

except ImportError:
    from cgi import escape

import re

# Regular expressions: each entry maps a token name to a pattern string and
# the extra flags needed to compile it.

syntax = {
    # Page regions:
    "regionstart" : (r"^\s*([{]{3,})", re.MULTILINE | re.DOTALL), # {{{...
    "regionend" : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL), # }}}...
    "header" : (r"#!(.*?)\n", 0), # #! char-excl-nl

    # Region contents:
    "break" : (r"^(\s*?)\n", re.MULTILINE), # blank line
    }

# Compile the patterns once at import time for use by TokenStream.

patterns = dict((name, re.compile(value, re.UNICODE | flags))
                for name, (value, flags) in syntax.items())



# Document nodes.

class Container:

    "A container of document nodes."

    def __init__(self, nodes):
        self.nodes = nodes

    def append(self, node):
        self.nodes.append(node)

    # By default, textual content is appended like any other node; Region
    # overrides this to route text into the current block where appropriate.

    append_text = append

    def normalise(self):

        "Combine adjacent text nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if not text:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text:
            self.append(text)

    def __str__(self):
        return self.prettyprint()

    def prettyprint(self, indent=""):

        # Subclasses provide the real implementation; the base class has no
        # printable form of its own.

        pass

class Region(Container):

    "A region of the page."

    # Region types whose text is considered part of the surrounding wiki
    # content rather than opaque data.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):
        Container.__init__(self, nodes)
        self.level = level      # number of { characters in the marker
        self.type = type        # the #! header type, if any

    def append_text(self, s):

        # In transparent regions, text belongs to the current (last) block;
        # in opaque regions it is kept verbatim at the region level.

        if self.is_transparent():
            self.nodes[-1].append(s)
        else:
            self.append(s)

    def have_end(self, s):

        # An end marker must consist of exactly as many } characters as the
        # region's opening marker had { characters.

        return self.level and s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        # The top-level region (level 0) and wiki-typed regions contain
        # parsed wiki content.

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def prettyprint(self, indent=""):
        l = ["%sRegion: level=%d type=%s" % (indent, self.level, self.type)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):
        out.start_region(self.level, self.type)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(self.level, self.type)

class Block(Container):

    "A block in the page."

    def __init__(self, nodes, final=True):
        Container.__init__(self, nodes)
        self.final = final      # false if a paragraph break follows

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):
        l = ["%sBlock: final=%s" % (indent, self.final)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):
        out.start_block(self.final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(self.final)

class Text:

    "A text node."

    def __init__(self, s):
        self.s = s

    def merge(self, text):

        "Merge the content of the given 'text' node into this node."

        self.s += text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def prettyprint(self, indent=""):
        return "%sText: %r" % (indent, self.s)

    def to_string(self, out):
        out.text(self.s)



# Serialisation.

class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        # 'out' is a callable accepting each emitted string fragment.

        self.out = out

class MoinSerialiser(Serialiser):

    "Serialisation of the page back to Moin wiki markup."

    def start_region(self, level, type):
        out = self.out
        if level:
            out("{" * level) # marker
        if type and level:
            out("#!%s\n" % type) # header

    def end_region(self, level, type):
        out = self.out
        if level:
            out("}" * level) # marker

    def start_block(self, final):
        pass

    def end_block(self, final):

        # A non-final block is followed by a paragraph break.

        if not final:
            self.out("\n")

    def text(self, s):
        self.out(s)

class HTMLSerialiser(Serialiser):

    "Serialisation of the page as HTML."

    def start_region(self, level, type):
        l = []
        out = l.append
        if level:
            out("level-%d" % level) # marker

        # NOTE: Encode type details for CSS.

        if type:
            out("type-%s" % escape(type, True)) # header

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):
        self.out("</span>")

    def start_block(self, final):
        self.out("<p>")

    def end_block(self, final):
        self.out("</p>")

    def text(self, s):
        self.out(escape(s))



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):
        self.s = s
        self.pos = 0            # current position within 's'
        self.match = None       # most recent match object
        self.matching = None    # name of the pattern that matched

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None
        self.matching = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start, end = match.span()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self):

        "Return the matched text, updating the position in the stream."

        if self.match:
            _start, self.pos = self.match.span()
            s = self.match.group(1)
            return s
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0):

    """
    Parse the data provided by 'items' to populate a region at the given
    'level'.
    """

    region = Region([], level)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # An empty preceding string means the header occurs immediately; None
    # means no header was found at all.

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, ["break", "regionstart", "regionend"])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    "Parse 'items' within 'region' searching using 'pattern_names'."

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_text(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_text(Text(feature))

    # Handlers signal the end of the region by raising StopIteration.

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    # Mark any previous block as not being the final one in a sequence.

    block = region.nodes[-1]
    block.final = False
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.

    level = len(items.read_match())
    region.append(parse_region(items, level))
    new_block(region)

def parse_section_end(items, region):

    "Handle the end of a new section within 'region'."

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_text(Text(feature))

# Pattern handlers.
# Dispatch table mapping token names to handler functions; the None entry
# deals with the end of the input.

handlers = {
    None : end_region,
    "break" : parse_break,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    }

def new_block(region):

    "Start a new block in 'region'."

    region.append(Block([]))



# Top-level functions.

parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    "Serialise 'doc' using the given 'serialiser' class, returning a string."

    fragments = []
    doc.to_string(serialiser(fragments.append))
    return "".join(fragments)

# vim: tabstop=4 expandtab shiftwidth=4