#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# cgi.escape was removed in Python 3.8 (and the cgi module itself in 3.13), so
# prefer html.escape and only fall back to cgi on interpreters lacking it.

try:
    from html import escape as _html_escape
except ImportError:
    _html_escape = None
    from cgi import escape as _cgi_escape

import re

def escape(s, quote=False):

    """
    Return 's' with "&", "<" and ">" replaced by HTML entities. If 'quote' is
    true, double and single quote characters are replaced as well, making the
    result safe inside attribute values delimited by either kind of quote.

    The signature follows the cgi.escape function previously imported under
    this name, so 'quote' defaults to a false value.
    """

    if _html_escape is not None:
        return _html_escape(s, quote)

    # cgi.escape leaves single quotes alone even when 'quote' is set.

    s = _cgi_escape(s, quote)
    if quote:
        s = s.replace("'", "&#x27;")
    return s

# Regular expressions.

syntax = {
    # Page regions:
    "regionstart"   : (r"^\s*([{]{3,})", re.MULTILINE | re.DOTALL),    # {{{...
    "regionend"     : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL),    # }}}...
    "header"        : (r"#!(.*?)\n", 0),                                # #! char-excl-nl

    # Region contents:
    "break"         : (r"^(\s*?)\n", re.MULTILINE),                     # blank line
    "listitem"      : (r"^((\s+)([*]|\d+[.]))", re.MULTILINE),          # indent (list-item or number-item)

    # List contents:
    "listitemend"   : (r"^", re.MULTILINE),                             # next line
    }

# Define patterns for the regular expressions.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)



# Document nodes.

class Container:

    "A container of document nodes."

    def __init__(self, nodes):

        "Initialise the container with the given list of 'nodes'."

        self.nodes = nodes

    def append(self, node):

        "Add 'node' to the end of the container."

        self.nodes.append(node)

    # Text is added like any other node unless a subclass decides otherwise.

    append_text = append

    def empty(self):

        "Return whether the container has no nodes."

        return not self.nodes

    def normalise(self):

        "Combine adjacent text nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if text is None:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text is not None:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text is not None:
            self.append(text)

    def __str__(self):
        return self.prettyprint()

    def prettyprint(self, indent=""):

        "Return a multi-line description of this node, prefixed by 'indent'."

        return self._prettyprint_nodes("%sContainer:" % indent, indent)

    def _prettyprint_nodes(self, heading, indent):

        """
        Return 'heading' followed by the descriptions of the contained nodes,
        each on its own line and indented one step beyond 'indent'.
        """

        l = [heading]
        for node in self.nodes:
            l.append(node.prettyprint(indent + " "))
        return "\n".join(l)

class Region(Container):

    "A region of the page."

    # Region types whose contents are parsed as wiki markup.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):

        """
        Initialise the region with the given 'nodes', the 'level' (the number
        of brace characters in its markers, or zero for the page itself) and
        the 'type' taken from any region header.
        """

        Container.__init__(self, nodes)
        self.level = level
        self.type = type

    def append(self, node):

        "Add 'node', replacing the last node if that node is empty."

        last = self.nodes and self.nodes[-1]
        if last and last.empty():
            self.nodes[-1] = node
        else:
            self.nodes.append(node)

    def append_text(self, s):

        """
        Add the text node 's' to the last node of a transparent region, or to
        the region itself if it is opaque.
        """

        if self.is_transparent():
            self.nodes[-1].append(s)
        else:
            self.append(s)

    def have_end(self, s):

        "Return a true value if the marker 's' closes this region."

        return self.level and s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        "Return whether the region contents are parsed as wiki markup."

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def prettyprint(self, indent=""):
        return self._prettyprint_nodes(
            "%sRegion: level=%d type=%s" % (indent, self.level, self.type),
            indent)

    def to_string(self, out):

        "Serialise the region and its nodes using the serialiser 'out'."

        out.start_region(self.level, self.type)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(self.level, self.type)

class Block(Container):

    "A block in the page."

    def __init__(self, nodes, final=True):

        """
        Initialise the block with the given 'nodes'. A false 'final' indicates
        that a paragraph break follows the block.
        """

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):
        return self._prettyprint_nodes(
            "%sBlock: final=%s" % (indent, self.final), indent)

    def to_string(self, out):

        "Serialise the block and its nodes using the serialiser 'out'."

        out.start_block(self.final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(self.final)

class ListItem(Container):

    "A list item."

    def __repr__(self):
        return "ListItem(%r)" % self.nodes

    def prettyprint(self, indent=""):
        return self._prettyprint_nodes("%sListItem:" % indent, indent)

    def to_string(self, out):

        "Serialise the list item and its nodes using the serialiser 'out'."

        out.start_listitem()
        for node in self.nodes:
            node.to_string(out)
        out.end_listitem()


class Text:

    "A text node."

    def __init__(self, s):

        "Initialise the node with the string 's'."

        self.s = s

    def empty(self):

        "Return whether the node holds no text."

        return not self.s

    def merge(self, text):

        "Add the string held by the 'text' node to the end of this node."

        self.s += text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def prettyprint(self, indent=""):
        return "%sText: %r" % (indent, self.s)

    def to_string(self, out):

        "Serialise the text using the serialiser 'out'."

        out.text(self.s)



# Serialisation.

class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        "Initialise the serialiser with 'out', a callable accepting strings."

        self.out = out

class MoinSerialiser(Serialiser):

    "Serialisation of the page."

    def start_region(self, level, type):
        out = self.out
        if level:
            out("{" * level)    # marker
        if type and level:
            out("#!%s\n" % type)    # header

    def end_region(self, level, type):
        out = self.out
        if level:
            out("}" * level)    # marker

    def start_block(self, final):
        pass

    def end_block(self, final):

        # A non-final block is followed by a paragraph break.

        if not final:
            self.out("\n")

    def start_listitem(self):
        self.out(" *")

    def end_listitem(self):
        pass

    def text(self, s):
        self.out(s)

class HTMLSerialiser(Serialiser):

    "Serialisation of the page as HTML."

    def start_region(self, level, type):
        l = []
        out = l.append
        if level:
            out("level-%d" % level)    # marker

        # NOTE: Encode type details for CSS.
        # The type originates from page text and is placed in a single-quoted
        # attribute, so both kinds of quote must be escaped.

        if type:
            out("type-%s" % escape(type, True))    # header

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):
        self.out("</span>")

    def start_block(self, final):
        self.out("<p>")

    def end_block(self, final):
        self.out("</p>")

    def start_listitem(self):
        self.out("<li>")

    def end_listitem(self):
        self.out("</li>")

    def text(self, s):
        self.out(escape(s))



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream at the start of the string 's'."

        self.s = s
        self.pos = 0
        self.match = None       # match object from the last successful search
        self.matching = None    # name of the pattern producing that match

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The position in the stream is not changed: use read_match to consume
        the match (or, where nothing matched, the remaining text).
        """

        first = None
        self.matching = None

        # Forget any earlier match so that read_match cannot rewind to it when
        # nothing is found this time.

        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start, end = match.span()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self):

        """
        Return the matched text, updating the position in the stream. The
        first group of the match is returned, or an empty string if the
        pattern has no groups. Where the preceding read_until found nothing,
        the position moves to the end of the input and None is returned.
        """

        if self.match:
            _start, self.pos = self.match.span()
            try:
                return self.match.group(1)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0):

    """
    Parse the data provided by 'items' to populate a region at the given
    'level'. Return the region.
    """

    region = Region([], level)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # An empty string means that a header starts at the current position.

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, ["break", "listitem", "regionstart", "regionend"])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'. Parsing
    stops at the end of the input or when a handler raises StopIteration.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_text(Text(preceding))

            # End of input: consume the text just added so that an enclosing
            # region does not read and add it a second time.

            if not items.matching:
                items.read_match()
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_text(Text(feature))

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    # Mark any previous block as not being the final one in a sequence.

    block = region.nodes[-1]
    block.final = False
    new_block(region)

def parse_listitem_end(items, region):

    "Handle the end of a list."

    raise StopIteration

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    item = ListItem([])
    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.

    level = len(items.read_match())
    region.append(parse_region(items, level))
    new_block(region)

def parse_section_end(items, region):

    "Handle the end of a new section within 'region'."

    # A marker not closing this region is retained as text.

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_text(Text(feature))

# Pattern handlers.

handlers = {
    None : end_region,
    "break" : parse_break,
    "listitemend" : parse_listitem_end,
    "listitem" : parse_listitem,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    }

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.append(block)



# Top-level functions.

parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    """
    Return the string produced by serialising 'doc' with an instance of the
    given 'serialiser' class.
    """

    l = []
    doc.to_string(serialiser(l.append))
    return "".join(l)

# vim: tabstop=4 expandtab shiftwidth=4