#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

# NOTE: cgi.escape was deprecated in Python 3.2 and removed in Python 3.8
# (the cgi module itself was removed in 3.13). html.escape is the
# replacement; the quoting behaviour of the old function is selected
# explicitly at each call site below to keep the output unchanged.

from html import escape
import re

# Regular expressions.

syntax = {
    # Page regions:
    "marker" : (r"^\s*([{]{3,}|[}]{3,})", re.MULTILINE | re.DOTALL), # {{{... or }}}...

    # Region contents:
    "header" : (r"#!(.*?)\n", 0),          # #! char-excl-nl
    "break"  : (r"^\s*?\n", re.MULTILINE), # blank line (no capture group)
    }

# Compile the patterns once at import time.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)



# Document nodes.

class Container:

    "A container of document nodes."

    def __init__(self, nodes):

        # 'nodes' is the mutable list of child nodes.

        self.nodes = nodes

    def append(self, node):

        "Append 'node' to this container's children."

        self.nodes.append(node)

    def normalise(self):

        "Combine adjacent text nodes into single nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if not text:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text:
            self.append(text)

class Region(Container):

    "A region of the page."

    # Region types whose contents are parsed as wiki text.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):

        """
        Initialise the region with the given 'nodes', a marker 'level' (the
        number of braces in the opening marker, 0 for the whole page) and an
        optional region 'type' (from a #!type header).
        """

        Container.__init__(self, nodes)
        self.level = level
        self.type = type

    def have_start(self, s):

        "Return whether 's' opens a subregion within this region."

        return self.is_transparent() and s.startswith("{")

    def have_end(self, s):

        "Return whether 's' is the end marker corresponding to this region."

        return self.level and s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        "Return whether this region's contents are parsed as wiki text."

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def to_string(self, out):

        "Serialise this region and its children using the serialiser 'out'."

        out.start_region(self.level, self.type)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(self.level, self.type)

class Block(Container):

    "A block (paragraph) in the page."

    def __init__(self, nodes, final=True):

        # 'final' is False when the block was terminated by a paragraph
        # break, in which case the break is reproduced on serialisation.

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def to_string(self, out):

        "Serialise this block and its children using the serialiser 'out'."

        out.start_block(self.final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(self.final)

class Text:

    "A text node."

    def __init__(self, s):
        self.s = s

    def merge(self, text):

        "Merge the given 'text' node into this one."

        self.s += text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def to_string(self, out):
        out.text(self.s)



# Serialisation.

class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        # 'out' is a callable accepting a string fragment.

        self.out = out

class MoinSerialiser(Serialiser):

    "Serialisation of the page back to Moin wiki format."

    def start_region(self, level, type):
        out = self.out
        if level:
            out("{" * level)       # marker
        if type and level:
            out("#!%s\n" % type)   # header

    def end_region(self, level, type):
        out = self.out
        if level:
            out("}" * level)       # marker

    def start_block(self, final):
        pass

    def end_block(self, final):

        # Reinstate the paragraph break that terminated a non-final block.

        if not final:
            self.out("\n")

    def text(self, s):
        self.out(s)

class HTMLSerialiser(Serialiser):

    "Serialisation of the page as HTML."

    def start_region(self, level, type):
        l = []
        out = l.append
        if level:
            out("level-%d" % level)             # marker

        # NOTE: Encode type details for CSS.

        if type:
            out("type-%s" % escape(type, True)) # header; quote=True for the attribute

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):
        self.out("</span>")

    def start_block(self, final):
        self.out("<p>")

    def end_block(self, final):
        self.out("</p>")

    def text(self, s):

        # quote=False preserves the behaviour of the original cgi.escape,
        # which left quotes untouched in text content.

        self.out(escape(s, False))



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):
        self.s = s          # the complete input text
        self.pos = 0        # current reading position
        self.match = None   # the match object from the last search, if any
        self.matching = None # the name of the pattern that matched, if any

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None

        # Discard any state left over from a previous search: a stale match
        # would otherwise be consumed by a later read_match call, yielding
        # spurious tokens and corrupting the stream position.

        self.match = None
        self.matching = None

        # Find the earliest-starting match amongst the given patterns.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start, end = match.span()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self):

        "Return the matched text, updating the position in the stream."

        if self.match:
            _start, self.pos = self.match.span()

            # Patterns without a capture group (such as "break") yield the
            # whole match; others yield their first group. Unconditionally
            # requesting group 1 would raise IndexError for "break".

            if self.match.re.groups:
                s = self.match.group(1)
            else:
                s = self.match.group(0)
            self.match = None
            return s
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level Region representing the whole page.
    """

    items = TokenStream(s)

    # Define a region for the page and parse it.

    region = Region([])
    parse_region(items, region)
    return region

def parse_region(items, region):

    "Parse the data provided by 'items' to populate 'region'."

    # Parse section headers.

    parse_region_header(items, region)

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    # Process exposed text and sections.

    block = Block([])
    region.append(block)

    while True:

        # Obtain text before any marker or the end of the input.

        match_text = items.read_until(["break", "marker"])
        if match_text:
            block.append(Text(match_text))

        # Obtain any feature.

        feature = items.read_match()

        # End of input.

        if not items.matching:
            break

        # Start a section if an appropriate marker is given.

        if region.have_start(feature):

            # Define the section and parse it.

            _region = Region([], len(feature))
            region.append(_region)
            parse_region(items, _region)

            # Start a new block after the section.

            block = Block([])
            region.append(block)

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        elif region.have_end(feature):
            break

        # Start a new block if a paragraph break is found.

        elif items.matching == "break":
            block.final = False
            block = Block([])
            region.append(block)

        # Add any inappropriate marker to the text.

        else:
            block.append(Text(feature))

    region.normalise()

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Process exposed text and sections.

    while True:

        # Obtain text before any marker or the end of the input.

        match_text = items.read_until(["marker"])
        if match_text:
            region.append(Text(match_text))

        # Obtain any marker.

        marker = items.read_match()

        # End of input.

        if not marker:
            break

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        if region.have_end(marker):
            break

        # Add any inappropriate marker to the text.

        else:
            region.append(Text(marker))

    region.normalise()

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()



# Top-level functions.
# The module-level entry point for parsing page text.

parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    """
    Serialise the document 'doc' with an instance of the given 'serialiser'
    class, returning the complete output as a single string.
    """

    fragments = []
    writer = serialiser(fragments.append)
    doc.to_string(writer)
    return "".join(fragments)

# vim: tabstop=4 expandtab shiftwidth=4