#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import re

# cgi.escape was removed in Python 3.8 (and the cgi module in 3.13), so prefer
# html.escape and only fall back to cgi on Python 2.

try:
    from html import escape as _escape
except ImportError:
    from cgi import escape as _escape

def escape(s, quote=False):

    """
    Return 's' with "&", "<" and ">" replaced by HTML entities. If 'quote' is
    true, double and single quotes are replaced as well, making the result
    safe inside an attribute value delimited by either kind of quote.
    """

    s = _escape(s, quote)

    # cgi.escape leaves single quotes alone; html.escape has already replaced
    # them, making this a no-op on Python 3.

    if quote:
        s = s.replace("'", "&#x27;")
    return s

# Regular expressions.

syntax = {
    # Page regions:
    "markers"       : (r"^\s*([{]{3,}|[}]{3,})", re.MULTILINE | re.DOTALL), # {{{... or }}}...

    # Region contents:
    "header"        : (r"\A#!(.*?)\n", 0),                                  # #! char-excl-nl

    # NOTE: At least one whitespace character is required because re.split
    # NOTE: also splits on empty matches from Python 3.7 onwards. Earlier
    # NOTE: versions ignored empty matches, so this keeps the same splitting
    # NOTE: behaviour on every version.

    "region text"   : (r"(^\s+$)", re.MULTILINE),                           # blank line
    }

# Define patterns for the regular expressions.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)



# Document nodes.

class Container(object):

    "A container of document nodes."

    def __init__(self, nodes):

        "Initialise the container with the given list of 'nodes'."

        self.nodes = nodes

    def append(self, node):

        "Add 'node' to the end of the container."

        self.nodes.append(node)

    def normalise(self):

        """
        Combine adjacent text nodes. The first text node of each run is kept
        and modified in place to hold the text of the whole run.
        """

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if text is None:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text is not None:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text is not None:
            self.append(text)

class Region(Container):

    "A region of the page."

    # Region types whose contents are parsed as wiki text.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):

        """
        Initialise the region with the given 'nodes', the marker 'level' (the
        number of braces in its markers, or zero for the page itself) and the
        'type' given by any region header.
        """

        Container.__init__(self, nodes)
        self.level = level
        self.type = type

    def expand(self):

        """
        Expand text nodes by parsing them as region text, if the region is
        understandable to the standard parser.
        """

        if self.is_transparent():
            nodes = self.nodes
            self.nodes = []

            for node in nodes:
                if isinstance(node, Text):
                    parse_region_text(node.s, self)
                else:
                    self.append(node)

    def have_start(self, s):

        """
        Return whether 's' is a start marker opening a region inside this one.
        Only a string made up solely of three or more opening braces counts,
        so that ordinary text beginning with "{" is not taken for a marker.
        """

        return self.is_transparent() and len(s) >= 3 and s == "{" * len(s)

    def have_end(self, s):

        """
        Return whether 's' is the end marker for this region: closing braces
        only, and exactly as many as the region's level.
        """

        return bool(self.level) and s == "}" * self.level

    def is_transparent(self):

        """
        Return whether the region's contents are parsed as wiki text: true for
        the page itself (level zero) and for the transparent region types.
        """

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def to_string(self, out):

        "Serialise the region and its nodes using the serialiser 'out'."

        out.start_region(self.level, self.type)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(self.level, self.type)

class Block(Container):

    "A block in the page."

    def __init__(self, nodes, final=True):

        """
        Initialise the block with the given 'nodes'. A block is 'final' unless
        it was ended by a blank line separating it from a following block.
        """

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def to_string(self, out):

        "Serialise the block and its nodes using the serialiser 'out'."

        out.start_block(self.final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(self.final)

class Text(object):

    "A text node."

    def __init__(self, s):

        "Initialise the node with the string 's'."

        self.s = s

    def merge(self, text):

        "Append the string held by the 'text' node to this node's string."

        self.s += text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def to_string(self, out):

        "Serialise the text using the serialiser 'out'."

        out.text(self.s)



# Serialisation.

class Serialiser(object):

    "General serialisation support."

    def __init__(self, out):

        "Initialise the serialiser with 'out', a callable accepting strings."

        self.out = out

class MoinSerialiser(Serialiser):

    "Serialisation of the page as Moin wiki text."

    def start_region(self, level, type):

        "Emit the start marker and any header for a region having a level."

        out = self.out
        if level:
            out("{" * level)    # marker
        if type and level:
            out("#!%s\n" % type)    # header

    def end_region(self, level, type):

        "Emit the end marker for a region having a level."

        out = self.out
        if level:
            out("}" * level)    # marker

    def start_block(self, final):
        pass

    def end_block(self, final):

        "Emit a newline in place of the blank line that ended the block."

        if not final:
            self.out("\n")

    def text(self, s):
        self.out(s)

class HTMLSerialiser(Serialiser):

    "Serialisation of the page as HTML."

    def start_region(self, level, type):

        "Open a span whose classes record the region 'level' and 'type'."

        l = []
        out = l.append
        if level:
            out("level-%d" % level) # marker

        # NOTE: Encode type details for CSS.
        # The attribute below is single-quoted, so both kinds of quote must be
        # escaped for the type to be unable to terminate the attribute.

        if type:
            out("type-%s" % escape(type, True)) # header

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):
        self.out("</span>")

    def start_block(self, final):
        self.out("<p>")

    def end_block(self, final):
        self.out("</p>")

    def text(self, s):
        self.out(escape(s))



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the region representing the whole page.
    """

    # Define tokens for interpretation by the parser: splitting yields text
    # and captured markers alternately, always starting with text.

    items = iter(patterns["markers"].split(s))

    # Define a region for the page and parse it.

    region = Region([])
    parse_region(items, region)
    return region

def parse_region(items, region):

    """
    Parse the data provided by the iterator 'items' to populate 'region'.
    Items are consumed up to and including the region's end marker, or until
    the input is exhausted.
    """

    first = True

    # Process exposed text and sections.

    try:
        for match_text in items:

            # Parse section headers, which can only start a region's text.

            if first:
                match_text = parse_region_header(match_text, region)
                first = False

            # Start a section if an appropriate marker is given.

            if region.have_start(match_text):

                # Define the section and parse it. The nested call consumes
                # items from the same iterator up to the section's end.

                _region = Region([], len(match_text))
                region.append(_region)
                parse_region(items, _region)

            # Interpret the given marker, closing the current section if the
            # given marker is the corresponding end marker for the current
            # section.

            elif region.have_end(match_text):
                break

            # Otherwise, keep any text in the region. Empty strings, which
            # splitting yields around adjacent markers, contribute nothing.

            elif match_text:
                region.append(Text(match_text))

    finally:
        region.normalise()

        # Parse region contents, if possible.

        region.expand()

def parse_region_header(s, region):

    """
    Parse the text 's', extracting any region header and setting it for the
    given 'region'. Return the remaining text.
    """

    # Matching, rather than splitting and stepping through the pieces, copes
    # with empty text: there is then simply no header.

    match = patterns["header"].match(s)

    if match is None:
        return s

    region.type = match.group(1)
    return s[match.end():]

def parse_region_text(s, region):

    """
    Parse the text 's' as part of 'region', adding a block to the region for
    each run of text separated by blank lines.
    """

    block = Block([])
    region.append(block)

    # Splitting yields text and blank line separators alternately, with text
    # at the even positions.

    for index, match_text in enumerate(patterns["region text"].split(s)):

        # A separator ends the current block and starts a new one.

        if index % 2:
            block.final = False
            block = Block([])
            region.append(block)

        # Text belongs to the current block.

        elif match_text:
            block.append(Text(match_text))



# Top-level functions.

parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    """
    Return a string serialising the document 'doc' using an instance of the
    given 'serialiser' class.
    """

    l = []
    doc.to_string(serialiser(l.append))
    return "".join(l)

# vim: tabstop=4 expandtab shiftwidth=4