1.1 --- a/moinformat.py Fri Apr 28 18:56:50 2017 +0200
1.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
1.3 @@ -1,530 +0,0 @@
1.4 -#!/usr/bin/env python
1.5 -
1.6 -"""
1.7 -Moin wiki format parser.
1.8 -
1.9 -Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>
1.10 -
1.11 -This program is free software; you can redistribute it and/or modify it under
1.12 -the terms of the GNU General Public License as published by the Free Software
1.13 -Foundation; either version 3 of the License, or (at your option) any later
1.14 -version.
1.15 -
1.16 -This program is distributed in the hope that it will be useful, but WITHOUT
1.17 -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1.18 -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
1.19 -details.
1.20 -
1.21 -You should have received a copy of the GNU General Public License along with
1.22 -this program. If not, see <http://www.gnu.org/licenses/>.
1.23 -"""
1.24 -
1.25 -from cgi import escape
1.26 -import re
1.27 -
1.28 -# Regular expressions.
1.29 -
# Each entry maps a pattern name to a (regular expression, flags) pair.
#
# Whitespace before a region or list marker is matched with [^\S\n], meaning
# "any whitespace character except a newline". Using \s here would let a marker
# match start on an earlier blank line and absorb it: inside an opaque region,
# where only "regionend" is searched for, blank lines preceding the closing
# braces would be consumed as part of the marker and lost from the region's
# text. The captured indent groups are likewise limited to the marker's own line.

syntax = {
    # Page regions:
    "regionstart"   : (r"((^[^\S\n]*)([{]{3,}))", re.MULTILINE),    # {{{...
    "regionend"     : (r"^[^\S\n]*([}]{3,})", re.MULTILINE),        # }}}...
    "header"        : (r"#!(.*?)\n", 0),                            # #! char-excl-nl

    # Region contents:
    "break"         : (r"^(\s*?)\n", re.MULTILINE),                 # blank line
    "listitem"      : (r"^(([^\S\n]+)([*]|\d+[.]))", re.MULTILINE), # indent (list-item or number-item)

    # List contents:
    "listitemend"   : (r"^", re.MULTILINE),                         # next line
    }
1.43 -
1.44 -# Define patterns for the regular expressions.
1.45 -
# Compile every syntax entry once, always enabling Unicode-aware matching in
# addition to the flags given for the individual entry.

patterns = {}
for name in syntax:
    value, flags = syntax[name]
    patterns[name] = re.compile(value, flags | re.UNICODE)
1.49 -
1.50 -
1.51 -
1.52 -# Document nodes.
1.53 -
class Container:

    "A container of document nodes."

    def __init__(self, nodes):

        "Initialise the container with the given list of 'nodes'."

        self.nodes = nodes

    def append(self, node):

        "Add 'node' to the end of this container."

        self.nodes.append(node)

    # By default, text is added in the same way as any other node.

    append_text = append

    def empty(self):

        "Return whether this container holds no nodes."

        return not self.nodes

    def normalise(self):

        "Combine adjacent text nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if text is None:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.
            # Nodes are re-added using append so that any overriding
            # definition of that method is respected.

            else:
                if text is not None:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text is not None:
            self.append(text)

    def __str__(self):
        return self.prettyprint()

    def prettyprint(self, indent=""):

        """
        Return a multi-line description of this container and its nodes, with
        each line prefixed by 'indent' and each nesting level indented further.

        A string is always returned: __str__ relies on this method, and str()
        raises TypeError if given anything other than a string.
        """

        l = ["%s%s:" % (indent, self.__class__.__name__)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + " "))
        return "\n".join(l)
1.105 -
class Region(Container):

    "A page region, possibly nested within another region."

    # Region types whose content is parsed as wiki markup.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, indent=0, type=None):

        """
        Initialise the region with the given 'nodes', the 'level' (passed to
        serialisers and compared with the length of end markers), the 'indent'
        and the optional region 'type'.
        """

        Container.__init__(self, nodes)
        self.level = level
        self.indent = indent
        self.type = type

    def append(self, node):

        "Add 'node', replacing the final node if that node is empty."

        if self.nodes:
            last = self.nodes[-1]
            if last and last.empty():
                self.nodes[-1] = node
                return

        self.nodes.append(node)

    def append_text(self, s):

        """
        Add the text node 's' to the final node of a transparent region or
        directly to an opaque region.
        """

        if not self.is_transparent():
            self.append(s)
        else:
            self.nodes[-1].append(s)

    def have_end(self, s):

        """
        Return a true value if 's' is an end marker for this region: the region
        must have a level and 's' must start with a closing brace and be as
        long as the level.
        """

        if not self.level:
            return self.level
        return s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        """
        Return whether the region has no level or has a type that is listed as
        transparent.
        """

        if not self.level:
            return True
        return self.type in self.transparent_region_types

    def __repr__(self):
        details = (self.nodes, self.level, self.indent, self.type)
        return "Region(%r, %r, %r, %r)" % details

    def prettyprint(self, indent=""):

        "Return a description of the region with each line prefixed by 'indent'."

        heading = "%sRegion: level=%d indent=%d type=%s" % (
            indent, self.level, self.indent, self.type)
        nested = indent + " "
        return "\n".join([heading] + [node.prettyprint(nested) for node in self.nodes])

    def to_string(self, out):

        "Serialise the region and its nodes using the serialiser 'out'."

        details = (self.level, self.indent, self.type)
        out.start_region(*details)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(*details)
1.151 -
class Block(Container):

    "A block of content within a region."

    def __init__(self, nodes, final=True):

        """
        Initialise the block with the given 'nodes'. The 'final' flag is passed
        on to serialisers and is cleared when a paragraph break follows the
        block.
        """

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):

        "Return a description of the block with each line prefixed by 'indent'."

        nested = indent + " "
        lines = ["%sBlock: final=%s" % (indent, self.final)]
        lines += [node.prettyprint(nested) for node in self.nodes]
        return "\n".join(lines)

    def to_string(self, out):

        "Serialise the block and its nodes using the serialiser 'out'."

        final = self.final
        out.start_block(final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(final)
1.174 -
class ListItem(Container):

    "An item within a list."

    def __repr__(self):
        return "ListItem(%r)" % self.nodes

    def prettyprint(self, indent=""):

        "Return a description of the item with each line prefixed by 'indent'."

        nested = indent + " "
        lines = ["%sListItem:" % indent]
        lines += [node.prettyprint(nested) for node in self.nodes]
        return "\n".join(lines)

    def to_string(self, out):

        "Serialise the item and its nodes using the serialiser 'out'."

        out.start_listitem()
        for node in self.nodes:
            node.to_string(out)
        out.end_listitem()
1.193 -
1.194 -
class Text:

    "A leaf node holding a run of text."

    def __init__(self, s):

        "Initialise the node with the string 's'."

        self.s = s

    def __repr__(self):
        return "Text(%r)" % self.s

    def empty(self):

        "Return whether the node holds no text."

        if self.s:
            return False
        return True

    def merge(self, text):

        "Extend this node with the content of the 'text' node."

        self.s += text.s

    def prettyprint(self, indent=""):

        "Return a one-line description of the node prefixed by 'indent'."

        return "%s%s" % (indent, "Text: %r" % (self.s,))

    def to_string(self, out):

        "Pass the text to the serialiser 'out'."

        out.text(self.s)
1.216 -
1.217 -
1.218 -
1.219 -# Serialisation.
1.220 -
class Serialiser:

    "Common support for serialisers."

    def __init__(self, out):

        """
        Initialise the serialiser with 'out', a callable that is given each
        piece of output in turn.
        """

        self.out = out
1.227 -
class MoinSerialiser(Serialiser):

    "Serialisation of the page as Moin wiki markup."

    def start_region(self, level, indent, type):

        """
        Emit the opening marker of a region having a 'level', preceded by its
        'indent' and followed by a header line if a 'type' is given. Regions
        without a level produce no output.
        """

        if not level:
            return

        write = self.out
        write(" " * indent + "{" * level)
        if type:
            write("#!%s\n" % type)

    def end_region(self, level, indent, type):

        "Emit the closing marker of a region having a 'level'."

        if not level:
            return

        self.out("}" * level)

    def start_block(self, final):

        "Blocks have no opening markup."

        pass

    def end_block(self, final):

        "Emit a newline after any block not marked as 'final'."

        if final:
            return

        self.out("\n")

    def start_listitem(self):

        "Emit a list item marker."

        self.out(" *")

    def end_listitem(self):

        "List items have no closing markup."

        pass

    def text(self, s):

        "Emit the text 's' unchanged."

        self.out(s)
1.259 -
class HTMLSerialiser(Serialiser):

    "Serialisation of the page as HTML."

    def _escape(self, s, quote=False):

        """
        Return 's' with the characters &, < and > replaced by HTML entities. If
        'quote' is true, double and single quotes are replaced as well, making
        the result safe inside an attribute value delimited by either kind of
        quote.
        """

        # The import is local so that the escaping function is chosen wherever
        # this code runs: html.escape where available, cgi.escape otherwise.

        try:
            from html import escape as escape_html
        except ImportError:
            from cgi import escape as escape_html

        s = escape_html(s, quote)

        # cgi.escape only replaces double quotes when quoting, so replace single
        # quotes explicitly. This has no effect where html.escape has already
        # replaced them.

        if quote:
            s = s.replace("'", "&#x27;")
        return s

    def start_region(self, level, indent, type):

        """
        Emit an opening span whose class attribute records the 'level', 'indent'
        and 'type' of the region, omitting any that are zero or not given.
        """

        l = []
        out = l.append
        if level:
            out("level-%d" % level)

        if indent:
            out("indent-%d" % indent)

        # NOTE: Encode type details for CSS.
        # The type originates in page text and is placed inside a single-quoted
        # attribute, so single quotes must be escaped to keep it contained.

        if type:
            out("type-%s" % self._escape(type, True))

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, indent, type):

        "Emit the closing tag of a region."

        self.out("</span>")

    def start_block(self, final):

        "Emit the opening tag of a block."

        self.out("<p>")

    def end_block(self, final):

        "Emit the closing tag of a block."

        self.out("</p>")

    def start_listitem(self):

        "Emit the opening tag of a list item."

        self.out("<li>")

    def end_listitem(self):

        "Emit the closing tag of a list item."

        self.out("</li>")

    def text(self, s):

        "Emit the text 's' with HTML special characters escaped."

        self.out(self._escape(s))
1.297 -
1.298 -
1.299 -
1.300 -# Tokenising functions.
1.301 -
class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream to read from the start of the string 's'."

        self.s = s
        self.pos = 0

        # The most recent match object and the name of the pattern producing it.

        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The position in the stream is not changed: use read_match to consume
        the match. Where several patterns match at the same position, the one
        appearing first in 'pattern_names' is chosen.
        """

        first = None

        # Discard the results of any earlier search so that a failed search
        # leaves no stale match for read_match to act upon.

        self.matching = None
        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        If the pattern does not define the indicated group, an empty string is
        returned. If there is no current match, the position is moved to the end
        of the stream and None is returned.
        """

        if self.match is None:
            self.pos = len(self.s)
            return None

        self.pos = self.match.end()
        try:
            return self.match.group(group)
        except IndexError:
            return ""
1.360 -
1.361 -
1.362 -
1.363 -# Parser functions.
1.364 -
def parse_page(s):

    """
    Parse the page text 's', returning the region representing the whole page.
    Pages consist of regions delimited by markers.
    """

    items = TokenStream(s)
    return parse_region(items)
1.372 -
def parse_region(items, level=0, indent=0):

    """
    Return a new region with the given 'level' and 'indent', populated using
    the data provided by 'items'.
    """

    region = Region([], level, indent)

    # Any header must be read first since it may set the region type, which
    # affects whether the region is transparent.

    parse_region_header(items, region)

    # Transparent regions are parsed as wiki content; others are opaque.

    if not region.is_transparent():
        parse_region_opaque(items, region)
    else:
        parse_region_wiki(items, region)

    return region
1.394 -
def parse_region_header(items, region):

    """
    Read any region header found at the current position in 'items', setting
    the type of the given 'region' from it.
    """

    preceding = items.read_until(["header"], False)

    # An empty string indicates a header exactly at the current position,
    # whereas None indicates that no header was found at all.

    if preceding == "":
        region.type = items.read_match()
1.403 -
def parse_region_wiki(items, region):

    "Populate the wiki 'region' using the data provided by 'items'."

    # Wiki content is collected in blocks, so open one before parsing.

    new_block(region)

    wiki_patterns = ["break", "listitem", "regionstart", "regionend"]
    parse_region_details(items, region, wiki_patterns)
1.410 -
def parse_region_opaque(items, region):

    "Populate the opaque 'region' using the data provided by 'items'."

    # Only end markers are searched for within opaque regions.

    opaque_patterns = ["regionend"]
    parse_region_details(items, region, opaque_patterns)
1.416 -
def parse_region_details(items, region, pattern_names):

    """
    Populate 'region' from 'items', searching for the features identified by
    'pattern_names'. Parsing stops at the end of the input or when a feature
    handler raises StopIteration, after which the region is normalised.
    """

    try:
        while 1:

            # Add any text found before the next marker or the end of input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_text(Text(preceding))

            # Without a matching pattern, the input is exhausted.

            if not items.matching:
                break

            # Consume the feature and find its handler.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Features without handlers are retained as text.

            if not handler:
                region.append_text(Text(feature))
                continue

            handler(items, region)

    except StopIteration:
        pass

    region.normalise()
1.451 -
def end_region(items, region):

    "Stop the parsing of 'region' by raising StopIteration."

    raise StopIteration()
1.457 -
def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    # The block before the break is no longer the final one in its sequence.

    region.nodes[-1].final = False

    # Content after the break belongs to a new block.

    new_block(region)
1.467 -
def parse_listitem_end(items, region):

    "Stop the parsing of a list item by raising StopIteration."

    raise StopIteration()
1.473 -
def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    # Populate a new item, searching only for the end of the item.

    listitem = ListItem([])
    parse_region_details(items, listitem, ["listitemend"])

    # Add the item and open a new block for any content following it.

    region.append(listitem)
    new_block(region)
1.482 -
def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Groups 2 and 3 of the matched start marker hold the leading whitespace
    # and the opening braces, whose lengths give the indent and the level.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))

    # Parse the section, then start a new block for content following it.

    section = parse_region(items, level, indent)
    region.append(section)
    new_block(region)
1.493 -
def parse_section_end(items, region):

    """
    Handle a section end marker within 'region', raising StopIteration if the
    marker ends the region or otherwise retaining the marker as text.
    """

    feature = items.read_match()

    if not region.have_end(feature):
        region.append_text(Text(feature))
        return

    raise StopIteration
1.503 -
1.504 -# Pattern handlers.
1.505 -
# A mapping from pattern names to the functions handling matched features.

handlers = dict([
    (None, end_region),
    ("break", parse_break),
    ("listitemend", parse_listitem_end),
    ("listitem", parse_listitem),
    ("regionstart", parse_section),
    ("regionend", parse_section_end),
    ])
1.514 -
def new_block(region):

    "Add a new, empty block to 'region'."

    region.append(Block([]))
1.521 -
1.522 -
1.523 -
1.524 -# Top-level functions.
1.525 -
# Make the page parser available under the shorter name 'parse'.

parse = parse_page
1.527 -
def serialise(doc, serialiser=MoinSerialiser):

    """
    Return a string produced by serialising 'doc' using an instance of the
    given 'serialiser' class.
    """

    fragments = []
    doc.to_string(serialiser(fragments.append))
    return "".join(fragments)
1.532 -
1.533 -# vim: tabstop=4 expandtab shiftwidth=4