1.1 --- a/moinformat.py	Fri Apr 28 18:56:50 2017 +0200
     1.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.3 @@ -1,530 +0,0 @@
     1.4 -#!/usr/bin/env python
     1.5 -
     1.6 -"""
     1.7 -Moin wiki format parser.
     1.8 -
     1.9 -Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>
    1.10 -
    1.11 -This program is free software; you can redistribute it and/or modify it under
    1.12 -the terms of the GNU General Public License as published by the Free Software
    1.13 -Foundation; either version 3 of the License, or (at your option) any later
    1.14 -version.
    1.15 -
    1.16 -This program is distributed in the hope that it will be useful, but WITHOUT
    1.17 -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    1.18 -FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
    1.19 -details.
    1.20 -
    1.21 -You should have received a copy of the GNU General Public License along with
    1.22 -this program.  If not, see <http://www.gnu.org/licenses/>.
    1.23 -"""
    1.24 -
    1.25 -from cgi import escape
    1.26 -import re
    1.27 -
    1.28 -# Regular expressions.
    1.29 -
    1.30 -syntax = {
    1.31 -    # Page regions:
    1.32 -    "regionstart"   : (r"((^\s*)([{]{3,}))",            re.MULTILINE | re.DOTALL),  # {{{...
    1.33 -    "regionend"     : (r"^\s*([}]{3,})",                re.MULTILINE | re.DOTALL),  # }}}...
    1.34 -    "header"        : (r"#!(.*?)\n",                    0),                         # #! char-excl-nl
    1.35 -
    1.36 -    # Region contents:
    1.37 -    "break"         : (r"^(\s*?)\n",                    re.MULTILINE),              # blank line
    1.38 -    "listitem"      : (r"^((\s+)([*]|\d+[.]))",         re.MULTILINE),              # indent (list-item or number-item)
    1.39 -
    1.40 -    # List contents:
    1.41 -    "listitemend"   : (r"^",                            re.MULTILINE),              # next line
    1.42 -    }
    1.43 -
    1.44 -# Define patterns for the regular expressions.
    1.45 -
    1.46 -patterns = {}
    1.47 -for name, (value, flags) in syntax.items():
    1.48 -    patterns[name] = re.compile(value, re.UNICODE | flags)
    1.49 -
    1.50 -
    1.51 -
    1.52 -# Document nodes.
    1.53 -
    1.54 -class Container:
    1.55 -
    1.56 -    "A container of document nodes."
    1.57 -
    1.58 -    def __init__(self, nodes):
    1.59 -        self.nodes = nodes
    1.60 -
    1.61 -    def append(self, node):
    1.62 -        self.nodes.append(node)
    1.63 -
    1.64 -    append_text = append
    1.65 -
    1.66 -    def empty(self):
    1.67 -        return not self.nodes
    1.68 -
    1.69 -    def normalise(self):
    1.70 -
    1.71 -        "Combine adjacent text nodes."
    1.72 -
    1.73 -        nodes = self.nodes
    1.74 -        self.nodes = []
    1.75 -        text = None
    1.76 -
    1.77 -        for node in nodes:
    1.78 -
    1.79 -            # Open a text node or merge text into an open node.
    1.80 -
    1.81 -            if isinstance(node, Text):
    1.82 -                if not text:
    1.83 -                    text = node
    1.84 -                else:
    1.85 -                    text.merge(node)
    1.86 -
    1.87 -            # Close any open text node and append the current node.
    1.88 -
    1.89 -            else:
    1.90 -                if text:
    1.91 -                    self.append(text)
    1.92 -                    text = None
    1.93 -                self.append(node)
    1.94 -
    1.95 -        # Add any open text node.
    1.96 -
    1.97 -        if text:
    1.98 -            self.append(text)
    1.99 -
   1.100 -    def __str__(self):
   1.101 -        return self.prettyprint()
   1.102 -
   1.103 -    def prettyprint(self, indent=""):
   1.104 -        pass
   1.105 -
   1.106 -class Region(Container):
   1.107 -
   1.108 -    "A region of the page."
   1.109 -
   1.110 -    transparent_region_types = ["wiki"]
   1.111 -
   1.112 -    def __init__(self, nodes, level=0, indent=0, type=None):
   1.113 -        Container.__init__(self, nodes)
   1.114 -        self.level = level
   1.115 -        self.indent = indent
   1.116 -        self.type = type
   1.117 -
   1.118 -    def append(self, node):
   1.119 -        last = self.nodes and self.nodes[-1]
   1.120 -        if last and last.empty():
   1.121 -            self.nodes[-1] = node
   1.122 -        else:
   1.123 -            self.nodes.append(node)
   1.124 -
   1.125 -    def append_text(self, s):
   1.126 -        if self.is_transparent():
   1.127 -            self.nodes[-1].append(s)
   1.128 -        else:
   1.129 -            self.append(s)
   1.130 -
   1.131 -    def have_end(self, s):
   1.132 -        return self.level and s.startswith("}") and self.level == len(s)
   1.133 -
   1.134 -    def is_transparent(self):
   1.135 -        return not self.level or self.type in self.transparent_region_types
   1.136 -
   1.137 -    def __repr__(self):
   1.138 -        return "Region(%r, %r, %r, %r)" % (self.nodes, self.level, self.indent, self.type)
   1.139 -
   1.140 -    def prettyprint(self, indent=""):
   1.141 -        l = ["%sRegion: level=%d indent=%d type=%s" % (indent, self.level, self.indent, self.type)]
   1.142 -        for node in self.nodes:
   1.143 -            l.append(node.prettyprint(indent + "  "))
   1.144 -        return "\n".join(l)
   1.145 -
   1.146 -    def to_string(self, out):
   1.147 -        out.start_region(self.level, self.indent, self.type)
   1.148 -        for node in self.nodes:
   1.149 -            node.to_string(out)
   1.150 -        out.end_region(self.level, self.indent, self.type)
   1.151 -
   1.152 -class Block(Container):
   1.153 -
   1.154 -    "A block in the page."
   1.155 -
   1.156 -    def __init__(self, nodes, final=True):
   1.157 -        Container.__init__(self, nodes)
   1.158 -        self.final = final
   1.159 -
   1.160 -    def __repr__(self):
   1.161 -        return "Block(%r)" % self.nodes
   1.162 -
   1.163 -    def prettyprint(self, indent=""):
   1.164 -        l = ["%sBlock: final=%s" % (indent, self.final)]
   1.165 -        for node in self.nodes:
   1.166 -            l.append(node.prettyprint(indent + "  "))
   1.167 -        return "\n".join(l)
   1.168 -
   1.169 -    def to_string(self, out):
   1.170 -        out.start_block(self.final)
   1.171 -        for node in self.nodes:
   1.172 -            node.to_string(out)
   1.173 -        out.end_block(self.final)
   1.174 -
   1.175 -class ListItem(Container):
   1.176 -
   1.177 -    "A list item."
   1.178 -
   1.179 -    def __repr__(self):
   1.180 -        return "ListItem(%r)" % self.nodes
   1.181 -
   1.182 -    def prettyprint(self, indent=""):
   1.183 -        l = ["%sListItem:" % indent]
   1.184 -        for node in self.nodes:
   1.185 -            l.append(node.prettyprint(indent + "  "))
   1.186 -        return "\n".join(l)
   1.187 -
   1.188 -    def to_string(self, out):
   1.189 -        out.start_listitem()
   1.190 -        for node in self.nodes:
   1.191 -            node.to_string(out)
   1.192 -        out.end_listitem()
   1.193 -
   1.194 -
   1.195 -class Text:
   1.196 -
   1.197 -    "A text node."
   1.198 -
   1.199 -    def __init__(self, s):
   1.200 -        self.s = s
   1.201 -
   1.202 -    def empty(self):
   1.203 -        return not self.s
   1.204 -
   1.205 -    def merge(self, text):
   1.206 -        self.s += text.s
   1.207 -
   1.208 -    def __repr__(self):
   1.209 -        return "Text(%r)" % self.s
   1.210 -
   1.211 -    def prettyprint(self, indent=""):
   1.212 -        return "%sText: %r" % (indent, self.s)
   1.213 -
   1.214 -    def to_string(self, out):
   1.215 -        out.text(self.s)
   1.216 -
   1.217 -
   1.218 -
   1.219 -# Serialisation.
   1.220 -
   1.221 -class Serialiser:
   1.222 -
   1.223 -    "General serialisation support."
   1.224 -
   1.225 -    def __init__(self, out):
   1.226 -        self.out = out
   1.227 -
   1.228 -class MoinSerialiser(Serialiser):
   1.229 -
   1.230 -    "Serialisation of the page."
   1.231 -
   1.232 -    def start_region(self, level, indent, type):
   1.233 -        out = self.out
   1.234 -        if level:
   1.235 -            out(" " * indent + "{" * level)
   1.236 -        if type and level:
   1.237 -            out("#!%s\n" % type)
   1.238 -
   1.239 -    def end_region(self, level, indent, type):
   1.240 -        out = self.out
   1.241 -        if level:
   1.242 -            out("}" * level)
   1.243 -
   1.244 -    def start_block(self, final):
   1.245 -        pass
   1.246 -
   1.247 -    def end_block(self, final):
   1.248 -        if not final:
   1.249 -            self.out("\n")
   1.250 -
   1.251 -    def start_listitem(self):
   1.252 -        self.out(" *")
   1.253 -
   1.254 -    def end_listitem(self):
   1.255 -        pass
   1.256 -
   1.257 -    def text(self, s):
   1.258 -        self.out(s)
   1.259 -
   1.260 -class HTMLSerialiser(Serialiser):
   1.261 -
   1.262 -    "Serialisation of the page."
   1.263 -
   1.264 -    def start_region(self, level, indent, type):
   1.265 -        l = []
   1.266 -        out = l.append
   1.267 -        if level:
   1.268 -            out("level-%d" % level)
   1.269 -
   1.270 -        if indent:
   1.271 -            out("indent-%d" % indent)
   1.272 -
   1.273 -        # NOTE: Encode type details for CSS.
   1.274 -
   1.275 -        if type:
   1.276 -            out("type-%s" % escape(type, True))
   1.277 -
   1.278 -        self.out("<span class='%s'>" % " ".join(l))
   1.279 -
   1.280 -    def end_region(self, level, indent, type):
   1.281 -        self.out("</span>")
   1.282 -
   1.283 -    def start_block(self, final):
   1.284 -        self.out("<p>")
   1.285 -
   1.286 -    def end_block(self, final):
   1.287 -        self.out("</p>")
   1.288 -
   1.289 -    def start_listitem(self):
   1.290 -        self.out("<li>")
   1.291 -
   1.292 -    def end_listitem(self):
   1.293 -        self.out("</li>")
   1.294 -
   1.295 -    def text(self, s):
   1.296 -        self.out(escape(s))
   1.297 -
   1.298 -
   1.299 -
   1.300 -# Tokenising functions.
   1.301 -
   1.302 -class TokenStream:
   1.303 -
   1.304 -    "A stream of tokens taken from a string."
   1.305 -
   1.306 -    def __init__(self, s):
   1.307 -        self.s = s
   1.308 -        self.pos = 0
   1.309 -        self.match = None
   1.310 -        self.matching = None
   1.311 -
   1.312 -    def read_until(self, pattern_names, remaining=True):
   1.313 -
   1.314 -        """
   1.315 -        Find the first match for the given 'pattern_names'. Return the text
   1.316 -        preceding any match, the remaining text if no match was found, or None
   1.317 -        if no match was found and 'remaining' is given as a false value.
   1.318 -        """
   1.319 -
   1.320 -        first = None
   1.321 -        self.matching = None
   1.322 -
   1.323 -        # Find the first matching pattern.
   1.324 -
   1.325 -        for pattern_name in pattern_names:
   1.326 -            match = patterns[pattern_name].search(self.s, self.pos)
   1.327 -            if match:
   1.328 -                start, end = match.span()
   1.329 -                if self.matching is None or start < first:
   1.330 -                    first = start
   1.331 -                    self.matching = pattern_name
   1.332 -                    self.match = match
   1.333 -
   1.334 -        if self.matching is None:
   1.335 -            if remaining:
   1.336 -                return self.s[self.pos:]
   1.337 -            else:
   1.338 -                return None
   1.339 -        else:
   1.340 -            return self.s[self.pos:first]
   1.341 -
   1.342 -    def read_match(self, group=1):
   1.343 -
   1.344 -        """
   1.345 -        Return the matched text, updating the position in the stream. If 'group'
   1.346 -        is specified, the indicated group in a match will be returned.
   1.347 -        Typically, group 1 should contain all pertinent data, but groups defined
   1.348 -        within group 1 can provide sections of the data.
   1.349 -        """
   1.350 -
   1.351 -        if self.match:
   1.352 -            _start, self.pos = self.match.span()
   1.353 -            try:
   1.354 -                return self.match.group(group)
   1.355 -            except IndexError:
   1.356 -                return ""
   1.357 -        else:
   1.358 -            self.pos = len(self.s)
   1.359 -            return None
   1.360 -
   1.361 -
   1.362 -
   1.363 -# Parser functions.
   1.364 -
   1.365 -def parse_page(s):
   1.366 -
   1.367 -    """
   1.368 -    Parse page text 's'. Pages consist of regions delimited by markers.
   1.369 -    """
   1.370 -
   1.371 -    return parse_region(TokenStream(s))
   1.372 -
   1.373 -def parse_region(items, level=0, indent=0):
   1.374 -
   1.375 -    """
   1.376 -    Parse the data provided by 'items' to populate a region with the given
   1.377 -    'level' at the given 'indent'.
   1.378 -    """
   1.379 -
   1.380 -    region = Region([], level, indent)
   1.381 -
   1.382 -    # Parse section headers.
   1.383 -
   1.384 -    parse_region_header(items, region)
   1.385 -
   1.386 -    # Parse section body.
   1.387 -
   1.388 -    if region.is_transparent():
   1.389 -        parse_region_wiki(items, region)
   1.390 -    else:
   1.391 -        parse_region_opaque(items, region)
   1.392 -
   1.393 -    return region
   1.394 -
   1.395 -def parse_region_header(items, region):
   1.396 -
   1.397 -    """
   1.398 -    Parse the region header from the 'items', setting it for the given 'region'.
   1.399 -    """
   1.400 -
   1.401 -    if items.read_until(["header"], False) == "": # None means no header
   1.402 -        region.type = items.read_match()
   1.403 -
   1.404 -def parse_region_wiki(items, region):
   1.405 -
   1.406 -    "Parse the data provided by 'items' to populate a wiki 'region'."
   1.407 -
   1.408 -    new_block(region)
   1.409 -    parse_region_details(items, region, ["break", "listitem", "regionstart", "regionend"])
   1.410 -
   1.411 -def parse_region_opaque(items, region):
   1.412 -
   1.413 -    "Parse the data provided by 'items' to populate an opaque 'region'."
   1.414 -
   1.415 -    parse_region_details(items, region, ["regionend"])
   1.416 -
   1.417 -def parse_region_details(items, region, pattern_names):
   1.418 -
   1.419 -    "Parse 'items' within 'region' searching using 'pattern_names'."
   1.420 -
   1.421 -    try:
   1.422 -        while True:
   1.423 -
   1.424 -            # Obtain text before any marker or the end of the input.
   1.425 -
   1.426 -            preceding = items.read_until(pattern_names)
   1.427 -            if preceding:
   1.428 -                region.append_text(Text(preceding))
   1.429 -
   1.430 -            # End of input.
   1.431 -
   1.432 -            if not items.matching:
   1.433 -                break
   1.434 -
   1.435 -            # Obtain any feature.
   1.436 -
   1.437 -            feature = items.read_match()
   1.438 -            handler = handlers.get(items.matching)
   1.439 -
   1.440 -            # Handle each feature or add text to the region.
   1.441 -
   1.442 -            if handler:
   1.443 -                handler(items, region)
   1.444 -            else:
   1.445 -                region.append_text(Text(feature))
   1.446 -
   1.447 -    except StopIteration:
   1.448 -        pass
   1.449 -
   1.450 -    region.normalise()
   1.451 -
   1.452 -def end_region(items, region):
   1.453 -
   1.454 -    "End the parsing of 'region'."
   1.455 -
   1.456 -    raise StopIteration
   1.457 -
   1.458 -def parse_break(items, region):
   1.459 -
   1.460 -    "Handle a paragraph break within 'region'."
   1.461 -
   1.462 -    # Mark any previous block as not being the final one in a sequence.
   1.463 -
   1.464 -    block = region.nodes[-1]
   1.465 -    block.final = False
   1.466 -    new_block(region)
   1.467 -
   1.468 -def parse_listitem_end(items, region):
   1.469 -
   1.470 -    "Handle the end of a list."
   1.471 -
   1.472 -    raise StopIteration
   1.473 -
   1.474 -def parse_listitem(items, region):
   1.475 -
   1.476 -    "Handle a list item marker within 'region'."
   1.477 -
   1.478 -    item = ListItem([])
   1.479 -    parse_region_details(items, item, ["listitemend"])
   1.480 -    region.append(item)
   1.481 -    new_block(region)
   1.482 -
   1.483 -def parse_section(items, region):
   1.484 -
   1.485 -    "Handle the start of a new section within 'region'."
   1.486 -
   1.487 -    # Parse the section and start a new block after the section.
   1.488 -
   1.489 -    indent = len(items.read_match(2))
   1.490 -    level = len(items.read_match(3))
   1.491 -    region.append(parse_region(items, level, indent))
   1.492 -    new_block(region)
   1.493 -
   1.494 -def parse_section_end(items, region):
   1.495 -
   1.496 -    "Handle the end of a new section within 'region'."
   1.497 -
   1.498 -    feature = items.read_match()
   1.499 -    if region.have_end(feature):
   1.500 -        raise StopIteration
   1.501 -    else:
   1.502 -        region.append_text(Text(feature))
   1.503 -
   1.504 -# Pattern handlers.
   1.505 -
   1.506 -handlers = {
   1.507 -    None : end_region,
   1.508 -    "break" : parse_break,
   1.509 -    "listitemend" : parse_listitem_end,
   1.510 -    "listitem" : parse_listitem,
   1.511 -    "regionstart" : parse_section,
   1.512 -    "regionend" : parse_section_end,
   1.513 -    }
   1.514 -
   1.515 -def new_block(region):
   1.516 -
   1.517 -    "Start a new block in 'region'."
   1.518 -
   1.519 -    block = Block([])
   1.520 -    region.append(block)
   1.521 -
   1.522 -
   1.523 -
   1.524 -# Top-level functions.
   1.525 -
   1.526 -parse = parse_page
   1.527 -
   1.528 -def serialise(doc, serialiser=MoinSerialiser):
   1.529 -    l = []
   1.530 -    doc.to_string(serialiser(l.append))
   1.531 -    return "".join(l)
   1.532 -
   1.533 -# vim: tabstop=4 expandtab shiftwidth=4