1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/wikiparser.py	Fri Feb 22 22:50:30 2013 +0100
     1.3 @@ -0,0 +1,426 @@
     1.4 +#!/usr/bin/env python
     1.5 +
     1.6 +"""
     1.7 +Confluence Wiki syntax parsing.
     1.8 +
     1.9 +Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
    1.10 +
    1.11 +This software is free software; you can redistribute it and/or
    1.12 +modify it under the terms of the GNU General Public License as
    1.13 +published by the Free Software Foundation; either version 2 of
    1.14 +the License, or (at your option) any later version.
    1.15 +
    1.16 +This software is distributed in the hope that it will be useful,
    1.17 +but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.18 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    1.19 +GNU General Public License for more details.
    1.20 +
    1.21 +You should have received a copy of the GNU General Public
    1.22 +License along with this library; see the file LICENCE.txt
    1.23 +If not, write to the Free Software Foundation, Inc.,
    1.24 +51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
    1.25 +
    1.26 +--------
    1.27 +
    1.28 +The basic procedure is as follows:
    1.29 +
    1.30 + 1. Wiki pages are first split up into regions.
    1.31 + 2. Then, within these regions, the text is split into blocks.
    1.32 +    1. First, lists are identified.
    1.33 +    2. Additionally, other block-like elements are identified.
    1.34 + 3. Each block is then parsed.
    1.35 +"""
    1.36 +
    1.37 +from common import *
    1.38 +import re
    1.39 +import sys
    1.40 +
    1.41 +# Section extraction.
    1.42 +
    1.43 +sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
    1.44 +sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)
    1.45 +
    1.46 +def get_regions(s):
    1.47 +
    1.48 +    """
    1.49 +    Return a list of regions from 's'. Each region is specified using a tuple of
    1.50 +    the form (type, text).
    1.51 +    """
    1.52 +
    1.53 +    last = 0
    1.54 +    regions = []
    1.55 +    for match in sections_regexp.finditer(s):
    1.56 +        start, end = match.span()
    1.57 +        regions.append((None, s[last:start]))
    1.58 +        regions.append(get_section_details(s[start:end]))
    1.59 +        last = end
    1.60 +    regions.append((None, s[last:]))
    1.61 +    return regions
    1.62 +
    1.63 +# Section inspection.
    1.64 +
    1.65 +section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
    1.66 +section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)
    1.67 +
    1.68 +def get_section_details(s):
    1.69 +
    1.70 +    "Return the details of a section 's' in the form (type, text)."
    1.71 +
    1.72 +    match = section_regexp.match(s)
    1.73 +    if match:
    1.74 +        return (match.group("sectiontype"), match.group("options")), match.group("section")
    1.75 +    else:
    1.76 +        return None, s
    1.77 +
    1.78 +# Heading, table and list extraction.
    1.79 +
    1.80 +list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
    1.81 +table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
    1.82 +blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
    1.83 +
    1.84 +blockelement_regexp = re.compile(
    1.85 +    "(" + list_regexp_str + ")"
    1.86 +    "|"
    1.87 +    "(" + table_regexp_str + ")"
    1.88 +    "|"
    1.89 +    "(" + blocktext_regexp_str + ")",
    1.90 +    re.MULTILINE
    1.91 +    )
    1.92 +
    1.93 +def get_block_elements(s):
    1.94 +
    1.95 +    """
    1.96 +    Extract headings, tables and lists from the given string 's'.
    1.97 +    """
    1.98 +
    1.99 +    last = 0
   1.100 +    blocks = []
   1.101 +    for match in blockelement_regexp.finditer(s):
   1.102 +        start, end = match.span()
   1.103 +        matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")
   1.104 +        blocks.append((None, s[last:start]))
   1.105 +        blocks.append((matchtype, match.group("text") or s[start:end]))
   1.106 +        last = end
   1.107 +    blocks.append((None, s[last:]))
   1.108 +    return blocks
   1.109 +
   1.110 +# Block extraction.
   1.111 +
   1.112 +block_regexp_str = r"^(?:\s*\n)+"
   1.113 +block_regexp = re.compile(block_regexp_str, re.MULTILINE)
   1.114 +
   1.115 +def get_basic_blocks(s):
   1.116 +
   1.117 +    """
   1.118 +    Return blocks from the given string 's' by splitting the text on blank lines
   1.119 +    and eliminating those lines.
   1.120 +    """
   1.121 +
   1.122 +    return [b for b in block_regexp.split(s) if b.strip()]
   1.123 +
   1.124 +# Block inspection.
   1.125 +
   1.126 +def get_blocks(s):
   1.127 +
   1.128 +    """
   1.129 +    Return blocks from the given string 's', inspecting the basic blocks and
   1.130 +    generating additional block-level text where appropriate.
   1.131 +    """
   1.132 +
   1.133 +    blocks = []
   1.134 +
   1.135 +    for blocktype, blocktext in get_block_elements(s):
   1.136 +
   1.137 +        # Collect heading, list and table blocks.
   1.138 +
   1.139 +        if blocktype is not None:
   1.140 +            blocks.append((blocktype, blocktext))
   1.141 +
   1.142 +        # Attempt to find new subblocks in other regions.
   1.143 +
   1.144 +        else:
   1.145 +            for block in get_basic_blocks(blocktext):
   1.146 +                blocks.append((None, block))
   1.147 +
   1.148 +    return blocks
   1.149 +
   1.150 +# List item inspection.
   1.151 +
   1.152 +listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
   1.153 +listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
   1.154 +
   1.155 +def get_list_items(text):
   1.156 +
   1.157 +    "Return a list of (marker, text) tuples for the given list 'text'."
   1.158 +
   1.159 +    items = []
   1.160 +
   1.161 +    for match in listitem_regexp.finditer(text):
   1.162 +        items.append((match.group("marker"), match.group("text")))
   1.163 +
   1.164 +    return items
   1.165 +
   1.166 +# Table row inspection.
   1.167 +
   1.168 +monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
   1.169 +link_regexp_str = r"[[](?P<linktext>.*?)]"
   1.170 +image_regexp_str = r"!(?P<imagetext>.*?)!"
   1.171 +cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
   1.172 +
   1.173 +content_regexp_str = (
   1.174 +    "(" + monospace_regexp_str + ")"
   1.175 +    "|"
   1.176 +    "(" + link_regexp_str + ")"
   1.177 +    "|"
   1.178 +    "(" + image_regexp_str + ")"
   1.179 +    )
   1.180 +
   1.181 +table_content_regexp_str = (
   1.182 +    content_regexp_str +
   1.183 +    "|"
   1.184 +    "(" + cellsep_regexp_str + ")"
   1.185 +    )
   1.186 +
   1.187 +content_regexp = re.compile(content_regexp_str)
   1.188 +table_content_regexp = re.compile(table_content_regexp_str)
   1.189 +
   1.190 +def translate_content_match(match):
   1.191 +
   1.192 +    "Translate the content described by the given 'match', returning a string."
   1.193 +
   1.194 +    if match.group("monotext"):
   1.195 +        return "{{{%s}}}" % match.group("monotext")
   1.196 +
   1.197 +    elif match.group("linktext"):
   1.198 +        parts = match.group("linktext").split("|")
   1.199 +
   1.200 +        # NOTE: Proper detection of external links required.
   1.201 +
   1.202 +        if len(parts) == 1:
   1.203 +            label, target, title = None, parts[0], None
   1.204 +        elif len(parts) == 2:
   1.205 +            (label, target), title = parts, None
   1.206 +        else:
   1.207 +            label, target, title = parts
   1.208 +
   1.209 +        target = target.strip()
   1.210 +
   1.211 +        # Look for namespace links and rewrite them.
   1.212 +
   1.213 +        if target.find(":") != -1:
   1.214 +            prefix = ""
   1.215 +            space, rest = target.split(":", 1)
   1.216 +            if space not in URL_SCHEMES:
   1.217 +                target = "%s/%s" % (space, rest)
   1.218 +
   1.219 +        # Detect anchors.
   1.220 +
   1.221 +        elif target.startswith("#"):
   1.222 +            prefix = ""
   1.223 +
   1.224 +        # Detect attachments.
   1.225 +
   1.226 +        elif target.startswith("^"):
   1.227 +            prefix = "attachment:"
   1.228 +
   1.229 +        # Link to other pages within a space.
   1.230 +
   1.231 +        else:
   1.232 +            prefix = "../"
   1.233 +
   1.234 +            # Make the link tidier by making a target if none was given.
   1.235 +
   1.236 +            if not label:
   1.237 +                label = target
   1.238 +
   1.239 +        if not label and not title:
   1.240 +            return "[[%s%s]]" % (prefix, target)
   1.241 +        elif not title:
   1.242 +            return "[[%s%s|%s]]" % (prefix, target, label)
   1.243 +        else:
   1.244 +            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)
   1.245 +
   1.246 +    elif match.group("imagetext"):
   1.247 +        parts = match.group("imagetext").split("|")
   1.248 +
   1.249 +        # NOTE: Proper detection of external links required.
   1.250 +
   1.251 +        if parts[0].startswith("http"):
   1.252 +            prefix = ""
   1.253 +        else:
   1.254 +            prefix = "attachment:"
   1.255 +
   1.256 +        # NOTE: Proper options conversion required.
   1.257 +
   1.258 +        if len(parts) == 1:
   1.259 +            return "{{%s%s}}" % (prefix, parts[0])
   1.260 +        else:
   1.261 +            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])
   1.262 +
   1.263 +    else:
   1.264 +        return match.group()
   1.265 +
   1.266 +def get_table_rows(text):
   1.267 +
   1.268 +    "Return a list of (cellsep, columns) tuples for the given table 'text'."
   1.269 +
   1.270 +    rows = []
   1.271 +
   1.272 +    for line in text.split("\n"):
   1.273 +        cellsep = None
   1.274 +        columns = [""]
   1.275 +        last = 0
   1.276 +        for match in table_content_regexp.finditer(line):
   1.277 +            start, end = match.span()
   1.278 +            columns[-1] += line[last:start]
   1.279 +
   1.280 +            if match.group("celltype"):
   1.281 +                if cellsep is None:
   1.282 +                    cellsep = match.group("celltype")
   1.283 +                columns.append("")
   1.284 +            else:
   1.285 +                columns[-1] += match.group()
   1.286 +
   1.287 +            last = end
   1.288 +
   1.289 +        columns[-1] += line[last:]
   1.290 +
   1.291 +        if cellsep:
   1.292 +            rows.append((cellsep, columns[1:-1]))
   1.293 +
   1.294 +    return rows
   1.295 +
   1.296 +def translate_content(text, sectiontype=None):
   1.297 +
   1.298 +    """
   1.299 +    Return a translation of the given 'text'. If the optional 'sectiontype' is
   1.300 +    specified, the translation may be modified to a form appropriate to the
   1.301 +    section being translated.
   1.302 +    """
   1.303 +
   1.304 +    parts = []
   1.305 +
   1.306 +    last = 0
   1.307 +    for match in content_regexp.finditer(text):
   1.308 +        start, end = match.span()
   1.309 +        parts.append(text[last:start])
   1.310 +
   1.311 +        # Handle unformatted sections.
   1.312 +
   1.313 +        if sectiontype in ("code", "noformat"):
   1.314 +            parts.append(match.group())
   1.315 +        else:
   1.316 +            parts.append(translate_content_match(match))
   1.317 +
   1.318 +        last = end
   1.319 +
   1.320 +    parts.append(text[last:])
   1.321 +    return "".join(parts)
   1.322 +
   1.323 +# Translation helpers.
   1.324 +
   1.325 +markers = {
   1.326 +    "*" : "*",
   1.327 +    "#" : "1.",
   1.328 +    "-" : "*",
   1.329 +    }
   1.330 +
   1.331 +def translate_marker(marker):
   1.332 +
   1.333 +    "Translate the given 'marker' to a suitable Moin representation."
   1.334 +
   1.335 +    return " " * len(marker) + markers[marker[-1]]
   1.336 +
   1.337 +cellseps = {
   1.338 +    "|" : "||",
   1.339 +    "||" : "||",
   1.340 +    }
   1.341 +
   1.342 +cellextra = {
   1.343 +    "|" : "",
   1.344 +    "||" : "'''",
   1.345 +    }
   1.346 +
   1.347 +def translate_cellsep(cellsep):
   1.348 +
   1.349 +    "Translate the given 'cellsep' to a suitable Moin representation."
   1.350 +
   1.351 +    return cellseps[cellsep]
   1.352 +
   1.353 +def translate_cell(cellsep, text):
   1.354 +
   1.355 +    "Using 'cellsep', translate the cell 'text'."
   1.356 +
   1.357 +    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]
   1.358 +
   1.359 +sectiontypes = {
   1.360 +    "code" : "",
   1.361 +    "noformat" : "",
   1.362 +    "quote" : "",
   1.363 +    "info" : "wiki important",
   1.364 +    "note" : "wiki caution",
   1.365 +    "tip" : "wiki tip",
   1.366 +    "warning" : "wiki warning",
   1.367 +    }
   1.368 +
   1.369 +# General parsing.
   1.370 +
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The text is divided into regions using get_regions. Regions outside
    sections are divided into blocks using get_blocks, each block being
    written to 'out' followed by a blank line. Sections are written as
    "{{{" ... "}}}" regions.

    The 'out' object is written to using the Python 2 print statement.
    """

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.
                # NOTE: blocktypes is not defined in this module; it is
                # NOTE: presumably provided by the common module as a mapping
                # NOTE: from block types to format strings.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)

                        # Separators are placed around and between the cells.

                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

                # Separate each block from the next with a blank line.

                print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)

            # Section types with a non-empty Moin name are written with a
            # "#!" line naming the parser, any options following as a comment.

            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options

            # Unknown section types and those with an empty Moin name are
            # written as plain regions. The trailing comma keeps the content
            # on the same line as the opening marker.

            else:
                print >>out, "{{{",

            # The trailing comma suppresses the newline so that the closing
            # marker follows the content directly.

            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out
   1.424 +
   1.425 +if __name__ == "__main__":
   1.426 +    s = sys.stdin.read()
   1.427 +    parse(s, sys.stdout)
   1.428 +
   1.429 +# vim: tabstop=4 expandtab shiftwidth=4