1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/wikiparser.py Fri Feb 22 22:50:30 2013 +0100
1.3 @@ -0,0 +1,426 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Confluence Wiki syntax parsing.
1.8 +
1.9 +Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This software is free software; you can redistribute it and/or
1.12 +modify it under the terms of the GNU General Public License as
1.13 +published by the Free Software Foundation; either version 2 of
1.14 +the License, or (at your option) any later version.
1.15 +
1.16 +This software is distributed in the hope that it will be useful,
1.17 +but WITHOUT ANY WARRANTY; without even the implied warranty of
1.18 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1.19 +GNU General Public License for more details.
1.20 +
1.21 +You should have received a copy of the GNU General Public
1.22 +License along with this library; see the file LICENCE.txt
1.23 +If not, write to the Free Software Foundation, Inc.,
1.24 +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
1.25 +
1.26 +--------
1.27 +
1.28 +The basic procedure is as follows:
1.29 +
1.30 + 1. Wiki pages are first split up into regions.
1.31 + 2. Then, within these regions, the text is split into blocks.
1.32 + 1. First, lists are identified.
1.33 + 2. Additionally, other block-like elements are identified.
1.34 + 3. Each block is then parsed.
1.35 +"""
1.36 +
1.37 +from common import *
1.38 +import re
1.39 +import sys
1.40 +
1.41 +# Section extraction.
1.42 +
sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split the string 's' into a list of regions, each being a (type, text)
    tuple.

    Text lying outside any "{name}...{name}" section is given the type None.
    Each section found by 'sections_regexp' is handed to get_section_details
    and the result of that call is used as the region. A plain region is always
    emitted before each section and after the last one, even if its text is
    empty.
    """

    regions = []
    position = 0

    for found in sections_regexp.finditer(s):

        # Plain text between the previous section (or the start) and this one.

        regions.append((None, s[position:found.start()]))

        # The section itself, opening and closing tags included.

        regions.append(get_section_details(found.group()))
        position = found.end()

    # Whatever remains after the final section.

    regions.append((None, s[position:]))
    return regions
1.62 +
1.63 +# Section inspection.
1.64 +
section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form
    ((sectiontype, options), text), where 'options' is None if the opening tag
    has no ":options" part. If 's' does not have the form of a section, return
    (None, s) instead.
    """

    found = section_regexp.match(s)

    # Not a section: hand the text back untouched with no type.

    if found is None:
        return None, s

    sectiontype, options, body = found.group("sectiontype", "options", "section")
    return (sectiontype, options), body
1.77 +
1.78 +# Heading, table and list extraction.
1.79 +
# The first line of a list may end with a newline or with the end of the text,
# just as the continuation lines do. Otherwise a list whose only (or first)
# line is not newline-terminated would not be recognised at all.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(?:\n|$)(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, blockquotes, tables and lists from the given string 's'.

    Return a list of (type, text) tuples covering 's' in order. Recognised
    elements have the type "list", "table", or the matched prefix of a
    "h<digit>." or "bq." line. For the prefixed lines, the text is the part
    after the prefix, or the whole matched line if that part is empty; for
    lists and tables it is the complete matched text. The text between
    elements is given the type None and is always emitted, even when empty.
    """

    blocks = []
    last = 0

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one of the three alternatives can have matched; the named
        # groups of the other two are None.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))
        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks
1.109 +
1.110 +# Block extraction.
1.111 +
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split the string 's' wherever one or more blank lines occur and return the
    pieces as a list, leaving out any piece consisting only of whitespace.
    """

    blocks = []

    for piece in block_regexp.split(s):
        if piece.strip():
            blocks.append(piece)

    return blocks
1.123 +
1.124 +# Block inspection.
1.125 +
def get_blocks(s):

    """
    Return a list of (type, text) blocks for the string 's'.

    Elements identified by get_block_elements keep their type and text. The
    untyped text between them is divided further using get_basic_blocks, with
    each resulting piece becoming a block of type None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Untyped text: break it into paragraph-like pieces.

        if blocktype is None:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

        # Headings, lists and tables are passed through unchanged.

        else:
            blocks.append((blocktype, blocktext))

    return blocks
1.149 +
1.150 +# List item inspection.
1.151 +
listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples, one for each item found in the
    given list 'text'. The marker includes any spaces preceding the list
    characters.
    """

    return [
        found.group("marker", "text")
        for found in listitem_regexp.finditer(text)
        ]
1.165 +
1.166 +# Table row inspection.
1.167 +
# Patterns for the individual inline constructs.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content: any one of the constructs, each wrapped in its own group.

content_regexp_str = "|".join([
    "(%s)" % monospace_regexp_str,
    "(%s)" % link_regexp_str,
    "(%s)" % image_regexp_str,
    ])

# Table content: inline content or a cell separator.

table_content_regexp_str = "%s|(%s)" % (content_regexp_str, cellsep_regexp_str)

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)
1.189 +
def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    The 'match' must provide the named groups "monotext", "linktext" and
    "imagetext"; the first of these holding non-empty text decides the result:

     * monospaced text is wrapped in "{{{" and "}}}";
     * a link of the form "target", "label|target" or "label|target|title" is
       rewritten as "[[target|label]]", with "|title=..." appended if a title
       was given, and with the target adjusted according to its kind;
     * an image of the form "name" or "name|options" is rewritten as
       "{{name}}" or "{{name|options}}", with "attachment:" placed before any
       name not starting with "http".

    If none of the groups holds any text, the matched text is returned
    unchanged.
    """

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    elif match.group("linktext"):

        # Split into at most three parts so that any further "|" characters
        # stay within the title instead of making the unpacking below fail.

        parts = match.group("linktext").split("|", 2)

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.
        # NOTE(review): URL_SCHEMES is not defined in this block; presumably it
        # NOTE(review): is provided by the 'common' module - confirm.

        if ":" in target:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments, dropping the "^" marker since the "attachment:"
        # prefix takes its place in the translated link.

        elif target.startswith("^"):
            prefix = "attachment:"
            target = target[1:]

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Make the link tidier by making a label if none was given: the bare
        # target is then shown instead of the prefixed one.

        if not label:
            label = target

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)
        elif not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    else:
        return match.group()
1.265 +
def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text',
    with one tuple for each line containing at least one cell separator.

    The 'cellsep' is the first separator found on the line. The 'columns' are
    the pieces of text found between separators, leaving out whatever precedes
    the first separator and follows the last one. Separator characters inside
    monospaced text, links and images do not divide columns.
    """

    rows = []

    for row_text in text.split("\n"):
        separator = None
        cells = [""]
        position = 0

        for found in table_content_regexp.finditer(row_text):

            # Text before this match belongs to the current cell.

            cells[-1] += row_text[position:found.start()]
            position = found.end()

            found_sep = found.group("celltype")

            # A separator starts a new cell; the first one seen is remembered
            # as the separator for the row.

            if found_sep:
                if separator is None:
                    separator = found_sep
                cells.append("")

            # Inline content is kept whole inside the current cell.

            else:
                cells[-1] += found.group()

        cells[-1] += row_text[position:]

        if separator:
            rows.append((separator, cells[1:-1]))

    return rows
1.295 +
def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text' in which each piece of inline
    content found by 'content_regexp' has been replaced by the result of
    translate_content_match. If the optional 'sectiontype' is "code" or
    "noformat", such content is left exactly as it was written.
    """

    verbatim = sectiontype in ("code", "noformat")

    def replace(found):

        "Return the replacement for the inline content in 'found'."

        if verbatim:
            return found.group()
        return translate_content_match(found)

    return content_regexp.sub(replace, text)
1.322 +
1.323 +# Translation helpers.
1.324 +
# Replacements for the final character of a list marker.

markers = {"*": "*", "#": "1.", "-": "*"}

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation: one space
    of indentation for each character of 'marker', followed by the replacement
    for its final character.
    """

    indent = " " * len(marker)
    symbol = markers[marker[-1]]
    return indent + symbol
1.336 +
# Replacement for each kind of cell separator.

cellseps = {"|": "||", "||": "||"}

# Text placed on both sides of a cell's content for each kind of separator.

cellextra = {"|": "", "||": "'''"}

def translate_cellsep(cellsep):

    """
    Translate the given 'cellsep' to a suitable Moin representation by looking
    it up in 'cellseps'.
    """

    moinsep = cellseps[cellsep]
    return moinsep
1.352 +
def translate_cell(cellsep, text):

    """
    Translate the cell 'text', placing the 'cellextra' text selected by
    'cellsep' before and after the translated content.
    """

    wrapper = cellextra[cellsep]
    return "".join([wrapper, translate_content(text), wrapper])
1.358 +
# Section names mapped to the text written after "{{{#!" when a section is
# opened. An empty string has the same effect as an absent entry: the section
# is opened with a plain "{{{".

sectiontypes = {
    "code"      : "",
    "noformat"  : "",
    "quote"     : "",
    "info"      : "wiki important",
    "note"      : "wiki caution",
    "tip"       : "wiki tip",
    "warning"   : "wiki warning",
    }

# General parsing.

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The text is divided into regions using get_regions. Plain regions are
    divided into blocks using get_blocks, and each block is written according
    to its type, followed by a blank line. Section regions are written between
    "{{{" and "}}}", again followed by a blank line.
    """

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.
                # NOTE(review): 'blocktypes' is not defined in this part of the
                # NOTE(review): module; presumably 'common' provides a mapping
                # NOTE(review): from block type to format string - confirm.

                if blocktype in blocktypes:
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

                print >>out

        # Handle sections.

        else:
            sectiontype, options = regiontype

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                # The trailing comma keeps the content on the opening line.
                print >>out, "{{{",

            # The trailing comma keeps the closing "}}}" from being pushed onto
            # a line of its own by the print statement.
            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out
1.424 +
if __name__ == "__main__":

    # Act as a filter: translate standard input to standard output.

    parse(sys.stdin.read(), sys.stdout)
1.428 +
1.429 +# vim: tabstop=4 expandtab shiftwidth=4