1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/common.py Fri Feb 22 22:50:30 2013 +0100
1.3 @@ -0,0 +1,38 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Common parsing data.
1.8 +
1.9 +Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This software is free software; you can redistribute it and/or
1.12 +modify it under the terms of the GNU General Public License as
1.13 +published by the Free Software Foundation; either version 2 of
1.14 +the License, or (at your option) any later version.
1.15 +
1.16 +This software is distributed in the hope that it will be useful,
1.17 +but WITHOUT ANY WARRANTY; without even the implied warranty of
1.18 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1.19 +GNU General Public License for more details.
1.20 +
1.21 +You should have received a copy of the GNU General Public
1.22 +License along with this library; see the file LICENCE.txt
1.23 +If not, write to the Free Software Foundation, Inc.,
1.24 +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
1.25 +"""
1.26 +
1.27 +URL_SCHEMES = ("http", "https", "ftp", "mailto")
1.28 +
1.29 +# Translation helpers.
1.30 +
1.31 +blocktypes = {
1.32 + "h1" : "= %s =",
1.33 + "h2" : "== %s ==",
1.34 + "h3" : "=== %s ===",
1.35 + "h4" : "==== %s ====",
1.36 + "h5" : "===== %s =====",
1.37 + "h6" : "====== %s ======",
1.38 + "bq" : "{{{%s}}}",
1.39 + }
1.40 +
1.41 +# vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/convert.py Sun Feb 17 20:36:11 2013 +0100
2.2 +++ b/convert.py Fri Feb 22 22:50:30 2013 +0100
2.3 @@ -27,7 +27,7 @@
2.4 from cStringIO import StringIO
2.5 import codecs
2.6 import xmlread
2.7 -import parser
2.8 +import wikiparser, xmlparser
2.9 import sys
2.10
2.11 MAX_TITLE_LENGTH = 120
2.12 @@ -226,7 +226,7 @@
2.13 'body'.
2.14 """
2.15
2.16 - fn = fn or parser.parse
2.17 + fn = fn or wikiparser.parse
2.18
2.19 out = codecs.open(filename, "w", encoding="utf-8")
2.20 try:
2.21 @@ -235,7 +235,7 @@
2.22 out.close()
2.23
2.24 def xmltranslate(filename, body):
2.25 - translate(filename, body, parser.xmlparse)
2.26 + translate(filename, body, xmlparser.parse)
2.27
2.28 def sort_comments(pages_dir, pageid):
2.29
3.1 --- a/parser.py Sun Feb 17 20:36:11 2013 +0100
3.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
3.3 @@ -1,684 +0,0 @@
3.4 -#!/usr/bin/env python
3.5 -
3.6 -"""
3.7 -Confluence Wiki syntax parsing.
3.8 -
3.9 -Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
3.10 -
3.11 -This software is free software; you can redistribute it and/or
3.12 -modify it under the terms of the GNU General Public License as
3.13 -published by the Free Software Foundation; either version 2 of
3.14 -the License, or (at your option) any later version.
3.15 -
3.16 -This software is distributed in the hope that it will be useful,
3.17 -but WITHOUT ANY WARRANTY; without even the implied warranty of
3.18 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3.19 -GNU General Public License for more details.
3.20 -
3.21 -You should have received a copy of the GNU General Public
3.22 -License along with this library; see the file LICENCE.txt
3.23 -If not, write to the Free Software Foundation, Inc.,
3.24 -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
3.25 -
3.26 ---------
3.27 -
3.28 -The basic procedure is as follows:
3.29 -
3.30 - 1. Wiki pages are first split up into regions.
3.31 - 2. Then, within these regions, the text is split into blocks.
3.32 - 1. First, lists are identified.
3.33 - 2. Additionally, other block-like elements are identified.
3.34 - 3. Each block is then parsed.
3.35 -"""
3.36 -
3.37 -try:
3.38 - from cStringIO import StringIO
3.39 -except ImportError:
3.40 - from StringIO import StringIO
3.41 -
3.42 -from xmlread import Parser
3.43 -import re
3.44 -import sys
3.45 -import operator
3.46 -import htmlentitydefs
3.47 -
3.48 -URL_SCHEMES = ("http", "https", "ftp", "mailto")
3.49 -
3.50 -# Section extraction.
3.51 -
3.52 -sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
3.53 -sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)
3.54 -
3.55 -def get_regions(s):
3.56 -
3.57 - """
3.58 - Return a list of regions from 's'. Each region is specified using a tuple of
3.59 - the form (type, text).
3.60 - """
3.61 -
3.62 - last = 0
3.63 - regions = []
3.64 - for match in sections_regexp.finditer(s):
3.65 - start, end = match.span()
3.66 - regions.append((None, s[last:start]))
3.67 - regions.append(get_section_details(s[start:end]))
3.68 - last = end
3.69 - regions.append((None, s[last:]))
3.70 - return regions
3.71 -
3.72 -# Section inspection.
3.73 -
3.74 -section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
3.75 -section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)
3.76 -
3.77 -def get_section_details(s):
3.78 -
3.79 - "Return the details of a section 's' in the form (type, text)."
3.80 -
3.81 - match = section_regexp.match(s)
3.82 - if match:
3.83 - return (match.group("sectiontype"), match.group("options")), match.group("section")
3.84 - else:
3.85 - return None, s
3.86 -
3.87 -# Heading, table and list extraction.
3.88 -
3.89 -list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
3.90 -table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
3.91 -blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
3.92 -
3.93 -blockelement_regexp = re.compile(
3.94 - "(" + list_regexp_str + ")"
3.95 - "|"
3.96 - "(" + table_regexp_str + ")"
3.97 - "|"
3.98 - "(" + blocktext_regexp_str + ")",
3.99 - re.MULTILINE
3.100 - )
3.101 -
3.102 -def get_block_elements(s):
3.103 -
3.104 - """
3.105 - Extract headings, tables and lists from the given string 's'.
3.106 - """
3.107 -
3.108 - last = 0
3.109 - blocks = []
3.110 - for match in blockelement_regexp.finditer(s):
3.111 - start, end = match.span()
3.112 - matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")
3.113 - blocks.append((None, s[last:start]))
3.114 - blocks.append((matchtype, match.group("text") or s[start:end]))
3.115 - last = end
3.116 - blocks.append((None, s[last:]))
3.117 - return blocks
3.118 -
3.119 -# Block extraction.
3.120 -
3.121 -block_regexp_str = r"^(?:\s*\n)+"
3.122 -block_regexp = re.compile(block_regexp_str, re.MULTILINE)
3.123 -
3.124 -def get_basic_blocks(s):
3.125 -
3.126 - """
3.127 - Return blocks from the given string 's' by splitting the text on blank lines
3.128 - and eliminating those lines.
3.129 - """
3.130 -
3.131 - return [b for b in block_regexp.split(s) if b.strip()]
3.132 -
3.133 -# Block inspection.
3.134 -
3.135 -def get_blocks(s):
3.136 -
3.137 - """
3.138 - Return blocks from the given string 's', inspecting the basic blocks and
3.139 - generating additional block-level text where appropriate.
3.140 - """
3.141 -
3.142 - blocks = []
3.143 -
3.144 - for blocktype, blocktext in get_block_elements(s):
3.145 -
3.146 - # Collect heading, list and table blocks.
3.147 -
3.148 - if blocktype is not None:
3.149 - blocks.append((blocktype, blocktext))
3.150 -
3.151 - # Attempt to find new subblocks in other regions.
3.152 -
3.153 - else:
3.154 - for block in get_basic_blocks(blocktext):
3.155 - blocks.append((None, block))
3.156 -
3.157 - return blocks
3.158 -
3.159 -# List item inspection.
3.160 -
3.161 -listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
3.162 -listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
3.163 -
3.164 -def get_list_items(text):
3.165 -
3.166 - "Return a list of (marker, text) tuples for the given list 'text'."
3.167 -
3.168 - items = []
3.169 -
3.170 - for match in listitem_regexp.finditer(text):
3.171 - items.append((match.group("marker"), match.group("text")))
3.172 -
3.173 - return items
3.174 -
3.175 -# Table row inspection.
3.176 -
3.177 -monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
3.178 -link_regexp_str = r"[[](?P<linktext>.*?)]"
3.179 -image_regexp_str = r"!(?P<imagetext>.*?)!"
3.180 -cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
3.181 -
3.182 -content_regexp_str = (
3.183 - "(" + monospace_regexp_str + ")"
3.184 - "|"
3.185 - "(" + link_regexp_str + ")"
3.186 - "|"
3.187 - "(" + image_regexp_str + ")"
3.188 - )
3.189 -
3.190 -table_content_regexp_str = (
3.191 - content_regexp_str +
3.192 - "|"
3.193 - "(" + cellsep_regexp_str + ")"
3.194 - )
3.195 -
3.196 -content_regexp = re.compile(content_regexp_str)
3.197 -table_content_regexp = re.compile(table_content_regexp_str)
3.198 -
3.199 -def translate_content_match(match):
3.200 -
3.201 - "Translate the content described by the given 'match', returning a string."
3.202 -
3.203 - if match.group("monotext"):
3.204 - return "{{{%s}}}" % match.group("monotext")
3.205 -
3.206 - elif match.group("linktext"):
3.207 - parts = match.group("linktext").split("|")
3.208 -
3.209 - # NOTE: Proper detection of external links required.
3.210 -
3.211 - if len(parts) == 1:
3.212 - label, target, title = None, parts[0], None
3.213 - elif len(parts) == 2:
3.214 - (label, target), title = parts, None
3.215 - else:
3.216 - label, target, title = parts
3.217 -
3.218 - target = target.strip()
3.219 -
3.220 - # Look for namespace links and rewrite them.
3.221 -
3.222 - if target.find(":") != -1:
3.223 - prefix = ""
3.224 - space, rest = target.split(":", 1)
3.225 - if space not in URL_SCHEMES:
3.226 - target = "%s/%s" % (space, rest)
3.227 -
3.228 - # Detect anchors.
3.229 -
3.230 - elif target.startswith("#"):
3.231 - prefix = ""
3.232 -
3.233 - # Detect attachments.
3.234 -
3.235 - elif target.startswith("^"):
3.236 - prefix = "attachment:"
3.237 -
3.238 - # Link to other pages within a space.
3.239 -
3.240 - else:
3.241 - prefix = "../"
3.242 -
3.243 - # Make the link tidier by making a target if none was given.
3.244 -
3.245 - if not label:
3.246 - label = target
3.247 -
3.248 - if not label and not title:
3.249 - return "[[%s%s]]" % (prefix, target)
3.250 - elif not title:
3.251 - return "[[%s%s|%s]]" % (prefix, target, label)
3.252 - else:
3.253 - return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)
3.254 -
3.255 - elif match.group("imagetext"):
3.256 - parts = match.group("imagetext").split("|")
3.257 -
3.258 - # NOTE: Proper detection of external links required.
3.259 -
3.260 - if parts[0].startswith("http"):
3.261 - prefix = ""
3.262 - else:
3.263 - prefix = "attachment:"
3.264 -
3.265 - # NOTE: Proper options conversion required.
3.266 -
3.267 - if len(parts) == 1:
3.268 - return "{{%s%s}}" % (prefix, parts[0])
3.269 - else:
3.270 - return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])
3.271 -
3.272 - else:
3.273 - return match.group()
3.274 -
3.275 -def get_table_rows(text):
3.276 -
3.277 - "Return a list of (cellsep, columns) tuples for the given table 'text'."
3.278 -
3.279 - rows = []
3.280 -
3.281 - for line in text.split("\n"):
3.282 - cellsep = None
3.283 - columns = [""]
3.284 - last = 0
3.285 - for match in table_content_regexp.finditer(line):
3.286 - start, end = match.span()
3.287 - columns[-1] += line[last:start]
3.288 -
3.289 - if match.group("celltype"):
3.290 - if cellsep is None:
3.291 - cellsep = match.group("celltype")
3.292 - columns.append("")
3.293 - else:
3.294 - columns[-1] += match.group()
3.295 -
3.296 - last = end
3.297 -
3.298 - columns[-1] += line[last:]
3.299 -
3.300 - if cellsep:
3.301 - rows.append((cellsep, columns[1:-1]))
3.302 -
3.303 - return rows
3.304 -
3.305 -def translate_content(text, sectiontype=None):
3.306 -
3.307 - """
3.308 - Return a translation of the given 'text'. If the optional 'sectiontype' is
3.309 - specified, the translation may be modified to a form appropriate to the
3.310 - section being translated.
3.311 - """
3.312 -
3.313 - parts = []
3.314 -
3.315 - last = 0
3.316 - for match in content_regexp.finditer(text):
3.317 - start, end = match.span()
3.318 - parts.append(text[last:start])
3.319 -
3.320 - # Handle unformatted sections.
3.321 -
3.322 - if sectiontype in ("code", "noformat"):
3.323 - parts.append(match.group())
3.324 - else:
3.325 - parts.append(translate_content_match(match))
3.326 -
3.327 - last = end
3.328 -
3.329 - parts.append(text[last:])
3.330 - return "".join(parts)
3.331 -
3.332 -# Translation helpers.
3.333 -
3.334 -blocktypes = {
3.335 - "h1" : "= %s =",
3.336 - "h2" : "== %s ==",
3.337 - "h3" : "=== %s ===",
3.338 - "h4" : "==== %s ====",
3.339 - "h5" : "===== %s =====",
3.340 - "h6" : "====== %s ======",
3.341 - "bq" : "{{{%s}}}",
3.342 - }
3.343 -
3.344 -markers = {
3.345 - "*" : "*",
3.346 - "#" : "1.",
3.347 - "-" : "*",
3.348 - }
3.349 -
3.350 -def translate_marker(marker):
3.351 -
3.352 - "Translate the given 'marker' to a suitable Moin representation."
3.353 -
3.354 - return " " * len(marker) + markers[marker[-1]]
3.355 -
3.356 -cellseps = {
3.357 - "|" : "||",
3.358 - "||" : "||",
3.359 - }
3.360 -
3.361 -cellextra = {
3.362 - "|" : "",
3.363 - "||" : "'''",
3.364 - }
3.365 -
3.366 -def translate_cellsep(cellsep):
3.367 -
3.368 - "Translate the given 'cellsep' to a suitable Moin representation."
3.369 -
3.370 - return cellseps[cellsep]
3.371 -
3.372 -def translate_cell(cellsep, text):
3.373 -
3.374 - "Using 'cellsep', translate the cell 'text'."
3.375 -
3.376 - return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]
3.377 -
3.378 -sectiontypes = {
3.379 - "code" : "",
3.380 - "noformat" : "",
3.381 - "quote" : "",
3.382 - "info" : "wiki important",
3.383 - "note" : "wiki caution",
3.384 - "tip" : "wiki tip",
3.385 - "warning" : "wiki warning",
3.386 - }
3.387 -
3.388 -# XML dialect syntax parsing.
3.389 -
3.390 -tags = {
3.391 - # XHTML tag MoinMoin syntax
3.392 - "strong" : "'''%s'''",
3.393 - "em" : "''%s''",
3.394 - "u" : "__%s__",
3.395 - "del" : "--(%s)--",
3.396 - "sup" : "^%s^",
3.397 - "sub" : ",,%s,,",
3.398 - "code" : "`%s`",
3.399 - "pre" : "{{{%s}}}",
3.400 - "blockquote" : " %s",
3.401 - "small" : "~-%s-~",
3.402 - "big" : "~+%s+~",
3.403 - "p" : "%s",
3.404 - "ol" : "%s",
3.405 - "ul" : "%s",
3.406 - "ac:plain-text-body" : "{{{%s}}}",
3.407 - "ac:link" : "[[%s%s|%s]]",
3.408 - }
3.409 -
3.410 -for tag, translation in blocktypes.items():
3.411 - tags[tag] = translation
3.412 -
3.413 -simple_tags = {
3.414 - # XHTML tag MoinMoin syntax
3.415 - "br" : "<<BR>>",
3.416 - }
3.417 -
3.418 -list_tags = {
3.419 - # XHTML list tag MoinMoin list item syntax
3.420 - "ol" : "1. %s",
3.421 - "ul" : "* %s",
3.422 - }
3.423 -
3.424 -indented_tags = ["li", "p"]
3.425 -
3.426 -link_target_tags = {
3.427 - # Confluence element Attribute providing the target
3.428 - "ri:page" : "ri:content-title",
3.429 - "ri:attachment" : "ri:filename",
3.430 - "ri:user" : "ri:username",
3.431 - }
3.432 -
3.433 -macro_rich_text_styles = {
3.434 - # Confluence style MoinMoin admonition style
3.435 - "note" : "caution",
3.436 - "warning" : "warning",
3.437 - "info" : "important",
3.438 - "tip" : "tip",
3.439 - }
3.440 -
3.441 -normalise_regexp_str = r"\s+"
3.442 -normalise_regexp = re.compile(normalise_regexp_str)
3.443 -
3.444 -class ConfluenceXMLParser(Parser):
3.445 -
3.446 - "Handle content from Confluence 4 page revisions."
3.447 -
3.448 - def __init__(self, out):
3.449 - Parser.__init__(self)
3.450 - self.out = out
3.451 -
3.452 - # Link target information.
3.453 -
3.454 - self.target = None
3.455 - self.target_type = None
3.456 -
3.457 - # Macro information.
3.458 -
3.459 - self.macro = None
3.460 - self.macro_parameters = {}
3.461 -
3.462 - # Indentation and preformatted states.
3.463 -
3.464 - self.indent = 0
3.465 - self.states = {}
3.466 - for name in ("pre", "ac:plain-text-body"):
3.467 - self.states[name] = 0
3.468 -
3.469 - # ContentHandler-related methods.
3.470 -
3.471 - def startElement(self, name, attrs):
3.472 - if list_tags.has_key(name):
3.473 - self.indent += 1
3.474 - elif self.states.has_key(name):
3.475 - self.states[name] += 1
3.476 - Parser.startElement(self, name, attrs)
3.477 -
3.478 - def endElement(self, name):
3.479 - Parser.endElement(self, name)
3.480 - if list_tags.has_key(name):
3.481 - self.indent -= 1
3.482 - elif self.states.has_key(name):
3.483 - self.states[name] -= 1
3.484 -
3.485 - def characters(self, content):
3.486 - if not self.is_preformatted():
3.487 - content = self.normalise(content, self.elements[-1])
3.488 - Parser.characters(self, content)
3.489 -
3.490 - def skippedEntity(self, name):
3.491 - ch = htmlentitydefs.name2codepoint.get(name)
3.492 - if ch:
3.493 - self.text[-1].append(unichr(ch))
3.494 -
3.495 - # Parser-related methods.
3.496 -
3.497 - def handleElement(self, name):
3.498 - text = "".join(self.text[-1])
3.499 - conversion = None
3.500 -
3.501 - # Handle list elements.
3.502 -
3.503 - if name == "li" and len(self.elements) > 1:
3.504 - list_tag = self.elements[-2]
3.505 - conversion = list_tags.get(list_tag)
3.506 -
3.507 - # Remember link target information.
3.508 -
3.509 - elif link_target_tags.has_key(name):
3.510 - self.target = self.attributes[-1].get(link_target_tags[name])
3.511 - self.target_type = name
3.512 - text = ""
3.513 -
3.514 - # Remember macro information.
3.515 -
3.516 - elif name == "ac:parameter":
3.517 - self.macro_parameters[self.attributes[-1].get("ac:name")] = text
3.518 - text = ""
3.519 -
3.520 - elif name == "ac:macro":
3.521 - self.macro = self.attributes[-1].get("ac:name")
3.522 -
3.523 - # Handle the common case.
3.524 -
3.525 - else:
3.526 - conversion = tags.get(name)
3.527 -
3.528 - # Attempt to convert the text.
3.529 -
3.530 - # Links require target information.
3.531 - # NOTE: User links should support the intended user namespace prefix.
3.532 -
3.533 - if name == "ac:link":
3.534 - if self.target_type == "ri:attachment":
3.535 - prefix = "attachment:"
3.536 - elif self.target_type == "ri:user":
3.537 - prefix = ""
3.538 - else:
3.539 - prefix = "../"
3.540 -
3.541 - text = conversion % (prefix, self.target, text or self.target)
3.542 - self.target = self.target_type = None
3.543 -
3.544 - # Macro name information is used to style rich text body regions.
3.545 -
3.546 - elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
3.547 - details = macro_rich_text_styles[self.macro]
3.548 - title = self.macro_parameters.get("title")
3.549 - if title:
3.550 - details = "%s\n\n%s" % (details, title)
3.551 - text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
3.552 - self.macro = None
3.553 - self.macro_parameters = {}
3.554 -
3.555 - # Handle the common case.
3.556 -
3.557 - elif text and conversion:
3.558 - text = conversion % text
3.559 - elif simple_tags.has_key(name):
3.560 - text = simple_tags[name]
3.561 -
3.562 - # Normalise leading whitespace and indent the text if appropriate.
3.563 -
3.564 - if name in indented_tags:
3.565 - text = " " * self.indent + text.lstrip()
3.566 -
3.567 - # Add the converted text to the end of the parent element's text nodes.
3.568 -
3.569 - if len(self.text) > 1:
3.570 - nodes = self.text[-2]
3.571 - if "".join(self.text[-2]):
3.572 - parent = self.elements[-2]
3.573 - if parent == "body":
3.574 - nodes.append("\n\n")
3.575 - elif list_tags.has_key(parent):
3.576 - nodes.append("\n")
3.577 - elif list_tags.has_key(name) and parent == "li":
3.578 - nodes.append("\n")
3.579 - nodes.append(text)
3.580 -
3.581 - # Otherwise, emit the text.
3.582 -
3.583 - else:
3.584 - self.out.write(text)
3.585 -
3.586 - def is_preformatted(self):
3.587 - return reduce(operator.or_, self.states.values(), False)
3.588 -
3.589 - # Whitespace normalisation.
3.590 -
3.591 - def get_replacement(self, name):
3.592 - if name in ("html", "body") or list_tags.has_key(name):
3.593 - return ""
3.594 - else:
3.595 - return " "
3.596 -
3.597 - def normalise(self, text, name):
3.598 - return normalise_regexp.sub(self.get_replacement(name), text)
3.599 -
3.600 -def xmlparse(s, out):
3.601 -
3.602 - "Parse the content in the string 's', writing a translation to 'out'."
3.603 -
3.604 - # NOTE: CDATA sections appear to have erroneous endings.
3.605 -
3.606 - s = u"""\
3.607 -<?xml version="1.0"?>
3.608 -<!DOCTYPE html
3.609 - PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
3.610 - "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3.611 -<html xmlns="http://www.w3.org/1999/xhtml">
3.612 -<body>
3.613 -%s
3.614 -</body>
3.615 -</html>""" % s.replace("]] >", "]]>")
3.616 -
3.617 - f = StringIO(s.encode("utf-8"))
3.618 - try:
3.619 - parser = ConfluenceXMLParser(out)
3.620 - parser.parse(f)
3.621 - finally:
3.622 - f.close()
3.623 -
3.624 -# General parsing.
3.625 -
3.626 -def parse(s, out):
3.627 -
3.628 - "Parse the content in the string 's', writing a translation to 'out'."
3.629 -
3.630 - for type, text in get_regions(s):
3.631 -
3.632 - # Handle list, heading, blockquote or anonymous blocks.
3.633 -
3.634 - if type is None:
3.635 - for blocktype, blocktext in get_blocks(text):
3.636 -
3.637 - # Translate headings and blockquotes.
3.638 -
3.639 - if blocktypes.has_key(blocktype):
3.640 - print >>out, blocktypes[blocktype] % blocktext
3.641 -
3.642 - # Translate list items.
3.643 -
3.644 - elif blocktype == "list":
3.645 - for listmarker, listitem in get_list_items(blocktext):
3.646 - print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))
3.647 -
3.648 - # Translate table items.
3.649 -
3.650 - elif blocktype == "table":
3.651 - for cellsep, columns in get_table_rows(blocktext):
3.652 - moinsep = translate_cellsep(cellsep)
3.653 - print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep
3.654 -
3.655 - # Handle anonymous blocks.
3.656 -
3.657 - else:
3.658 - print >>out, translate_content(blocktext.rstrip())
3.659 -
3.660 - print >>out
3.661 -
3.662 - # Handle sections.
3.663 -
3.664 - else:
3.665 - sectiontype, options = type
3.666 -
3.667 - # Direct translations of sections.
3.668 -
3.669 - mointype = sectiontypes.get(sectiontype)
3.670 - if mointype:
3.671 - print >>out, "{{{#!%s" % mointype
3.672 - if options:
3.673 - print >>out, "##", options
3.674 - else:
3.675 - print >>out, "{{{",
3.676 - print >>out, translate_content(text, sectiontype),
3.677 - print >>out, "}}}"
3.678 - print >>out
3.679 -
3.680 -if __name__ == "__main__":
3.681 - s = sys.stdin.read()
3.682 - if "--xml" in sys.argv:
3.683 - xmlparse(s, sys.stdout)
3.684 - else:
3.685 - parse(s, sys.stdout)
3.686 -
3.687 -# vim: tabstop=4 expandtab shiftwidth=4
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/wikiparser.py Fri Feb 22 22:50:30 2013 +0100
4.3 @@ -0,0 +1,426 @@
4.4 +#!/usr/bin/env python
4.5 +
4.6 +"""
4.7 +Confluence Wiki syntax parsing.
4.8 +
4.9 +Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
4.10 +
4.11 +This software is free software; you can redistribute it and/or
4.12 +modify it under the terms of the GNU General Public License as
4.13 +published by the Free Software Foundation; either version 2 of
4.14 +the License, or (at your option) any later version.
4.15 +
4.16 +This software is distributed in the hope that it will be useful,
4.17 +but WITHOUT ANY WARRANTY; without even the implied warranty of
4.18 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
4.19 +GNU General Public License for more details.
4.20 +
4.21 +You should have received a copy of the GNU General Public
4.22 +License along with this library; see the file LICENCE.txt
4.23 +If not, write to the Free Software Foundation, Inc.,
4.24 +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
4.25 +
4.26 +--------
4.27 +
4.28 +The basic procedure is as follows:
4.29 +
4.30 + 1. Wiki pages are first split up into regions.
4.31 + 2. Then, within these regions, the text is split into blocks.
4.32 + 1. First, lists are identified.
4.33 + 2. Additionally, other block-like elements are identified.
4.34 + 3. Each block is then parsed.
4.35 +"""
4.36 +
4.37 +from common import *
4.38 +import re
4.39 +import sys
4.40 +
4.41 +# Section extraction.
4.42 +
4.43 +sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
4.44 +sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)
4.45 +
4.46 +def get_regions(s):
4.47 +
4.48 + """
4.49 + Return a list of regions from 's'. Each region is specified using a tuple of
4.50 + the form (type, text).
4.51 + """
4.52 +
4.53 + last = 0
4.54 + regions = []
4.55 + for match in sections_regexp.finditer(s):
4.56 + start, end = match.span()
4.57 + regions.append((None, s[last:start]))
4.58 + regions.append(get_section_details(s[start:end]))
4.59 + last = end
4.60 + regions.append((None, s[last:]))
4.61 + return regions
4.62 +
4.63 +# Section inspection.
4.64 +
4.65 +section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
4.66 +section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)
4.67 +
4.68 +def get_section_details(s):
4.69 +
4.70 + "Return the details of a section 's' in the form (type, text)."
4.71 +
4.72 + match = section_regexp.match(s)
4.73 + if match:
4.74 + return (match.group("sectiontype"), match.group("options")), match.group("section")
4.75 + else:
4.76 + return None, s
4.77 +
4.78 +# Heading, table and list extraction.
4.79 +
4.80 +list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
4.81 +table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
4.82 +blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
4.83 +
4.84 +blockelement_regexp = re.compile(
4.85 + "(" + list_regexp_str + ")"
4.86 + "|"
4.87 + "(" + table_regexp_str + ")"
4.88 + "|"
4.89 + "(" + blocktext_regexp_str + ")",
4.90 + re.MULTILINE
4.91 + )
4.92 +
4.93 +def get_block_elements(s):
4.94 +
4.95 + """
4.96 + Extract headings, tables and lists from the given string 's'.
4.97 + """
4.98 +
4.99 + last = 0
4.100 + blocks = []
4.101 + for match in blockelement_regexp.finditer(s):
4.102 + start, end = match.span()
4.103 + matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")
4.104 + blocks.append((None, s[last:start]))
4.105 + blocks.append((matchtype, match.group("text") or s[start:end]))
4.106 + last = end
4.107 + blocks.append((None, s[last:]))
4.108 + return blocks
4.109 +
4.110 +# Block extraction.
4.111 +
4.112 +block_regexp_str = r"^(?:\s*\n)+"
4.113 +block_regexp = re.compile(block_regexp_str, re.MULTILINE)
4.114 +
4.115 +def get_basic_blocks(s):
4.116 +
4.117 + """
4.118 + Return blocks from the given string 's' by splitting the text on blank lines
4.119 + and eliminating those lines.
4.120 + """
4.121 +
4.122 + return [b for b in block_regexp.split(s) if b.strip()]
4.123 +
4.124 +# Block inspection.
4.125 +
4.126 +def get_blocks(s):
4.127 +
4.128 + """
4.129 + Return blocks from the given string 's', inspecting the basic blocks and
4.130 + generating additional block-level text where appropriate.
4.131 + """
4.132 +
4.133 + blocks = []
4.134 +
4.135 + for blocktype, blocktext in get_block_elements(s):
4.136 +
4.137 + # Collect heading, list and table blocks.
4.138 +
4.139 + if blocktype is not None:
4.140 + blocks.append((blocktype, blocktext))
4.141 +
4.142 + # Attempt to find new subblocks in other regions.
4.143 +
4.144 + else:
4.145 + for block in get_basic_blocks(blocktext):
4.146 + blocks.append((None, block))
4.147 +
4.148 + return blocks
4.149 +
4.150 +# List item inspection.
4.151 +
4.152 +listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
4.153 +listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
4.154 +
4.155 +def get_list_items(text):
4.156 +
4.157 + "Return a list of (marker, text) tuples for the given list 'text'."
4.158 +
4.159 + items = []
4.160 +
4.161 + for match in listitem_regexp.finditer(text):
4.162 + items.append((match.group("marker"), match.group("text")))
4.163 +
4.164 + return items
4.165 +
4.166 +# Table row inspection.
4.167 +
4.168 +monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
4.169 +link_regexp_str = r"[[](?P<linktext>.*?)]"
4.170 +image_regexp_str = r"!(?P<imagetext>.*?)!"
4.171 +cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
4.172 +
4.173 +content_regexp_str = (
4.174 + "(" + monospace_regexp_str + ")"
4.175 + "|"
4.176 + "(" + link_regexp_str + ")"
4.177 + "|"
4.178 + "(" + image_regexp_str + ")"
4.179 + )
4.180 +
4.181 +table_content_regexp_str = (
4.182 + content_regexp_str +
4.183 + "|"
4.184 + "(" + cellsep_regexp_str + ")"
4.185 + )
4.186 +
4.187 +content_regexp = re.compile(content_regexp_str)
4.188 +table_content_regexp = re.compile(table_content_regexp_str)
4.189 +
4.190 +def translate_content_match(match):
4.191 +
4.192 + "Translate the content described by the given 'match', returning a string."
4.193 +
4.194 + if match.group("monotext"):
4.195 + return "{{{%s}}}" % match.group("monotext")
4.196 +
4.197 + elif match.group("linktext"):
4.198 + parts = match.group("linktext").split("|")
4.199 +
4.200 + # NOTE: Proper detection of external links required.
4.201 +
4.202 + if len(parts) == 1:
4.203 + label, target, title = None, parts[0], None
4.204 + elif len(parts) == 2:
4.205 + (label, target), title = parts, None
4.206 + else:
4.207 + label, target, title = parts
4.208 +
4.209 + target = target.strip()
4.210 +
4.211 + # Look for namespace links and rewrite them.
4.212 +
4.213 + if target.find(":") != -1:
4.214 + prefix = ""
4.215 + space, rest = target.split(":", 1)
4.216 + if space not in URL_SCHEMES:
4.217 + target = "%s/%s" % (space, rest)
4.218 +
4.219 + # Detect anchors.
4.220 +
4.221 + elif target.startswith("#"):
4.222 + prefix = ""
4.223 +
4.224 + # Detect attachments.
4.225 +
4.226 + elif target.startswith("^"):
4.227 + prefix = "attachment:"
4.228 +
4.229 + # Link to other pages within a space.
4.230 +
4.231 + else:
4.232 + prefix = "../"
4.233 +
4.234 + # Make the link tidier by making a target if none was given.
4.235 +
4.236 + if not label:
4.237 + label = target
4.238 +
4.239 + if not label and not title:
4.240 + return "[[%s%s]]" % (prefix, target)
4.241 + elif not title:
4.242 + return "[[%s%s|%s]]" % (prefix, target, label)
4.243 + else:
4.244 + return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)
4.245 +
4.246 + elif match.group("imagetext"):
4.247 + parts = match.group("imagetext").split("|")
4.248 +
4.249 + # NOTE: Proper detection of external links required.
4.250 +
4.251 + if parts[0].startswith("http"):
4.252 + prefix = ""
4.253 + else:
4.254 + prefix = "attachment:"
4.255 +
4.256 + # NOTE: Proper options conversion required.
4.257 +
4.258 + if len(parts) == 1:
4.259 + return "{{%s%s}}" % (prefix, parts[0])
4.260 + else:
4.261 + return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])
4.262 +
4.263 + else:
4.264 + return match.group()
4.265 +
4.266 +def get_table_rows(text):
4.267 +
4.268 + "Return a list of (cellsep, columns) tuples for the given table 'text'."
4.269 +
4.270 + rows = []
4.271 +
4.272 + for line in text.split("\n"):
4.273 + cellsep = None
4.274 + columns = [""]
4.275 + last = 0
4.276 + for match in table_content_regexp.finditer(line):
4.277 + start, end = match.span()
4.278 + columns[-1] += line[last:start]
4.279 +
4.280 + if match.group("celltype"):
4.281 + if cellsep is None:
4.282 + cellsep = match.group("celltype")
4.283 + columns.append("")
4.284 + else:
4.285 + columns[-1] += match.group()
4.286 +
4.287 + last = end
4.288 +
4.289 + columns[-1] += line[last:]
4.290 +
4.291 + if cellsep:
4.292 + rows.append((cellsep, columns[1:-1]))
4.293 +
4.294 + return rows
4.295 +
def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    output = []
    pos = 0

    for m in content_regexp.finditer(text):
        begin, finish = m.span()
        output.append(text[pos:begin])

        # Code and preformatted sections pass their markup through verbatim;
        # everything else is translated match by match.

        unformatted = sectiontype in ("code", "noformat")
        output.append(m.group() if unformatted else translate_content_match(m))

        pos = finish

    output.append(text[pos:])
    return "".join(output)
4.322 +
4.323 +# Translation helpers.
4.324 +
# Confluence list markers mapped to MoinMoin list item markers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # The marker length determines the Moin indentation depth; the final
    # marker character selects the bullet or numbering style.

    indent = len(marker) * " "
    return indent + markers[marker[-1]]
4.336 +
# Confluence cell separators mapped to the Moin cell separator.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Extra markup wrapped around cell content: Confluence header cells ("||")
# become bold cells in Moin.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]
4.352 +
def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    # Header cells gain surrounding emphasis markup; plain cells gain none.

    extra = cellextra[cellsep]
    return "%s%s%s" % (extra, translate_content(text), extra)
4.358 +
# Confluence section types mapped to MoinMoin parser specifications, used
# after "{{{#!" when opening a section. An empty value means the section is
# emitted as a plain "{{{...}}}" region without a parser line.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }
4.368 +
4.369 +# General parsing.
4.370 +
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    Regions are obtained from get_regions: plain wiki text regions carry a
    type of None, whereas sections carry a (sectiontype, options) tuple as
    their type.
    """

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes using the patterns from
                # the blocktypes table.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items, emitting each row in Moin's
                # separator-delimited form.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

                print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections: a Moin parser line is emitted
            # for section types with a non-empty sectiontypes entry.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                # The trailing commas on the following print statements
                # suppress the newline (Python 2 print semantics), keeping
                # the braces adjacent to the section content.
                print >>out, "{{{",
            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out
4.424 +
if __name__ == "__main__":
    # Filter mode: translate standard input to standard output.
    parse(sys.stdin.read(), sys.stdout)
4.428 +
4.429 +# vim: tabstop=4 expandtab shiftwidth=4
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
5.2 +++ b/xmlparser.py Fri Feb 22 22:50:30 2013 +0100
5.3 @@ -0,0 +1,276 @@
5.4 +#!/usr/bin/env python
5.5 +
5.6 +"""
5.7 +Confluence Wiki XML/XHTML syntax parsing.
5.8 +
5.9 +Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
5.10 +
5.11 +This software is free software; you can redistribute it and/or
5.12 +modify it under the terms of the GNU General Public License as
5.13 +published by the Free Software Foundation; either version 2 of
5.14 +the License, or (at your option) any later version.
5.15 +
5.16 +This software is distributed in the hope that it will be useful,
5.17 +but WITHOUT ANY WARRANTY; without even the implied warranty of
5.18 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
5.19 +GNU General Public License for more details.
5.20 +
5.21 +You should have received a copy of the GNU General Public
5.22 +License along with this library; see the file LICENCE.txt
5.23 +If not, write to the Free Software Foundation, Inc.,
5.24 +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
5.25 +"""
5.26 +
5.27 +try:
5.28 + from cStringIO import StringIO
5.29 +except ImportError:
5.30 + from StringIO import StringIO
5.31 +
5.32 +from common import *
5.33 +from xmlread import Parser
5.34 +import re
5.35 +import sys
5.36 +import operator
5.37 +import htmlentitydefs
5.38 +
5.39 +# XML dialect syntax parsing.
5.40 +
# Element names mapped to Moin syntax patterns; the element's accumulated
# text is substituted into the %s placeholders.

tags = {
    # XHTML tag          MoinMoin syntax
    "strong" : "'''%s'''",
    "em" : "''%s''",
    "u" : "__%s__",
    "del" : "--(%s)--",
    "sup" : "^%s^",
    "sub" : ",,%s,,",
    "code" : "`%s`",
    "pre" : "{{{%s}}}",
    "blockquote" : " %s",
    "small" : "~-%s-~",
    "big" : "~+%s+~",
    "p" : "%s",
    "ol" : "%s",
    "ul" : "%s",
    "ac:plain-text-body" : "{{{%s}}}",
    "ac:link" : "[[%s%s|%s]]",
    }

# Headings and blockquotes reuse the translations defined in common.py.

for tag, translation in blocktypes.items():
    tags[tag] = translation

# Empty elements translated to a fixed piece of Moin syntax.

simple_tags = {
    # XHTML tag          MoinMoin syntax
    "br" : "<<BR>>",
    }

# List container elements mapped to Moin list item patterns.

list_tags = {
    # XHTML list tag     MoinMoin list item syntax
    "ol" : "1. %s",
    "ul" : "* %s",
    }

# Elements whose text is indented according to the list nesting depth.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element Attribute providing the target
    "ri:page" : "ri:content-title",
    "ri:attachment" : "ri:filename",
    "ri:user" : "ri:username",
    }

macro_rich_text_styles = {
    # Confluence style   MoinMoin admonition style
    "note" : "caution",
    "warning" : "warning",
    "info" : "important",
    "tip" : "tip",
    }

# Pattern matching runs of whitespace for normalisation.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
5.94 +
class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        "Initialise the parser, writing the translation to the 'out' stream."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and preformatted states. Each state is a counter
        # incremented on entry to the named element and decremented on exit.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):
        # Track list nesting depth and entry into preformatted regions
        # before delegating to the base class.
        if list_tags.has_key(name):
            self.indent += 1
        elif self.states.has_key(name):
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):
        Parser.endElement(self, name)
        # Reverse the bookkeeping performed in startElement.
        if list_tags.has_key(name):
            self.indent -= 1
        elif self.states.has_key(name):
            self.states[name] -= 1

    def characters(self, content):
        # Outside preformatted regions, collapse whitespace runs before
        # accumulating the text.
        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):
        # Convert named HTML entities to their Unicode characters, appending
        # them to the current element's text. Unknown entities are dropped.
        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the completed element with the given 'name' to its Moin
        representation, appending the result to the parent element's text
        nodes or, at the top level, writing it to the output stream.

        NOTE(review): self.text, self.elements and self.attributes are stacks
        maintained by the base Parser class (xmlread) — confirm against that
        module.
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            elif self.target_type == "ri:user":
                prefix = ""
            else:
                prefix = "../"

            # The target doubles as the label when the link has no text.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text
        # nodes, separating it from existing sibling text: blank lines
        # between top-level blocks, single newlines between list items and
        # between a nested list and its parent item.

        if len(self.text) > 1:
            nodes = self.text[-2]
            if "".join(self.text[-2]):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif list_tags.has_key(parent):
                    nodes.append("\n")
                elif list_tags.has_key(name) and parent == "li":
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return reduce(operator.or_, self.states.values(), False)

    # Whitespace normalisation.

    def get_replacement(self, name):

        # Purely structural elements contribute no whitespace of their own;
        # elsewhere whitespace runs collapse to a single space.

        if name in ("html", "body") or list_tags.has_key(name):
            return ""
        else:
            return " "

    def normalise(self, text, name):

        "Normalise whitespace in 'text' belonging to the element 'name'."

        return normalise_regexp.sub(self.get_replacement(name), text)
5.250 +
def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    content = s.replace("]] >", "]]>")

    # Wrap the fragment in an XHTML document so that a well-formed tree is
    # presented to the parser.

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % content

    stream = StringIO(document.encode("utf-8"))
    try:
        ConfluenceXMLParser(out).parse(stream)
    finally:
        stream.close()
5.274 +
if __name__ == "__main__":
    # Filter mode: translate standard input to standard output.
    parse(sys.stdin.read(), sys.stdout)
5.278 +
5.279 +# vim: tabstop=4 expandtab shiftwidth=4