1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/common.py Fri Feb 22 22:50:30 2013 +0100
1.3 @@ -0,0 +1,38 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Common parsing data.
1.8 +
1.9 +Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This software is free software; you can redistribute it and/or
1.12 +modify it under the terms of the GNU General Public License as
1.13 +published by the Free Software Foundation; either version 2 of
1.14 +the License, or (at your option) any later version.
1.15 +
1.16 +This software is distributed in the hope that it will be useful,
1.17 +but WITHOUT ANY WARRANTY; without even the implied warranty of
1.18 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1.19 +GNU General Public License for more details.
1.20 +
1.21 +You should have received a copy of the GNU General Public
1.22 +License along with this library; see the file LICENCE.txt
1.23 +If not, write to the Free Software Foundation, Inc.,
1.24 +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
1.25 +"""
1.26 +
1.27 +URL_SCHEMES = ("http", "https", "ftp", "mailto")
1.28 +
1.29 +# Translation helpers.
1.30 +
1.31 +blocktypes = {
1.32 + "h1" : "= %s =",
1.33 + "h2" : "== %s ==",
1.34 + "h3" : "=== %s ===",
1.35 + "h4" : "==== %s ====",
1.36 + "h5" : "===== %s =====",
1.37 + "h6" : "====== %s ======",
1.38 + "bq" : "{{{%s}}}",
1.39 + }
1.40 +
1.41 +# vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/convert.py Sun Feb 17 20:36:11 2013 +0100
2.2 +++ b/convert.py Fri Feb 22 22:50:30 2013 +0100
2.3 @@ -27,7 +27,7 @@
2.4 from cStringIO import StringIO
2.5 import codecs
2.6 import xmlread
2.7 -import parser
2.8 +import wikiparser, xmlparser
2.9 import sys
2.10
2.11 MAX_TITLE_LENGTH = 120
2.12 @@ -226,7 +226,7 @@
2.13 'body'.
2.14 """
2.15
2.16 - fn = fn or parser.parse
2.17 + fn = fn or wikiparser.parse
2.18
2.19 out = codecs.open(filename, "w", encoding="utf-8")
2.20 try:
2.21 @@ -235,7 +235,7 @@
2.22 out.close()
2.23
2.24 def xmltranslate(filename, body):
2.25 - translate(filename, body, parser.xmlparse)
2.26 + translate(filename, body, xmlparser.parse)
2.27
2.28 def sort_comments(pages_dir, pageid):
2.29
3.1 --- a/parser.py Sun Feb 17 20:36:11 2013 +0100
3.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
3.3 @@ -1,684 +0,0 @@
3.4 -#!/usr/bin/env python
3.5 -
3.6 -"""
3.7 -Confluence Wiki syntax parsing.
3.8 -
3.9 -Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
3.10 -
3.11 -This software is free software; you can redistribute it and/or
3.12 -modify it under the terms of the GNU General Public License as
3.13 -published by the Free Software Foundation; either version 2 of
3.14 -the License, or (at your option) any later version.
3.15 -
3.16 -This software is distributed in the hope that it will be useful,
3.17 -but WITHOUT ANY WARRANTY; without even the implied warranty of
3.18 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3.19 -GNU General Public License for more details.
3.20 -
3.21 -You should have received a copy of the GNU General Public
3.22 -License along with this library; see the file LICENCE.txt
3.23 -If not, write to the Free Software Foundation, Inc.,
3.24 -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
3.25 -
3.26 ---------
3.27 -
3.28 -The basic procedure is as follows:
3.29 -
3.30 - 1. Wiki pages are first split up into regions.
3.31 - 2. Then, within these regions, the text is split into blocks.
3.32 - 1. First, lists are identified.
3.33 - 2. Additionally, other block-like elements are identified.
3.34 - 3. Each block is then parsed.
3.35 -"""
3.36 -
3.37 -try:
3.38 - from cStringIO import StringIO
3.39 -except ImportError:
3.40 - from StringIO import StringIO
3.41 -
3.42 -from xmlread import Parser
3.43 -import re
3.44 -import sys
3.45 -import operator
3.46 -import htmlentitydefs
3.47 -
3.48 -URL_SCHEMES = ("http", "https", "ftp", "mailto")
3.49 -
3.50 -# Section extraction.
3.51 -
3.52 -sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
3.53 -sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)
3.54 -
3.55 -def get_regions(s):
3.56 -
3.57 - """
3.58 - Return a list of regions from 's'. Each region is specified using a tuple of
3.59 - the form (type, text).
3.60 - """
3.61 -
3.62 - last = 0
3.63 - regions = []
3.64 - for match in sections_regexp.finditer(s):
3.65 - start, end = match.span()
3.66 - regions.append((None, s[last:start]))
3.67 - regions.append(get_section_details(s[start:end]))
3.68 - last = end
3.69 - regions.append((None, s[last:]))
3.70 - return regions
3.71 -
3.72 -# Section inspection.
3.73 -
3.74 -section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
3.75 -section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)
3.76 -
3.77 -def get_section_details(s):
3.78 -
3.79 - "Return the details of a section 's' in the form (type, text)."
3.80 -
3.81 - match = section_regexp.match(s)
3.82 - if match:
3.83 - return (match.group("sectiontype"), match.group("options")), match.group("section")
3.84 - else:
3.85 - return None, s
3.86 -
3.87 -# Heading, table and list extraction.
3.88 -
3.89 -list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
3.90 -table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
3.91 -blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
3.92 -
3.93 -blockelement_regexp = re.compile(
3.94 - "(" + list_regexp_str + ")"
3.95 - "|"
3.96 - "(" + table_regexp_str + ")"
3.97 - "|"
3.98 - "(" + blocktext_regexp_str + ")",
3.99 - re.MULTILINE
3.100 - )
3.101 -
3.102 -def get_block_elements(s):
3.103 -
3.104 - """
3.105 - Extract headings, tables and lists from the given string 's'.
3.106 - """
3.107 -
3.108 - last = 0
3.109 - blocks = []
3.110 - for match in blockelement_regexp.finditer(s):
3.111 - start, end = match.span()
3.112 - matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")
3.113 - blocks.append((None, s[last:start]))
3.114 - blocks.append((matchtype, match.group("text") or s[start:end]))
3.115 - last = end
3.116 - blocks.append((None, s[last:]))
3.117 - return blocks
3.118 -
3.119 -# Block extraction.
3.120 -
3.121 -block_regexp_str = r"^(?:\s*\n)+"
3.122 -block_regexp = re.compile(block_regexp_str, re.MULTILINE)
3.123 -
3.124 -def get_basic_blocks(s):
3.125 -
3.126 - """
3.127 - Return blocks from the given string 's' by splitting the text on blank lines
3.128 - and eliminating those lines.
3.129 - """
3.130 -
3.131 - return [b for b in block_regexp.split(s) if b.strip()]
3.132 -
3.133 -# Block inspection.
3.134 -
3.135 -def get_blocks(s):
3.136 -
3.137 - """
3.138 - Return blocks from the given string 's', inspecting the basic blocks and
3.139 - generating additional block-level text where appropriate.
3.140 - """
3.141 -
3.142 - blocks = []
3.143 -
3.144 - for blocktype, blocktext in get_block_elements(s):
3.145 -
3.146 - # Collect heading, list and table blocks.
3.147 -
3.148 - if blocktype is not None:
3.149 - blocks.append((blocktype, blocktext))
3.150 -
3.151 - # Attempt to find new subblocks in other regions.
3.152 -
3.153 - else:
3.154 - for block in get_basic_blocks(blocktext):
3.155 - blocks.append((None, block))
3.156 -
3.157 - return blocks
3.158 -
3.159 -# List item inspection.
3.160 -
3.161 -listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
3.162 -listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
3.163 -
3.164 -def get_list_items(text):
3.165 -
3.166 - "Return a list of (marker, text) tuples for the given list 'text'."
3.167 -
3.168 - items = []
3.169 -
3.170 - for match in listitem_regexp.finditer(text):
3.171 - items.append((match.group("marker"), match.group("text")))
3.172 -
3.173 - return items
3.174 -
3.175 -# Table row inspection.
3.176 -
3.177 -monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
3.178 -link_regexp_str = r"[[](?P<linktext>.*?)]"
3.179 -image_regexp_str = r"!(?P<imagetext>.*?)!"
3.180 -cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
3.181 -
3.182 -content_regexp_str = (
3.183 - "(" + monospace_regexp_str + ")"
3.184 - "|"
3.185 - "(" + link_regexp_str + ")"
3.186 - "|"
3.187 - "(" + image_regexp_str + ")"
3.188 - )
3.189 -
3.190 -table_content_regexp_str = (
3.191 - content_regexp_str +
3.192 - "|"
3.193 - "(" + cellsep_regexp_str + ")"
3.194 - )
3.195 -
3.196 -content_regexp = re.compile(content_regexp_str)
3.197 -table_content_regexp = re.compile(table_content_regexp_str)
3.198 -
3.199 -def translate_content_match(match):
3.200 -
3.201 - "Translate the content described by the given 'match', returning a string."
3.202 -
3.203 - if match.group("monotext"):
3.204 - return "{{{%s}}}" % match.group("monotext")
3.205 -
3.206 - elif match.group("linktext"):
3.207 - parts = match.group("linktext").split("|")
3.208 -
3.209 - # NOTE: Proper detection of external links required.
3.210 -
3.211 - if len(parts) == 1:
3.212 - label, target, title = None, parts[0], None
3.213 - elif len(parts) == 2:
3.214 - (label, target), title = parts, None
3.215 - else:
3.216 - label, target, title = parts
3.217 -
3.218 - target = target.strip()
3.219 -
3.220 - # Look for namespace links and rewrite them.
3.221 -
3.222 - if target.find(":") != -1:
3.223 - prefix = ""
3.224 - space, rest = target.split(":", 1)
3.225 - if space not in URL_SCHEMES:
3.226 - target = "%s/%s" % (space, rest)
3.227 -
3.228 - # Detect anchors.
3.229 -
3.230 - elif target.startswith("#"):
3.231 - prefix = ""
3.232 -
3.233 - # Detect attachments.
3.234 -
3.235 - elif target.startswith("^"):
3.236 - prefix = "attachment:"
3.237 -
3.238 - # Link to other pages within a space.
3.239 -
3.240 - else:
3.241 - prefix = "../"
3.242 -
3.243 - # Make the link tidier by making a target if none was given.
3.244 -
3.245 - if not label:
3.246 - label = target
3.247 -
3.248 - if not label and not title:
3.249 - return "[[%s%s]]" % (prefix, target)
3.250 - elif not title:
3.251 - return "[[%s%s|%s]]" % (prefix, target, label)
3.252 - else:
3.253 - return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)
3.254 -
3.255 - elif match.group("imagetext"):
3.256 - parts = match.group("imagetext").split("|")
3.257 -
3.258 - # NOTE: Proper detection of external links required.
3.259 -
3.260 - if parts[0].startswith("http"):
3.261 - prefix = ""
3.262 - else:
3.263 - prefix = "attachment:"
3.264 -
3.265 - # NOTE: Proper options conversion required.
3.266 -
3.267 - if len(parts) == 1:
3.268 - return "{{%s%s}}" % (prefix, parts[0])
3.269 - else:
3.270 - return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])
3.271 -
3.272 - else:
3.273 - return match.group()
3.274 -
3.275 -def get_table_rows(text):
3.276 -
3.277 - "Return a list of (cellsep, columns) tuples for the given table 'text'."
3.278 -
3.279 - rows = []
3.280 -
3.281 - for line in text.split("\n"):
3.282 - cellsep = None
3.283 - columns = [""]
3.284 - last = 0
3.285 - for match in table_content_regexp.finditer(line):
3.286 - start, end = match.span()
3.287 - columns[-1] += line[last:start]
3.288 -
3.289 - if match.group("celltype"):
3.290 - if cellsep is None:
3.291 - cellsep = match.group("celltype")
3.292 - columns.append("")
3.293 - else:
3.294 - columns[-1] += match.group()
3.295 -
3.296 - last = end
3.297 -
3.298 - columns[-1] += line[last:]
3.299 -
3.300 - if cellsep:
3.301 - rows.append((cellsep, columns[1:-1]))
3.302 -
3.303 - return rows
3.304 -
3.305 -def translate_content(text, sectiontype=None):
3.306 -
3.307 - """
3.308 - Return a translation of the given 'text'. If the optional 'sectiontype' is
3.309 - specified, the translation may be modified to a form appropriate to the
3.310 - section being translated.
3.311 - """
3.312 -
3.313 - parts = []
3.314 -
3.315 - last = 0
3.316 - for match in content_regexp.finditer(text):
3.317 - start, end = match.span()
3.318 - parts.append(text[last:start])
3.319 -
3.320 - # Handle unformatted sections.
3.321 -
3.322 - if sectiontype in ("code", "noformat"):
3.323 - parts.append(match.group())
3.324 - else:
3.325 - parts.append(translate_content_match(match))
3.326 -
3.327 - last = end
3.328 -
3.329 - parts.append(text[last:])
3.330 - return "".join(parts)
3.331 -
3.332 -# Translation helpers.
3.333 -
3.334 -blocktypes = {
3.335 - "h1" : "= %s =",
3.336 - "h2" : "== %s ==",
3.337 - "h3" : "=== %s ===",
3.338 - "h4" : "==== %s ====",
3.339 - "h5" : "===== %s =====",
3.340 - "h6" : "====== %s ======",
3.341 - "bq" : "{{{%s}}}",
3.342 - }
3.343 -
3.344 -markers = {
3.345 - "*" : "*",
3.346 - "#" : "1.",
3.347 - "-" : "*",
3.348 - }
3.349 -
3.350 -def translate_marker(marker):
3.351 -
3.352 - "Translate the given 'marker' to a suitable Moin representation."
3.353 -
3.354 - return " " * len(marker) + markers[marker[-1]]
3.355 -
3.356 -cellseps = {
3.357 - "|" : "||",
3.358 - "||" : "||",
3.359 - }
3.360 -
3.361 -cellextra = {
3.362 - "|" : "",
3.363 - "||" : "'''",
3.364 - }
3.365 -
3.366 -def translate_cellsep(cellsep):
3.367 -
3.368 - "Translate the given 'cellsep' to a suitable Moin representation."
3.369 -
3.370 - return cellseps[cellsep]
3.371 -
3.372 -def translate_cell(cellsep, text):
3.373 -
3.374 - "Using 'cellsep', translate the cell 'text'."
3.375 -
3.376 - return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]
3.377 -
3.378 -sectiontypes = {
3.379 - "code" : "",
3.380 - "noformat" : "",
3.381 - "quote" : "",
3.382 - "info" : "wiki important",
3.383 - "note" : "wiki caution",
3.384 - "tip" : "wiki tip",
3.385 - "warning" : "wiki warning",
3.386 - }
3.387 -
3.388 -# XML dialect syntax parsing.
3.389 -
3.390 -tags = {
3.391 - # XHTML tag MoinMoin syntax
3.392 - "strong" : "'''%s'''",
3.393 - "em" : "''%s''",
3.394 - "u" : "__%s__",
3.395 - "del" : "--(%s)--",
3.396 - "sup" : "^%s^",
3.397 - "sub" : ",,%s,,",
3.398 - "code" : "`%s`",
3.399 - "pre" : "{{{%s}}}",
3.400 - "blockquote" : " %s",
3.401 - "small" : "~-%s-~",
3.402 - "big" : "~+%s+~",
3.403 - "p" : "%s",
3.404 - "ol" : "%s",
3.405 - "ul" : "%s",
3.406 - "ac:plain-text-body" : "{{{%s}}}",
3.407 - "ac:link" : "[[%s%s|%s]]",
3.408 - }
3.409 -
3.410 -for tag, translation in blocktypes.items():
3.411 - tags[tag] = translation
3.412 -
3.413 -simple_tags = {
3.414 - # XHTML tag MoinMoin syntax
3.415 - "br" : "<<BR>>",
3.416 - }
3.417 -
3.418 -list_tags = {
3.419 - # XHTML list tag MoinMoin list item syntax
3.420 - "ol" : "1. %s",
3.421 - "ul" : "* %s",
3.422 - }
3.423 -
3.424 -indented_tags = ["li", "p"]
3.425 -
3.426 -link_target_tags = {
3.427 - # Confluence element Attribute providing the target
3.428 - "ri:page" : "ri:content-title",
3.429 - "ri:attachment" : "ri:filename",
3.430 - "ri:user" : "ri:username",
3.431 - }
3.432 -
3.433 -macro_rich_text_styles = {
3.434 - # Confluence style MoinMoin admonition style
3.435 - "note" : "caution",
3.436 - "warning" : "warning",
3.437 - "info" : "important",
3.438 - "tip" : "tip",
3.439 - }
3.440 -
3.441 -normalise_regexp_str = r"\s+"
3.442 -normalise_regexp = re.compile(normalise_regexp_str)
3.443 -
3.444 -class ConfluenceXMLParser(Parser):
3.445 -
3.446 - "Handle content from Confluence 4 page revisions."
3.447 -
3.448 - def __init__(self, out):
3.449 - Parser.__init__(self)
3.450 - self.out = out
3.451 -
3.452 - # Link target information.
3.453 -
3.454 - self.target = None
3.455 - self.target_type = None
3.456 -
3.457 - # Macro information.
3.458 -
3.459 - self.macro = None
3.460 - self.macro_parameters = {}
3.461 -
3.462 - # Indentation and preformatted states.
3.463 -
3.464 - self.indent = 0
3.465 - self.states = {}
3.466 - for name in ("pre", "ac:plain-text-body"):
3.467 - self.states[name] = 0
3.468 -
3.469 - # ContentHandler-related methods.
3.470 -
3.471 - def startElement(self, name, attrs):
3.472 - if list_tags.has_key(name):
3.473 - self.indent += 1
3.474 - elif self.states.has_key(name):
3.475 - self.states[name] += 1
3.476 - Parser.startElement(self, name, attrs)
3.477 -
3.478 - def endElement(self, name):
3.479 - Parser.endElement(self, name)
3.480 - if list_tags.has_key(name):
3.481 - self.indent -= 1
3.482 - elif self.states.has_key(name):
3.483 - self.states[name] -= 1
3.484 -
3.485 - def characters(self, content):
3.486 - if not self.is_preformatted():
3.487 - content = self.normalise(content, self.elements[-1])
3.488 - Parser.characters(self, content)
3.489 -
3.490 - def skippedEntity(self, name):
3.491 - ch = htmlentitydefs.name2codepoint.get(name)
3.492 - if ch:
3.493 - self.text[-1].append(unichr(ch))
3.494 -
3.495 - # Parser-related methods.
3.496 -
3.497 - def handleElement(self, name):
3.498 - text = "".join(self.text[-1])
3.499 - conversion = None
3.500 -
3.501 - # Handle list elements.
3.502 -
3.503 - if name == "li" and len(self.elements) > 1:
3.504 - list_tag = self.elements[-2]
3.505 - conversion = list_tags.get(list_tag)
3.506 -
3.507 - # Remember link target information.
3.508 -
3.509 - elif link_target_tags.has_key(name):
3.510 - self.target = self.attributes[-1].get(link_target_tags[name])
3.511 - self.target_type = name
3.512 - text = ""
3.513 -
3.514 - # Remember macro information.
3.515 -
3.516 - elif name == "ac:parameter":
3.517 - self.macro_parameters[self.attributes[-1].get("ac:name")] = text
3.518 - text = ""
3.519 -
3.520 - elif name == "ac:macro":
3.521 - self.macro = self.attributes[-1].get("ac:name")
3.522 -
3.523 - # Handle the common case.
3.524 -
3.525 - else:
3.526 - conversion = tags.get(name)
3.527 -
3.528 - # Attempt to convert the text.
3.529 -
3.530 - # Links require target information.
3.531 - # NOTE: User links should support the intended user namespace prefix.
3.532 -
3.533 - if name == "ac:link":
3.534 - if self.target_type == "ri:attachment":
3.535 - prefix = "attachment:"
3.536 - elif self.target_type == "ri:user":
3.537 - prefix = ""
3.538 - else:
3.539 - prefix = "../"
3.540 -
3.541 - text = conversion % (prefix, self.target, text or self.target)
3.542 - self.target = self.target_type = None
3.543 -
3.544 - # Macro name information is used to style rich text body regions.
3.545 -
3.546 - elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
3.547 - details = macro_rich_text_styles[self.macro]
3.548 - title = self.macro_parameters.get("title")
3.549 - if title:
3.550 - details = "%s\n\n%s" % (details, title)
3.551 - text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
3.552 - self.macro = None
3.553 - self.macro_parameters = {}
3.554 -
3.555 - # Handle the common case.
3.556 -
3.557 - elif text and conversion:
3.558 - text = conversion % text
3.559 - elif simple_tags.has_key(name):
3.560 - text = simple_tags[name]
3.561 -
3.562 - # Normalise leading whitespace and indent the text if appropriate.
3.563 -
3.564 - if name in indented_tags:
3.565 - text = " " * self.indent + text.lstrip()
3.566 -
3.567 - # Add the converted text to the end of the parent element's text nodes.
3.568 -
3.569 - if len(self.text) > 1:
3.570 - nodes = self.text[-2]
3.571 - if "".join(self.text[-2]):
3.572 - parent = self.elements[-2]
3.573 - if parent == "body":
3.574 - nodes.append("\n\n")
3.575 - elif list_tags.has_key(parent):
3.576 - nodes.append("\n")
3.577 - elif list_tags.has_key(name) and parent == "li":
3.578 - nodes.append("\n")
3.579 - nodes.append(text)
3.580 -
3.581 - # Otherwise, emit the text.
3.582 -
3.583 - else:
3.584 - self.out.write(text)
3.585 -
3.586 - def is_preformatted(self):
3.587 - return reduce(operator.or_, self.states.values(), False)
3.588 -
3.589 - # Whitespace normalisation.
3.590 -
3.591 - def get_replacement(self, name):
3.592 - if name in ("html", "body") or list_tags.has_key(name):
3.593 - return ""
3.594 - else:
3.595 - return " "
3.596 -
3.597 - def normalise(self, text, name):
3.598 - return normalise_regexp.sub(self.get_replacement(name), text)
3.599 -
3.600 -def xmlparse(s, out):
3.601 -
3.602 - "Parse the content in the string 's', writing a translation to 'out'."
3.603 -
3.604 - # NOTE: CDATA sections appear to have erroneous endings.
3.605 -
3.606 - s = u"""\
3.607 -<?xml version="1.0"?>
3.608 -<!DOCTYPE html
3.609 - PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
3.610 - "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3.611 -<html xmlns="http://www.w3.org/1999/xhtml">
3.612 -<body>
3.613 -%s
3.614 -</body>
3.615 -</html>""" % s.replace("]] >", "]]>")
3.616 -
3.617 - f = StringIO(s.encode("utf-8"))
3.618 - try:
3.619 - parser = ConfluenceXMLParser(out)
3.620 - parser.parse(f)
3.621 - finally:
3.622 - f.close()
3.623 -
3.624 -# General parsing.
3.625 -
3.626 -def parse(s, out):
3.627 -
3.628 - "Parse the content in the string 's', writing a translation to 'out'."
3.629 -
3.630 - for type, text in get_regions(s):
3.631 -
3.632 - # Handle list, heading, blockquote or anonymous blocks.
3.633 -
3.634 - if type is None:
3.635 - for blocktype, blocktext in get_blocks(text):
3.636 -
3.637 - # Translate headings and blockquotes.
3.638 -
3.639 - if blocktypes.has_key(blocktype):
3.640 - print >>out, blocktypes[blocktype] % blocktext
3.641 -
3.642 - # Translate list items.
3.643 -
3.644 - elif blocktype == "list":
3.645 - for listmarker, listitem in get_list_items(blocktext):
3.646 - print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))
3.647 -
3.648 - # Translate table items.
3.649 -
3.650 - elif blocktype == "table":
3.651 - for cellsep, columns in get_table_rows(blocktext):
3.652 - moinsep = translate_cellsep(cellsep)
3.653 - print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep
3.654 -
3.655 - # Handle anonymous blocks.
3.656 -
3.657 - else:
3.658 - print >>out, translate_content(blocktext.rstrip())
3.659 -
3.660 - print >>out
3.661 -
3.662 - # Handle sections.
3.663 -
3.664 - else:
3.665 - sectiontype, options = type
3.666 -
3.667 - # Direct translations of sections.
3.668 -
3.669 - mointype = sectiontypes.get(sectiontype)
3.670 - if mointype:
3.671 - print >>out, "{{{#!%s" % mointype
3.672 - if options:
3.673 - print >>out, "##", options
3.674 - else:
3.675 - print >>out, "{{{",
3.676 - print >>out, translate_content(text, sectiontype),
3.677 - print >>out, "}}}"
3.678 - print >>out
3.679 -
3.680 -if __name__ == "__main__":
3.681 - s = sys.stdin.read()
3.682 - if "--xml" in sys.argv:
3.683 - xmlparse(s, sys.stdout)
3.684 - else:
3.685 - parse(s, sys.stdout)
3.686 -
3.687 -# vim: tabstop=4 expandtab shiftwidth=4
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/wikiparser.py Fri Feb 22 22:50:30 2013 +0100
4.3 @@ -0,0 +1,426 @@
4.4 +#!/usr/bin/env python
4.5 +
4.6 +"""
4.7 +Confluence Wiki syntax parsing.
4.8 +
4.9 +Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
4.10 +
4.11 +This software is free software; you can redistribute it and/or
4.12 +modify it under the terms of the GNU General Public License as
4.13 +published by the Free Software Foundation; either version 2 of
4.14 +the License, or (at your option) any later version.
4.15 +
4.16 +This software is distributed in the hope that it will be useful,
4.17 +but WITHOUT ANY WARRANTY; without even the implied warranty of
4.18 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
4.19 +GNU General Public License for more details.
4.20 +
4.21 +You should have received a copy of the GNU General Public
4.22 +License along with this library; see the file LICENCE.txt
4.23 +If not, write to the Free Software Foundation, Inc.,
4.24 +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
4.25 +
4.26 +--------
4.27 +
4.28 +The basic procedure is as follows:
4.29 +
4.30 + 1. Wiki pages are first split up into regions.
4.31 + 2. Then, within these regions, the text is split into blocks.
4.32 + 1. First, lists are identified.
4.33 + 2. Additionally, other block-like elements are identified.
4.34 + 3. Each block is then parsed.
4.35 +"""
4.36 +
4.37 +from common import *
4.38 +import re
4.39 +import sys
4.40 +
4.41 +# Section extraction.
4.42 +
4.43 +sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
4.44 +sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)
4.45 +
4.46 +def get_regions(s):
4.47 +
4.48 + """
4.49 + Return a list of regions from 's'. Each region is specified using a tuple of
4.50 + the form (type, text).
4.51 + """
4.52 +
4.53 + last = 0
4.54 + regions = []
4.55 + for match in sections_regexp.finditer(s):
4.56 + start, end = match.span()
4.57 + regions.append((None, s[last:start]))
4.58 + regions.append(get_section_details(s[start:end]))
4.59 + last = end
4.60 + regions.append((None, s[last:]))
4.61 + return regions
4.62 +
4.63 +# Section inspection.
4.64 +
4.65 +section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
4.66 +section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)
4.67 +
4.68 +def get_section_details(s):
4.69 +
4.70 + "Return the details of a section 's' in the form (type, text)."
4.71 +
4.72 + match = section_regexp.match(s)
4.73 + if match:
4.74 + return (match.group("sectiontype"), match.group("options")), match.group("section")
4.75 + else:
4.76 + return None, s
4.77 +
4.78 +# Heading, table and list extraction.
4.79 +
4.80 +list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
4.81 +table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
4.82 +blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
4.83 +
4.84 +blockelement_regexp = re.compile(
4.85 + "(" + list_regexp_str + ")"
4.86 + "|"
4.87 + "(" + table_regexp_str + ")"
4.88 + "|"
4.89 + "(" + blocktext_regexp_str + ")",
4.90 + re.MULTILINE
4.91 + )
4.92 +
4.93 +def get_block_elements(s):
4.94 +
4.95 + """
4.96 + Extract headings, tables and lists from the given string 's'.
4.97 + """
4.98 +
4.99 + last = 0
4.100 + blocks = []
4.101 + for match in blockelement_regexp.finditer(s):
4.102 + start, end = match.span()
4.103 + matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")
4.104 + blocks.append((None, s[last:start]))
4.105 + blocks.append((matchtype, match.group("text") or s[start:end]))
4.106 + last = end
4.107 + blocks.append((None, s[last:]))
4.108 + return blocks
4.109 +
4.110 +# Block extraction.
4.111 +
4.112 +block_regexp_str = r"^(?:\s*\n)+"
4.113 +block_regexp = re.compile(block_regexp_str, re.MULTILINE)
4.114 +
4.115 +def get_basic_blocks(s):
4.116 +
4.117 + """
4.118 + Return blocks from the given string 's' by splitting the text on blank lines
4.119 + and eliminating those lines.
4.120 + """
4.121 +
4.122 + return [b for b in block_regexp.split(s) if b.strip()]
4.123 +
4.124 +# Block inspection.
4.125 +
4.126 +def get_blocks(s):
4.127 +
4.128 + """
4.129 + Return blocks from the given string 's', inspecting the basic blocks and
4.130 + generating additional block-level text where appropriate.
4.131 + """
4.132 +
4.133 + blocks = []
4.134 +
4.135 + for blocktype, blocktext in get_block_elements(s):
4.136 +
4.137 + # Collect heading, list and table blocks.
4.138 +
4.139 + if blocktype is not None:
4.140 + blocks.append((blocktype, blocktext))
4.141 +
4.142 + # Attempt to find new subblocks in other regions.
4.143 +
4.144 + else:
4.145 + for block in get_basic_blocks(blocktext):
4.146 + blocks.append((None, block))
4.147 +
4.148 + return blocks
4.149 +
4.150 +# List item inspection.
4.151 +
4.152 +listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
4.153 +listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
4.154 +
4.155 +def get_list_items(text):
4.156 +
4.157 + "Return a list of (marker, text) tuples for the given list 'text'."
4.158 +
4.159 + items = []
4.160 +
4.161 + for match in listitem_regexp.finditer(text):
4.162 + items.append((match.group("marker"), match.group("text")))
4.163 +
4.164 + return items
4.165 +
4.166 +# Table row inspection.
4.167 +
4.168 +monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
4.169 +link_regexp_str = r"[[](?P<linktext>.*?)]"
4.170 +image_regexp_str = r"!(?P<imagetext>.*?)!"
4.171 +cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
4.172 +
4.173 +content_regexp_str = (
4.174 + "(" + monospace_regexp_str + ")"
4.175 + "|"
4.176 + "(" + link_regexp_str + ")"
4.177 + "|"
4.178 + "(" + image_regexp_str + ")"
4.179 + )
4.180 +
4.181 +table_content_regexp_str = (
4.182 + content_regexp_str +
4.183 + "|"
4.184 + "(" + cellsep_regexp_str + ")"
4.185 + )
4.186 +
4.187 +content_regexp = re.compile(content_regexp_str)
4.188 +table_content_regexp = re.compile(table_content_regexp_str)
4.189 +
4.190 +def translate_content_match(match):
4.191 +
4.192 + "Translate the content described by the given 'match', returning a string."
4.193 +
4.194 + if match.group("monotext"):
4.195 + return "{{{%s}}}" % match.group("monotext")
4.196 +
4.197 + elif match.group("linktext"):
4.198 + parts = match.group("linktext").split("|")
4.199 +
4.200 + # NOTE: Proper detection of external links required.
4.201 +
4.202 + if len(parts) == 1:
4.203 + label, target, title = None, parts[0], None
4.204 + elif len(parts) == 2:
4.205 + (label, target), title = parts, None
4.206 + else:
4.207 + label, target, title = parts
4.208 +
4.209 + target = target.strip()
4.210 +
4.211 + # Look for namespace links and rewrite them.
4.212 +
4.213 + if target.find(":") != -1:
4.214 + prefix = ""
4.215 + space, rest = target.split(":", 1)
4.216 + if space not in URL_SCHEMES:
4.217 + target = "%s/%s" % (space, rest)
4.218 +
4.219 + # Detect anchors.
4.220 +
4.221 + elif target.startswith("#"):
4.222 + prefix = ""
4.223 +
4.224 + # Detect attachments.
4.225 +
4.226 + elif target.startswith("^"):
4.227 + prefix = "attachment:"
4.228 +
4.229 + # Link to other pages within a space.
4.230 +
4.231 + else:
4.232 + prefix = "../"
4.233 +
4.234 + # Make the link tidier by making a target if none was given.
4.235 +
4.236 + if not label:
4.237 + label = target
4.238 +
4.239 + if not label and not title:
4.240 + return "[[%s%s]]" % (prefix, target)
4.241 + elif not title:
4.242 + return "[[%s%s|%s]]" % (prefix, target, label)
4.243 + else:
4.244 + return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)
4.245 +
4.246 + elif match.group("imagetext"):
4.247 + parts = match.group("imagetext").split("|")
4.248 +
4.249 + # NOTE: Proper detection of external links required.
4.250 +
4.251 + if parts[0].startswith("http"):
4.252 + prefix = ""
4.253 + else:
4.254 + prefix = "attachment:"
4.255 +
4.256 + # NOTE: Proper options conversion required.
4.257 +
4.258 + if len(parts) == 1:
4.259 + return "{{%s%s}}" % (prefix, parts[0])
4.260 + else:
4.261 + return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])
4.262 +
4.263 + else:
4.264 + return match.group()
4.265 +
4.266 +def get_table_rows(text):
4.267 +
4.268 + "Return a list of (cellsep, columns) tuples for the given table 'text'."
4.269 +
4.270 + rows = []
4.271 +
4.272 + for line in text.split("\n"):
4.273 + cellsep = None
4.274 + columns = [""]
4.275 + last = 0
4.276 + for match in table_content_regexp.finditer(line):
4.277 + start, end = match.span()
4.278 + columns[-1] += line[last:start]
4.279 +
4.280 + if match.group("celltype"):
4.281 + if cellsep is None:
4.282 + cellsep = match.group("celltype")
4.283 + columns.append("")
4.284 + else:
4.285 + columns[-1] += match.group()
4.286 +
4.287 + last = end
4.288 +
4.289 + columns[-1] += line[last:]
4.290 +
4.291 + if cellsep:
4.292 + rows.append((cellsep, columns[1:-1]))
4.293 +
4.294 + return rows
4.295 +
def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    output = []
    pos = 0

    for m in content_regexp.finditer(text):
        begin, finish = m.span()
        output.append(text[pos:begin])

        # Code and preformatted sections pass their markup through verbatim;
        # everything else is translated match by match.

        unformatted = sectiontype in ("code", "noformat")
        output.append(m.group() if unformatted else translate_content_match(m))

        pos = finish

    output.append(text[pos:])
    return "".join(output)
4.322 +
4.323 +# Translation helpers.
4.324 +
# Confluence list markers mapped to MoinMoin list item markers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # The marker length determines the Moin indentation depth; the final
    # marker character selects the bullet or numbering style.

    indent = len(marker) * " "
    return indent + markers[marker[-1]]
4.336 +
# Confluence cell separators mapped to the Moin cell separator.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Extra markup wrapped around cell content: Confluence header cells ("||")
# become bold cells in Moin.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]
4.352 +
def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    # Header cells gain surrounding emphasis markup; plain cells gain none.

    extra = cellextra[cellsep]
    return "%s%s%s" % (extra, translate_content(text), extra)
4.358 +
# Confluence section types mapped to MoinMoin parser specifications, used
# after "{{{#!" when opening a section. An empty value means the section is
# emitted as a plain "{{{...}}}" region without a parser line.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }
4.368 +
4.369 +# General parsing.
4.370 +
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    Regions are obtained from get_regions: plain wiki text regions carry a
    type of None, whereas sections carry a (sectiontype, options) tuple as
    their type.
    """

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes using the patterns from
                # the blocktypes table.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items, emitting each row in Moin's
                # separator-delimited form.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

                print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections: a Moin parser line is emitted
            # for section types with a non-empty sectiontypes entry.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                # The trailing commas on the following print statements
                # suppress the newline (Python 2 print semantics), keeping
                # the braces adjacent to the section content.
                print >>out, "{{{",
            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out
4.424 +
if __name__ == "__main__":
    # Filter mode: translate standard input to standard output.
    parse(sys.stdin.read(), sys.stdout)
4.428 +
4.429 +# vim: tabstop=4 expandtab shiftwidth=4
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
5.2 +++ b/xmlparser.py Fri Feb 22 22:50:30 2013 +0100
5.3 @@ -0,0 +1,276 @@
5.4 +#!/usr/bin/env python
5.5 +
5.6 +"""
5.7 +Confluence Wiki XML/XHTML syntax parsing.
5.8 +
5.9 +Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
5.10 +
5.11 +This software is free software; you can redistribute it and/or
5.12 +modify it under the terms of the GNU General Public License as
5.13 +published by the Free Software Foundation; either version 2 of
5.14 +the License, or (at your option) any later version.
5.15 +
5.16 +This software is distributed in the hope that it will be useful,
5.17 +but WITHOUT ANY WARRANTY; without even the implied warranty of
5.18 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
5.19 +GNU General Public License for more details.
5.20 +
5.21 +You should have received a copy of the GNU General Public
5.22 +License along with this library; see the file LICENCE.txt
5.23 +If not, write to the Free Software Foundation, Inc.,
5.24 +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
5.25 +"""
5.26 +
5.27 +try:
5.28 + from cStringIO import StringIO
5.29 +except ImportError:
5.30 + from StringIO import StringIO
5.31 +
5.32 +from common import *
5.33 +from xmlread import Parser
5.34 +import re
5.35 +import sys
5.36 +import operator
5.37 +import htmlentitydefs
5.38 +
5.39 +# XML dialect syntax parsing.
5.40 +
# Element names mapped to Moin syntax patterns; the element's accumulated
# text is substituted into the %s placeholders.

tags = {
    # XHTML tag          MoinMoin syntax
    "strong" : "'''%s'''",
    "em" : "''%s''",
    "u" : "__%s__",
    "del" : "--(%s)--",
    "sup" : "^%s^",
    "sub" : ",,%s,,",
    "code" : "`%s`",
    "pre" : "{{{%s}}}",
    "blockquote" : " %s",
    "small" : "~-%s-~",
    "big" : "~+%s+~",
    "p" : "%s",
    "ol" : "%s",
    "ul" : "%s",
    "ac:plain-text-body" : "{{{%s}}}",
    "ac:link" : "[[%s%s|%s]]",
    }

# Headings and blockquotes reuse the translations defined in common.py.

for tag, translation in blocktypes.items():
    tags[tag] = translation

# Empty elements translated to a fixed piece of Moin syntax.

simple_tags = {
    # XHTML tag          MoinMoin syntax
    "br" : "<<BR>>",
    }

# List container elements mapped to Moin list item patterns.

list_tags = {
    # XHTML list tag     MoinMoin list item syntax
    "ol" : "1. %s",
    "ul" : "* %s",
    }

# Elements whose text is indented according to the list nesting depth.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element Attribute providing the target
    "ri:page" : "ri:content-title",
    "ri:attachment" : "ri:filename",
    "ri:user" : "ri:username",
    }

macro_rich_text_styles = {
    # Confluence style   MoinMoin admonition style
    "note" : "caution",
    "warning" : "warning",
    "info" : "important",
    "tip" : "tip",
    }

# Pattern matching runs of whitespace for normalisation.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
5.94 +
class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        "Initialise the parser, writing the translation to the 'out' stream."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and preformatted states. Each state is a counter
        # incremented on entry to the named element and decremented on exit.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):
        # Track list nesting depth and entry into preformatted regions
        # before delegating to the base class.
        if list_tags.has_key(name):
            self.indent += 1
        elif self.states.has_key(name):
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):
        Parser.endElement(self, name)
        # Reverse the bookkeeping performed in startElement.
        if list_tags.has_key(name):
            self.indent -= 1
        elif self.states.has_key(name):
            self.states[name] -= 1

    def characters(self, content):
        # Outside preformatted regions, collapse whitespace runs before
        # accumulating the text.
        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):
        # Convert named HTML entities to their Unicode characters, appending
        # them to the current element's text. Unknown entities are dropped.
        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the completed element with the given 'name' to its Moin
        representation, appending the result to the parent element's text
        nodes or, at the top level, writing it to the output stream.

        NOTE(review): self.text, self.elements and self.attributes are stacks
        maintained by the base Parser class (xmlread) — confirm against that
        module.
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            elif self.target_type == "ri:user":
                prefix = ""
            else:
                prefix = "../"

            # The target doubles as the label when the link has no text.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text
        # nodes, separating it from existing sibling text: blank lines
        # between top-level blocks, single newlines between list items and
        # between a nested list and its parent item.

        if len(self.text) > 1:
            nodes = self.text[-2]
            if "".join(self.text[-2]):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif list_tags.has_key(parent):
                    nodes.append("\n")
                elif list_tags.has_key(name) and parent == "li":
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return reduce(operator.or_, self.states.values(), False)

    # Whitespace normalisation.

    def get_replacement(self, name):

        # Purely structural elements contribute no whitespace of their own;
        # elsewhere whitespace runs collapse to a single space.

        if name in ("html", "body") or list_tags.has_key(name):
            return ""
        else:
            return " "

    def normalise(self, text, name):

        "Normalise whitespace in 'text' belonging to the element 'name'."

        return normalise_regexp.sub(self.get_replacement(name), text)
5.250 +
def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    content = s.replace("]] >", "]]>")

    # Wrap the fragment in an XHTML document so that a well-formed tree is
    # presented to the parser.

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % content

    stream = StringIO(document.encode("utf-8"))
    try:
        ConfluenceXMLParser(out).parse(stream)
    finally:
        stream.close()
5.274 +
if __name__ == "__main__":
    # Filter mode: translate standard input to standard output.
    parse(sys.stdin.read(), sys.stdout)
5.278 +
5.279 +# vim: tabstop=4 expandtab shiftwidth=4