ConfluenceConverter

Annotated xmlparser.py

106:3255ed8e2426
2013-07-18 Paul Boddie Added libxml2dom dependency note and updated the "to do" list.
paul@35 1
#!/usr/bin/env python
paul@35 2
paul@35 3
"""
paul@35 4
Confluence Wiki XML/XHTML syntax parsing.
paul@35 5
paul@35 6
Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@35 7
paul@35 8
This software is free software; you can redistribute it and/or
paul@35 9
modify it under the terms of the GNU General Public License as
paul@35 10
published by the Free Software Foundation; either version 2 of
paul@35 11
the License, or (at your option) any later version.
paul@35 12
paul@35 13
This software is distributed in the hope that it will be useful,
paul@35 14
but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@35 15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
paul@35 16
GNU General Public License for more details.
paul@35 17
paul@35 18
You should have received a copy of the GNU General Public
paul@35 19
License along with this library; see the file LICENCE.txt.
paul@35 20
If not, write to the Free Software Foundation, Inc.,
paul@35 21
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
paul@35 22
"""
paul@35 23
paul@35 24
try:
paul@35 25
    from cStringIO import StringIO
paul@35 26
except ImportError:
paul@35 27
    from StringIO import StringIO
paul@35 28
paul@51 29
from MoinMoin import wikiutil
paul@35 30
from common import *
paul@35 31
from xmlread import Parser
paul@35 32
import re
paul@35 33
import sys
paul@35 34
import operator
paul@35 35
import htmlentitydefs
paul@41 36
import codecs
paul@35 37
paul@35 38
# XML dialect syntax parsing.
paul@35 39
paul@35 40
# Conversion tables. Each maps an XHTML/Confluence element name to a "%"
# format string producing the corresponding MoinMoin markup.

tags = {
    # XHTML tag               MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    # Links and images take four values: prefix, target, anchor, label.
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    # Conventional links take two values: target, label.
    "a"                     : "[[%s|%s]]",
    }

# Merge in the block type translations provided by the common module. Using
# update avoids leaving loop variables behind in the module namespace.

tags.update(blocktypes)

# Elements without content that are replaced by fixed text outside
# preformatted regions...

simple_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# ...and inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "\n",
    }

# The conversion applied to a list item depends on the enclosing list element.

list_tags = {
    # XHTML list tag          MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Regions emitted verbatim inside {{{...}}}.
preformatted_tags = ["pre", "ac:plain-text-body"]

# Inline styles that are only applied once, however deeply they are nested.
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]

# Regions emitted inside {{{...}}} with a "#!" format declaration.
formatted_tags    = ["ac:rich-text-body", "table"]

# Elements whose text is prefixed with the current indentation.
indented_tags = ["li", "p"] + preformatted_tags + formatted_tags

# Elements that always start a new line. The key collections are converted to
# lists explicitly so that the concatenation does not rely on keys() returning
# a list.
block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())

# Inline elements that do not inherit a pending block separator.
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element      Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details  Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

# Attributes whose values may serve as the label of a link.
link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

macroargs = {
    # Confluence macro        Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro        MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "toc"                   : "<<TableOfContents>>",
    }

# Runs of whitespace are collapsed when normalising character data.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35 137
paul@35 138
class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions, translating the
    XML/XHTML markup into MoinMoin syntax written to an output stream.

    NOTE(review): the base Parser class (from xmlread, not shown here) is
    assumed to maintain the 'elements', 'attributes' and 'text' stacks used
    below, and to call handleElement for each element as it is completed,
    before its entries are removed from those stacks. Confirm against xmlread.
    """

    def __init__(self, out):

        """
        Initialise the parser, with 'out' being an object whose write method
        receives the translated text.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information: parallel stacks of macro names and of their
        # parameter dictionaries, plus anchors withheld until a heading or
        # link has been completed.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Record nesting and indentation state for the element with the given
        'name' and 'attrs' before passing the event to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held from earlier content do not belong to a new heading.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Complete the element with the given 'name', undoing the state changes
        made by startElement.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        # The remaining state is only discarded after the base class has
        # processed the element.

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Handle character data, collapsing whitespace unless the 'content'
        appears within a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Unrecognised entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            conversion = list_tags.get(self.elements[-2])

        # Remember link target information.

        elif name in link_target_tags:
            self._record_link_target(name)
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name == "ac:link-body":
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name given by the enclosing
        # element's attributes.

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: where the count for the tag exceeds one,
        # the text is passed through without additional markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:
            conversion = self._region_conversion(name, text)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information. A missing target is treated as an
        # empty one so that the text "None" never appears in the output.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""
            if anchor:
                anchor = "#%s" % anchor
            target = self.target or ""
            label = self.label or text.strip() or target
            text = conversion % (prefix, target, anchor, label)
            self.target = self.target_type = self.label = None

        elif name == "a":
            target = self.target or ""
            text = conversion % (target, self.label or target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macros[-1])
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])
                if argnames:

                    # NOTE: A KeyError is raised here if the macro does not
                    # NOTE: provide the expected parameter.

                    confargname, moinargname = argnames
                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[-1][confargname]))
                text = conversion % parameters

                # Anchors inside headings and links are withheld.

                if self.macros[-1] == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        self._emit(name, text)

    def _record_link_target(self, name):

        "Set the link target, type and label from the attributes of 'name'."

        attributes = self.attributes[-1]
        target_details = []

        # Get target details from the element's attributes.

        for attrname in link_target_tags[name]:
            attrvalue = attributes.get(attrname)
            if not attrvalue:
                continue

            # Obtain a link label.

            if attrname in link_label_attributes and not self.label:
                self.label = attrvalue

            # Validate any page title.

            if attrname == "ri:content-title":
                attrvalue = get_page_title(attrvalue)
            target_details.append(attrvalue)

            # Insert any prefix required for the link.

            prefix = link_target_prefixes.get(attrname)
            if prefix:
                target_details.insert(0, prefix)

        # Make a link based on the details.

        self.target = u"/".join(target_details)
        self.target_type = name

    def _region_conversion(self, name, text):

        "Return the format string wrapping 'text' for the region 'name'."

        # Nest the section appropriately: the number of braces is three plus
        # the difference between the greatest level recorded and the current
        # level.

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level

        # Macro name information is used to style rich text body regions.

        if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
            details = macro_rich_text_styles[self.macros[-1]]
            title = self.macro_parameters[-1].get("title")
            if title:
                details = "%s\n\n%s" % (details, title)

            # The result is used as a format string, so any percent signs in
            # the details (from the title) must be escaped.

            details = details.replace("%", "%%")
            return "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

        if name == "table":
            return "%s#!table\n%%s\n%s" % (opening, closing)

        # Preformatted sections containing newlines must contain an initial
        # newline.

        if "\n" in text and not text.startswith("\n"):
            opening += "\n"

        return "%s%%s%s" % (opening, closing)

    def _emit(self, name, text):

        "Add the converted 'text' for 'name' to the parent, or write it out."

        # Emit the text at the top level of the document.

        if len(self.text) <= 1:
            self.out.write(text)
            return

        # Otherwise, add the converted text to the end of the parent element's
        # text nodes.

        nodes = self.text[-2]
        parent = self.elements[-2]

        # Where preceding text exists, add any blank line separators.

        if u"".join(nodes):

            # All top-level elements are separated with blank lines.

            if parent == "body":
                nodes.append("\n")

            # Block elements always cause a new line to be started.

            if name in block_tags or (self.have_block and name not in span_override_tags):
                nodes.append("\n")

            self.have_block = False

        # Lists inside lists require separation.

        elif name in list_tags and parent == "li":
            nodes.append("\n")

        # Without preceding text, save any block node state for non-block
        # elements so that newline separators can be added at another
        # level.

        elif name in block_tags and parent not in block_tags:
            self.have_block = True

        elif name not in block_tags and self.have_block and name not in span_override_tags:
            self.have_block = True

        else:
            self.have_block = False

        nodes.append(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        for tag in preformatted_tags:
            if self.states[tag]:
                return True
        return False

    def forbids_macros(self):

        "Return whether the current element is within a heading or a link."

        for tag in self.elements:
            if tag in headings or tag == "a":
                return True
        return False

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace within the element having
        the given 'name': nothing for purely structural elements, a single
        space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        "Return 'text' with whitespace collapsed as appropriate for 'name'."

        return normalise_regexp.sub(self.get_replacement(name), text)
paul@35 513
paul@35 514
def parse(s, out):

    """
    Parse the content in the Unicode string 's', writing a translation to
    'out'. The content is wrapped in an XHTML document before being encoded as
    UTF-8 and passed to a ConfluenceXMLParser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    content = s.replace("]] >", "]]>")

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % content

    stream = StringIO(document.encode("utf-8"))

    # Always release the stream, even if parsing fails.

    try:
        ConfluenceXMLParser(out).parse(stream)
    finally:
        stream.close()
paul@35 537
paul@35 538
if __name__ == "__main__":

    # Translate UTF-8 encoded content from standard input, writing the result
    # as UTF-8 to standard output.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    parse(s, codecs.getwriter("utf-8")(sys.stdout))
paul@35 542
paul@35 543
# vim: tabstop=4 expandtab shiftwidth=4