ConfluenceConverter

Annotated xmlparser.py

145:90c4ddc8afb6
2017-06-16 Paul Boddie Added some resources describing Confluence storage and format representations.
paul@35 1
#!/usr/bin/env python
paul@35 2
paul@35 3
"""
paul@35 4
Confluence Wiki XML/XHTML syntax parsing.
paul@35 5
paul@144 6
Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>
paul@35 7
paul@35 8
This software is free software; you can redistribute it and/or
paul@35 9
modify it under the terms of the GNU General Public License as
paul@35 10
published by the Free Software Foundation; either version 2 of
paul@35 11
the License, or (at your option) any later version.
paul@35 12
paul@35 13
This software is distributed in the hope that it will be useful,
paul@35 14
but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@35 15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
paul@35 16
GNU General Public License for more details.
paul@35 17
paul@35 18
You should have received a copy of the GNU General Public
paul@35 19
License along with this library; see the file LICENCE.txt
paul@35 20
If not, write to the Free Software Foundation, Inc.,
paul@35 21
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
paul@35 22
"""
paul@35 23
paul@35 24
try:
paul@35 25
    from cStringIO import StringIO
paul@35 26
except ImportError:
paul@35 27
    from StringIO import StringIO
paul@35 28
paul@51 29
from MoinMoin import wikiutil
paul@35 30
from common import *
paul@35 31
from xmlread import Parser
paul@35 32
import re
paul@35 33
import sys
paul@35 34
import operator
paul@35 35
import htmlentitydefs
paul@41 36
import codecs
paul@35 37
paul@35 38
# XML dialect syntax parsing.
paul@35 39
paul@35 40
# Conversion table for elements whose content is wrapped in MoinMoin markup.
# Most entries take a single "%s" (the element's converted text). The
# "ac:link" and "ac:image" entries are filled with (prefix, target, anchor,
# label) and the "a" entry with (target, label) in handleElement.

tags = {
    # XHTML tag               MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Merge in the block type conversions; 'blocktypes' is not defined in this
# file (presumably provided by the 'common' module's star import). Entries
# with the same key replace those defined above.

for tag, translation in blocktypes.items():
    tags[tag] = translation

# Empty elements replaced by fixed text outside preformatted regions.

simple_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# Empty elements replaced by fixed text inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "\n",
    }

# Conversions applied to "li" elements, selected by the enclosing list element.

list_tags = {
    # XHTML list tag          MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Element classifications:
# preformatted_tags  - content is not whitespace-normalised; emitted in braces
# single_level_tags  - nested occurrences of the same tag are passed through
#                      without repeating the markup
# formatted_tags     - regions emitted as "#!wiki" or "#!table" sections

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags    = ["ac:rich-text-body", "table"]

# indented_tags      - text is left-stripped and indented to the current level
# block_tags         - elements that start a new line when text precedes them
# span_override_tags - elements exempt from the newline forced by a pending
#                      block (see the use of 'have_block' in handleElement)

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + blocktypes.keys() + list_tags.keys()
span_override_tags = ["ac:link"]

# Resource identifier elements and the attributes, in order, whose values are
# joined with "/" to form a link target.

link_target_tags = {
    # Confluence element      Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

# Path components inserted at the front of the target for each attribute that
# is present (inserted twice for comment pages).

link_target_prefixes = {
    # Attribute with details  Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

# Attribute names whose values may be used as a link label when no label has
# been recorded yet.

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

# Macros whose rich text body becomes a "#!wiki" section with the given style.

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

# For macros taking an argument: the Confluence parameter name to read and the
# MoinMoin argument name to emit as "name=value".

macroargs = {
    # Confluence macro        Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

# Templates for macros converted directly; these are filled from a dictionary
# holding "content", the recorded macro parameters and any "args" value.

macrotypes = {
    # Confluence macro        MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "toc"                   : "<<TableOfContents>>",
    }

# Pattern matching runs of whitespace, used when normalising character data.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35 137
paul@35 138
class ConfluenceXMLParser(Parser):
paul@35 139
paul@35 140
    "Handle content from Confluence 4 page revisions."
paul@35 141
paul@142 142
    def __init__(self, out, is_comment_page=False):

        """
        Initialise the parser, writing converted text to 'out'. Where
        'is_comment_page' is set to a true value, relative link targets are
        given an additional parent prefix.
        """

        Parser.__init__(self)
        self.out = out
        self.is_comment_page = is_comment_page

        # Details of the link currently being assembled.

        self.target = self.target_type = self.label = None

        # Stacks of enclosing macro names and their parameters, plus anchors
        # held back for emission with a heading.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation stack, region nesting depth and per-element counters.

        self.indents = [0]
        self.level = 0
        self.max_level = 0
        self.states = dict.fromkeys(preformatted_tags + single_level_tags, 0)

        # Row and column counters for the table being processed.

        self.table_rows = 0
        self.table_columns = 0

        # Whether a block element is awaiting a newline separator.

        self.have_block = False
paul@56 176
paul@35 177
    # ContentHandler-related methods.
paul@35 178
paul@35 179
    def startElement(self, name, attrs):

        """
        Update indentation, nesting and macro state for the opening of the
        element with the given 'name' and 'attrs', delegating the actual
        element handling to the base parser.
        """

        # Each list level indents one step further than its container.

        if name in list_tags:
            deeper = self.indents[-1] + 1
            self.indents.append(deeper)

        # Count occurrences of elements whose nesting is tracked.

        if name in self.states:
            self.states[name] += 1

        # Regions contribute to the cumulative nesting depth used to choose
        # the number of braces in the output, and restart indentation at zero.

        opens_region = name in preformatted_tags or name in formatted_tags

        if opens_region:
            self.level += 1
            if self.level > self.max_level:
                self.max_level = self.level
            self.indents.append(0)

        # Anchors held for a previous heading do not carry over to a new one.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Record the macro name and an empty parameter collection for use by
        # elements inside the macro.

        if name in ("ac:macro", "ac:structured-macro"):
            macro_name = self.attributes[-1].get("ac:name")
            self.macros.append(macro_name)
            self.macro_parameters.append({})
paul@51 212
paul@35 213
    def endElement(self, name):

        """
        Undo the state changes made for the element with the given 'name'
        around the base parser's handling of the element's completion.
        """

        closes_region = name in preformatted_tags or name in formatted_tags

        # Drop the indent established inside a region before the base parser
        # handles the element, so that the region itself may be indented.

        if closes_region:
            self.indents.pop()

        Parser.endElement(self, name)

        # Leave the list's indentation level.

        if name in list_tags:
            self.indents.pop()

        # Reduce the nesting count of tracked elements.

        if name in self.states:
            self.states[name] -= 1

        # Leave the region, forgetting the maximum depth once no region
        # remains open.

        if closes_region:
            self.level -= 1
            if self.level == 0:
                self.max_level = 0

        # Discard macro state.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.pop()
            self.macro_parameters.pop()
paul@63 239
paul@35 240
    def characters(self, content):

        """
        Pass 'content' to the base parser, normalising whitespace according to
        the current element unless inside a preformatted region.
        """

        if self.is_preformatted():
            Parser.characters(self, content)
        else:
            current = self.elements[-1]
            Parser.characters(self, self.normalise(content, current))
paul@35 244
paul@35 245
    def skippedEntity(self, name):

        """
        Append the character for the HTML entity with the given 'name' to the
        current element's text. Entities without a known code point are
        ignored.
        """

        codepoint = htmlentitydefs.name2codepoint.get(name)
        if not codepoint:
            return
        self.text[-1].append(unichr(codepoint))
paul@35 249
paul@35 250
    # Parser-related methods.
paul@35 251
paul@35 252
    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.

        The element's accumulated text is converted using the module-level
        conversion tables, is given any required separators and indentation,
        and is then appended to the parent element's text nodes or, where no
        parent text collection exists, written to the output stream.
        """

        # Combine the text fragments collected for this element.

        text = u"".join(self.text[-1])

        # Handle state: reset the row counter at the end of a table and the
        # column counter at the end of a row.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements: the enclosing element selects the item syntax.
        # Where the enclosing element is not a list, no conversion is found
        # here and the common table is consulted below.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label, using the unmodified attribute value
                    # and only where no label has been recorded already.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)
                    target_details.append(attrvalue)

                    # Insert any prefix required for the link. Comment pages
                    # receive the prefix twice.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                        if self.is_comment_page:
                            target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name in ("ac:link-body", "ac:plain-text-link-body"):
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.
        # NOTE(review): a missing href leaves the target as None, which is
        # later formatted as the text "None" - confirm whether this can occur.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information, storing the parameter text under the
        # parameter's own name.
        # NOTE(review): this indexes macro_parameters[-1] and so assumes that
        # parameters only appear inside macro elements.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name given by the enclosing
        # element's attributes (presumably those of the macro itself).

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: where the same tag is already open more
        # than once, pass the text through without repeating the markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately: at least three braces are used,
            # with shallower regions receiving more braces than the regions
            # nested within them.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and macro_rich_text_styles.has_key(self.macros[-1]):
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The doubled percent sign leaves a "%s" placeholder for the
                # region's text in the resulting conversion.

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)



        # Attempt to convert the text.

        # Links require target information.
        # NOTE(review): where no target has been recorded, self.target is None
        # and is formatted as the text "None" - confirm whether links without
        # any resource identifier or link body can occur.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""
            label = self.label or text.strip() or self.target
            text = conversion % (prefix, self.target, anchor and ("#%s" % anchor) or "", label)

            # Reset the link state for any following link.

            self.target = self.target_type = self.label = None

        elif name == "a":
            text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name in ("ac:macro", "ac:structured-macro"):
            conversion = macrotypes.get(self.macros[-1])

            # Produce the converted macro.

            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])

                # NOTE(review): the lookup of the Confluence argument raises
                # KeyError where the macro did not supply that parameter.

                if argnames:
                    confargname, moinargname = argnames
                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[-1][confargname]))

                # Obtain the Moin macro with parameters substituted.
                # NOTE(review): templates naming a parameter that was not
                # recorded (such as "anchor") raise KeyError here.

                text = conversion % parameters

                # Anchors cannot be emitted where macros are forbidden, so
                # they are held for emission alongside an enclosing heading.

                if self.macros[-1] == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

            # Warn about macros that are not converted. Macros styled via
            # their rich text body are not reported.

            elif not macro_rich_text_styles.has_key(self.macros[-1]):
                print >>sys.stderr, "No conversion possible for macro", self.macros[-1]
                print >>sys.stderr, "Macro has arguments", self.macro_parameters[-1]
                print >>sys.stderr

        # Handle the common cases for parameterised and unparameterised
        # substitutions. Elements with empty text are only converted where a
        # fixed replacement is defined for them.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name) and not self.is_preformatted():
            text = simple_tags[name]
        elif simple_preformatted_tags.has_key(name) and self.is_preformatted():
            text = simple_preformatted_tags[name]



        # Postprocess table columns and rows: every cell after the first in a
        # row and every row after the first in a table is given a separator.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings, placing the anchors on the
        # line before the heading.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)



        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.
                # NOTE(review): "and" binds more tightly than "or" here, so
                # the span override only applies to the pending block test.

                if name in block_tags or self.have_block and name not in span_override_tags:
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif list_tags.has_key(name) and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)
paul@35 512
paul@35 513
    def is_preformatted(self):

        """
        Return a true value if any preformatted element is currently open,
        this being the bitwise combination of the nesting counts recorded for
        the preformatted tags.
        """

        combined = False
        for tag in preformatted_tags:
            combined = combined | self.states[tag]
        return combined
paul@35 515
paul@71 516
    def forbids_macros(self):

        """
        Return whether any currently open element is a heading or a
        conventional link.
        """

        return any(tag in headings or tag == "a" for tag in self.elements)
paul@71 518
paul@35 519
    # Whitespace normalisation.
paul@35 520
paul@35 521
    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace found directly within the
        element with the given 'name': nothing for structural and list
        elements, a single space for all other elements.
        """

        structural = name in ("html", "body", "table", "tbody", "tr")
        if structural or name in list_tags:
            return ""
        return " "
paul@35 526
paul@35 527
    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced by the replacement
        appropriate for the element with the given 'name'.
        """

        replacement = self.get_replacement(name)
        return normalise_regexp.sub(replacement, text)
paul@35 529
paul@142 530
def parse(s, out, is_comment_page=False):

    """
    Parse the content in the Unicode string 's', writing a translation to
    'out'. The 'is_comment_page' flag is passed on to the parser.
    """

    # NOTE: CDATA sections appear to have erroneous endings, so these are
    # NOTE: corrected before the content is wrapped in a complete document.

    content = s.replace("]] >", "]]>")

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % content

    # The parser is given the document as a stream of UTF-8 encoded bytes.

    f = StringIO(document.encode("utf-8"))
    try:
        ConfluenceXMLParser(out, is_comment_page).parse(f)
    finally:
        f.close()
paul@35 553
paul@35 554
if __name__ == "__main__":

    # Read UTF-8 encoded content from standard input and write the UTF-8
    # encoded translation to standard output.

    reader = codecs.getreader("utf-8")(sys.stdin)
    content = reader.read()
    writer = codecs.getwriter("utf-8")(sys.stdout)
    parse(content, writer)
paul@35 558
paul@35 559
# vim: tabstop=4 expandtab shiftwidth=4