ConfluenceConverter

Annotated convert.py

53:4ecf97af8a76
2013-04-09 Paul Boddie Handle page renaming to a reasonable extent, although comments and attachments may not appear on particular versions of renamed pages. Added an item about inter-space linking to the "to do" list.
paul@0 1
#!/usr/bin/env python
paul@0 2
paul@8 3
"""
paul@8 4
Confluence XML dump conversion to a MoinMoin-compatible representation.
paul@8 5
paul@33 6
Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@8 7
paul@8 8
This software is free software; you can redistribute it and/or
paul@8 9
modify it under the terms of the GNU General Public License as
paul@8 10
published by the Free Software Foundation; either version 2 of
paul@8 11
the License, or (at your option) any later version.
paul@8 12
paul@8 13
This software is distributed in the hope that it will be useful,
paul@8 14
but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8 15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
paul@8 16
GNU General Public License for more details.
paul@8 17
paul@8 18
You should have received a copy of the GNU General Public
paul@8 19
License along with this library; see the file LICENCE.txt
paul@8 20
If not, write to the Free Software Foundation, Inc.,
paul@8 21
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
paul@8 22
"""
paul@8 23
paul@40 24
from os import chdir, getcwd, listdir, mkdir, makedirs, walk
paul@40 25
from os.path import exists, extsep, join, split, splitext
paul@0 26
from zipfile import ZipFile
paul@0 27
from cStringIO import StringIO
paul@40 28
from MoinMoin import wikiutil
paul@0 29
import codecs
paul@0 30
import xmlread
paul@35 31
import wikiparser, xmlparser
paul@25 32
import sys
paul@0 33
paul@23 34
# Maximum number of characters retained from a page title; longer titles are
# truncated to this length in order to avoid filesystem issues.
MAX_TITLE_LENGTH = 120
paul@23 35
paul@0 36
class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The handler methods are registered with the XML parser for the "object",
    "property", "id", "collection" and "element" elements. Properties and
    collections are gathered in the 'content' dictionary until the enclosing
    object is complete, at which point page, revision, comment and attachment
    details are written beneath the directory named after the space.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', this being used both as
        the output directory and as the prefix for page titles. If
        'no_translate' is set to a true value, page bodies are written out
        without being translated.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
        <id name="id">...</id>
        ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        Once the object has been handled, the content dictionary is replaced
        with an empty one in readiness for the next object.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # Handle particular types, ignoring all others.

        if objecttype in ("Page", "Comment", "BlogPost"):
            self._handle_page(objecttype, identifier)

        elif objecttype == "BodyContent":
            self._handle_body()

        elif objecttype == "Attachment":
            self._handle_attachment(identifier)

        self.content = {}

    def _handle_page(self, objecttype, identifier):

        """
        Record the revision having the given 'identifier' of a page, comment or
        blog post (as indicated by 'objecttype') in the manifest of the page
        concerned, also noting title, child page and comment details.
        """

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle pages and revisions, adding revisions to the page manifest.
        # The original version is used as a unifying identifier for all the
        # different revisions (each of which being defined by a Page
        # element). Although "original" implies the first identifier used,
        # it actually appears to be the latest and will have the highest
        # version number.

        if "originalVersion" in content:
            pageid = content["originalVersion"]
        else:
            pageid = identifier

        versionfile = join(versions_dir, identifier)

        # Note page metadata, not necessarily in the correct order.
        # For comments, the title will need to be rewritten, since they
        # should be defined in terms of their owner page.

        # NOTE: This only makes the current title available to comments.

        mkdirs(join(pages_dir, pageid))

        # Limit the title to a "safe" number of characters in order to avoid
        # filesystem issues.

        title = content["title"][:MAX_TITLE_LENGTH]

        if title:
            title = "%s/%s" % (self.space, title)
            write(join(pages_dir, pageid, "pagetitle"), title)

        # See sort_manifest for access to this data.

        append(join(pages_dir, pageid, "manifest"),
            "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment
                content["version"],
                versionfile,
                title, # comment titles will incorporate the comment's position
                content["lastModifierName"],
                content["versionComment"]
            ))

        # Add information to parent pages for child page lists.

        if "parent" in content:
            parentid = content["parent"]
            mkdirs(join(pages_dir, parentid))
            append(join(pages_dir, parentid, "children"), title + "\n")

        # Add creation details for comments to the owner page.
        # Since comments can be versioned, the date of the original version
        # is used, and only this "original" version has the owner property.

        if objecttype == "Comment" and "owner" in content:
            ownerid = content["owner"]
            mkdirs(join(pages_dir, ownerid))
            append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

        # Some metadata is not particularly relevant. For example,
        # ancestors, children, parent are navigation-related.

        # Other metadata could be added to the page content itself.
        # For example, labelling could be converted to categories.

    def _handle_body(self):

        """
        Write the body of a revision to the versions directory, translating it
        unless translation has been disabled for this handler.
        """

        content = self.content
        versions_dir = join(self.space, "versions")

        body = content["body"]
        if not body:
            body = "## Empty page."

        # NOTE: Very simple technique employed for guessing the format.

        # The handler's own setting is consulted, not a module-level name, so
        # that the class also works when not run from the main program.

        if self.no_translate:
            fn = write
        elif body.startswith("<"):
            fn = xmltranslate
        else:
            fn = translate

        # Report the revision involved in any failure before re-raising the
        # exception.

        try:
            fn(join(versions_dir, content["content"]), body)
        except:
            err = codecs.getwriter("utf-8")(sys.stderr)
            print >>err, "Error parsing", content["content"]
            raise

    def _handle_attachment(self, identifier):

        """
        Record the attachment version having the given 'identifier' in the
        attachments file of the page to which it belongs.
        """

        content = self.content
        pages_dir = join(self.space, "pages")

        pageid = content["content"]
        version = content["attachmentVersion"]

        if "originalVersion" in content:
            attachid = content["originalVersion"]
        else:
            attachid = identifier

        # Make sure that the page directory exists, as is done for parent and
        # owner pages, in case the page itself has not been handled yet.

        mkdirs(join(pages_dir, pageid))

        append(join(pages_dir, pageid, "attachments"),
            "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (
                version,
                # Have to "taint" archive filenames, although Moin will
                # probably handle package script filename tainting.
                wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                wikiutil.taintfilename(content["fileName"]),
                "", # pagename is substituted later
                content["lastModifierName"],
                content["comment"]
            ))

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, starting a new
        list of elements for the next collection.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection as (class, text) pairs."

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0 223
paul@0 224
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. Where the directory
    already exists, nothing is done. Any other failure to create it (such as a
    file occupying the path or insufficient permissions) is raised as OSError
    instead of being silently ignored.
    """

    # Imported here since the module's own imports do not provide this name.

    from os.path import isdir

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the error if the directory is actually present.

        if not isdir(name):
            raise
paul@0 232
paul@0 233
def append(filename, s):

    "Add the string 's' to the end of the file with the given 'filename'."

    write(filename, s, append=True)
paul@0 238
paul@0 239
def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's', encoding it as
    UTF-8. If the optional 'append' parameter is set to a true value, 's' will
    be appended to the file; otherwise, any existing content is replaced.
    """

    mode = "a" if append else "w"

    # The file is closed upon leaving the block, even if writing fails.

    with codecs.open(filename, mode, encoding="utf-8") as f:
        f.write(s)
paul@0 251
paul@9 252
def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents decoded from UTF-8.
    """

    # The file is closed upon leaving the block, even if reading fails.

    with codecs.open(filename, encoding="utf-8") as f:
        return f.read()
paul@3 264
paul@25 265
def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', preceded by a "#pragma page-filename" line recording 'filename'.

    The optional 'fn' is a callable accepting the body and the output stream
    and performing the translation. If omitted, wikiparser.parse is used.
    """

    fn = fn or wikiparser.parse

    with codecs.open(filename, "w", encoding="utf-8") as out:

        # Write the pragma using the stream directly instead of relying on the
        # print statement.

        out.write("#pragma page-filename %s\n" % filename)
        fn(body, out)
paul@11 280
paul@25 281
def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' as performed by xmlparser.parse.
    """

    translate(filename, body, fn=xmlparser.parse)
paul@25 283
paul@31 284
def sort_comments(pages_dir, pageid):

    """
    If the page identified by 'pageid' within 'pages_dir' has a "comments"
    file, put the recorded comments into chronological order and give each
    comment page a title made from the owner page's title and the comment's
    position in that order. The titles are written to a "pagetitle" file in
    each comment page's directory.
    """

    page_dir = join(pages_dir, pageid)
    comments_file = join(page_dir, "comments")

    # Pages without comments need no further attention.

    if not exists(comments_file):
        return

    owner_title = read(join(page_dir, "pagetitle"))

    # Each non-empty line provides a date and a comment identifier, with the
    # date appearing first and thus determining the order.

    records = sorted(entry.split("|") for entry in read(comments_file).split("\n") if entry)

    # Write the sorted comments list for testing purposes.

    write(comments_file, "\n".join("|".join(record) for record in records))

    # Make the comments subpages of the owner page by labelling each of them
    # with the owner's title and a zero-padded position.

    position = 0

    for _created, commentid in records:
        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (owner_title, position))
        position += 1
paul@31 316
paul@40 317
def _sort_manifest(manifest, title):
paul@40 318
paul@40 319
    """
paul@40 320
    Open the given 'manifest' and sort it according to revision so that it will
paul@40 321
    be added to MoinMoin in the correct order.
paul@40 322
paul@40 323
    If a 'title' is provided, the title column in the manifest will be augmented
paul@40 324
    with that information. This is typically done for comments and is necessary
paul@40 325
    for attachments.
paul@40 326
paul@40 327
    A list of manifest entries is returned.
paul@40 328
    """
paul@40 329
paul@40 330
    f = codecs.open(manifest, "r", encoding="utf-8")
paul@40 331
    try:
paul@40 332
        lines = [x.split("|") for x in f.readlines()]
paul@40 333
        lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))
paul@40 334
paul@40 335
        # Reconstruct the lines, optionally changing the titles.
paul@40 336
paul@40 337
        result = []
paul@40 338
paul@40 339
        for line in lines:
paul@40 340
            version, _action, _archive_filename, filename, old_title, username, comment = line
paul@40 341
paul@40 342
            # Replace title information with the information already present.
paul@40 343
paul@53 344
            if not old_title:
paul@40 345
                new_title = title
paul@40 346
            else:
paul@40 347
                new_title = old_title
paul@40 348
paul@40 349
            # The version is omitted now that the manifest is ordered.
paul@40 350
paul@40 351
            line = _action, _archive_filename, filename, new_title, username, comment
paul@40 352
            result.append(line)
paul@40 353
paul@40 354
        return result
paul@40 355
paul@40 356
    finally:
paul@40 357
        f.close()
paul@40 358
paul@40 359
def serialise_manifest(manifest):

    """
    Return a string for the 'manifest' consisting of entries, joining the
    columns of each entry with "|" and leaving out the column following the
    action in "AddRevision" entries. Entries are concatenated directly, their
    final columns being expected to end each line.
    """

    serialised = []

    for entry in manifest:
        fields = list(entry)

        # Revision entries carry a placeholder after the action.

        if fields[0] == "AddRevision":
            fields.pop(1)

        serialised.append("|".join(fields))

    return "".join(serialised)
paul@40 375
            
paul@33 376
def sort_manifest(pages_dir, pageid, output=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content. If a "comments" file exists and the
    page has a title, a section including the comments will also be added.
    Neither is done if 'no_translate' is set to a true value.

    If an "attachments" file exists, its entries are sorted and added after
    the revisions.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.
    """

    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Prepare the additional sections once only, since they are the same for
    # every revision of the page.

    sections = []

    if result and not no_translate:

        # Add child page information to the content.

        if exists(children):
            child_page_names = [x for x in read(children).split("\n") if x]
            child_page_names.sort()
            child_pages = [" * [[%s]]" % child_page_name for child_page_name in child_page_names]
            sections.append(child_page_section % "\n".join(child_pages))

        # Add comments to the content.

        if exists(comments) and title:
            sections.append(comment_section % title)

    for _action, _archive_filename, filename, new_title, username, comment in result:
        for section in sections:
            append(filename, section)

    # Add the attachments to the manifest.

    if exists(attachments):
        result += _sort_manifest(attachments, title)

    # Serialise the manifest.

    s = serialise_manifest(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)
paul@3 444
paul@28 445
# Template for child page information: a rule followed by the text
# substituted for the placeholder.

child_page_section = (
    "\n"
    "----\n"
    "\n"
    "%s\n"
    )

# Template for comments: a rule followed by an Include macro whose pattern
# is completed by the text substituted for the placeholder.

comment_section = (
    "\n"
    "----\n"
    "\n"
    "<<Include(\"^%s/\")>>\n"
    )
paul@32 460
paul@28 461
# Main program.
paul@28 462
paul@0 463
if __name__ == "__main__":

    # Imported here since the module's own imports do not provide this name.

    from os.path import normpath

    # Obtain the XML dump filename, the space name and any attachments
    # directory from the command line.

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]

        # An empty third argument, or the --no-translate option appearing in
        # its place, indicates that no attachments directory was given.

        if len(sys.argv) > 3 and sys.argv[3] and sys.argv[3] != "--no-translate":
            attachments = sys.argv[3]
        else:
            attachments = None
    except IndexError:
        print >>sys.stderr, """
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

com_entities.xml COM '' --no-translate
"""
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    # Refuse to overwrite the results of a previous conversion.

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is used since the file may be a zip
    # archive.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        for pageid in listdir(pages_dir):
            sort_manifest(pages_dir, pageid, output_manifest, no_translate)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:

                # Separate the attachments directory from its parent so that
                # archive names start with the directory's own name. Any
                # trailing separator is removed first, since it would leave
                # the directory name empty.

                attachments_parent, attachments_name = split(normpath(attachments))

                cwd = getcwd()

                # A directory given without a parent is found in the current
                # directory, which must not be changed to an empty path.

                if attachments_parent:
                    chdir(attachments_parent)

                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_filename in filenames:
                            # Have to "taint" archive filenames.
                            attachment_path = join(path, attachment_filename)
                            page_package.write(attachment_path, wikiutil.taintfilename(attachment_path))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member_name in zf.namelist():
                    if member_name.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member_name), zf.read(member_name))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()
paul@0 582
paul@0 583
# vim: tabstop=4 expandtab shiftwidth=4