ConfluenceConverter

Annotated convert.py

33:7fdb737bed89
2013-02-17 Paul Boddie Propagated the "no translate" option to functionality wanting to add extra content to pages.
paul@0 1
#!/usr/bin/env python
paul@0 2
paul@8 3
"""
paul@8 4
Confluence XML dump conversion to a MoinMoin-compatible representation.
paul@8 5
paul@33 6
Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@8 7
paul@8 8
This software is free software; you can redistribute it and/or
paul@8 9
modify it under the terms of the GNU General Public License as
paul@8 10
published by the Free Software Foundation; either version 2 of
paul@8 11
the License, or (at your option) any later version.
paul@8 12
paul@8 13
This software is distributed in the hope that it will be useful,
paul@8 14
but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8 15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
paul@8 16
GNU General Public License for more details.
paul@8 17
paul@8 18
You should have received a copy of the GNU General Public
paul@8 19
License along with this library; see the file LICENCE.txt
paul@8 20
If not, write to the Free Software Foundation, Inc.,
paul@8 21
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
paul@8 22
"""
paul@8 23
paul@3 24
from os import listdir, mkdir, makedirs
paul@1 25
from os.path import exists, extsep, join, splitext
paul@0 26
from zipfile import ZipFile
paul@0 27
from cStringIO import StringIO
paul@0 28
import codecs
paul@0 29
import xmlread
paul@11 30
import parser
paul@25 31
import sys
paul@0 32
paul@23 33
# Maximum number of characters retained from a page title; longer titles are
# truncated to this length when page metadata is recorded in order to avoid
# filesystem issues.
MAX_TITLE_LENGTH = 120
paul@23 34
paul@0 35
class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The handler methods are registered with an XML parser for particular
    element names. Properties and collections are accumulated in 'content'
    until the enclosing object is handled, at which point the accumulated
    details are written to the filesystem beneath the 'space' directory and
    the accumulated state is reset.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space' (the name of the workspace
        directory, also used as a prefix for page titles). If 'no_translate' is
        set to a true value, page bodies are written verbatim instead of being
        translated.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type, using the "class" attribute of the
        current element (the last entry in 'attributes') to identify the type,
        and the given 'text' as the object's identifier.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # Handle particular types. The content is a dictionary mapping names to
        # properties and collections.

        if objecttype in ("Page", "Comment", "BlogPost"):
            self._handle_page(objecttype, identifier, self.content)

        elif objecttype == "BodyContent":
            self._handle_body(self.content)

        # Start afresh for the next object.

        self.content = {}

    def _handle_page(self, objecttype, identifier, content):

        "Record metadata for a page, comment or blog post revision."

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle pages and revisions, adding revisions to the page manifest.
        # The original version is used as a unifying identifier for all the
        # different revisions (each of which being defined by a Page
        # element). Although "original" implies the first identifier used,
        # it actually appears to be the latest and will have the highest
        # version number.

        pageid = content.get("originalVersion", identifier)
        versionfile = join(versions_dir, identifier)

        # Note page metadata, not necessarily in the correct order.
        # For comments, the title will need to be rewritten, since they
        # should be defined in terms of their owner page.

        page_dir = join(pages_dir, pageid)
        mkdirs(page_dir)

        # Limit the title to a "safe" number of characters in order to avoid
        # filesystem issues.

        title = content["title"][:MAX_TITLE_LENGTH]

        if title:
            title = "%s/%s" % (self.space, title)
            write(join(page_dir, "pagetitle"), title)

        # See sort_manifest for access to this data.

        append(join(page_dir, "manifest"),
            "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                title, # comment titles will incorporate the comment's position
                content["lastModifierName"],
                content["versionComment"]
            ))

        # Add information to parent pages for child page lists.

        if "parent" in content:
            parent_dir = join(pages_dir, content["parent"])
            mkdirs(parent_dir)
            append(join(parent_dir, "children"), title + "\n")

        # Add creation details for comments to the owner page.
        # Since comments can be versioned, the date of the original version
        # is used, and only this "original" version has the owner property.

        if objecttype == "Comment" and "owner" in content:
            owner_dir = join(pages_dir, content["owner"])
            mkdirs(owner_dir)
            append(join(owner_dir, "comments"),
                "%s|%s\n" % (content["creationDate"], pageid))

        # Some metadata is not particularly relevant. For example,
        # ancestors, children, parent are navigation-related.

        # Other metadata could be added to the page content itself.
        # For example, labelling could be converted to categories.

    def _handle_body(self, content):

        "Write the body of a revision, translating it unless disabled."

        versions_dir = join(self.space, "versions")

        body = content["body"]
        if not body:
            body = "## Empty page."

        # The handler's own setting is consulted rather than any module-level
        # name, so that the class also works when used from another module.
        # NOTE: Very simple technique employed for guessing the format.

        if self.no_translate:
            fn = write
        elif body.startswith("<"):
            fn = xmltranslate
        else:
            fn = translate

        try:
            fn(join(versions_dir, content["content"]), body)
        except Exception:

            # Show the offending body before propagating the error.

            sys.stderr.write("Error parsing...\n")
            sys.stderr.write(body + "\n")
            raise

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        attribute of the current element as the key and the stripped 'text' as
        the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text' to
        the penultimate entry of 'all_text' (presumably the text collected for
        the parent element).
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, storing the
        elements gathered so far under the collection's "name" attribute and
        starting a new, empty list of elements.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, the
        class being the "class" attribute of the current element.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0 180
paul@0 181
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth.

    An already existing directory is left untouched. Any other failure (such as
    a lack of permissions, or a file occupying the path) is raised as OSError
    instead of being silently ignored.
    """

    from os.path import isdir

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the failure if the directory is actually present.

        if not isdir(name):
            raise
paul@0 189
paul@0 190
def append(filename, s):

    "Add the string 's' to the end of the file with the given 'filename'."

    write(filename, s, append=True)
paul@0 195
paul@0 196
def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's', encoding it as
    UTF-8. If the optional 'append' parameter is set to a true value, 's' will
    be appended to the file; otherwise, any existing content is replaced.
    """

    mode = "a" if append else "w"

    with codecs.open(filename, mode, encoding="utf-8") as f:
        f.write(s)
paul@0 208
paul@9 209
def read(filename):

    """
    Read from the file with the given 'filename', decoding it as UTF-8 and
    returning a string containing its entire contents.
    """

    with codecs.open(filename, encoding="utf-8") as f:
        return f.read()
paul@3 221
paul@25 222
def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is a callable accepting the body and an output
    stream to which the translation is written as UTF-8; where omitted, the
    parse function of the parser module is used.
    """

    if fn is None:
        fn = parser.parse

    with codecs.open(filename, "w", encoding="utf-8") as out:
        fn(body, out)
paul@11 236
paul@25 237
def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', using the parser module's xmlparse function to do the translation.
    """

    translate(filename, body, fn=parser.xmlparse)
paul@25 239
paul@31 240
def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.

    Where the owner page has no "pagetitle" file (which is only written for
    pages having a non-empty title), the comments are still sorted but the
    comment pages are not relabelled.
    """

    page_dir = join(pages_dir, pageid)
    comments = join(page_dir, "comments")

    if not exists(comments):
        return

    # Each line has the form "creationDate|commentid", meaning that sorting the
    # split lines orders the comments by creation date.

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Without a title for the owner page, no subpage names can be made.

    pagetitle = join(page_dir, "pagetitle")

    if not exists(pagetitle):
        return

    title = read(pagetitle)

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_created, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))
paul@31 272
paul@33 273
def sort_manifest(pages_dir, pageid, output=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content. If a "comments" file exists and the
    page has a title, a section including the comment pages is added as well.
    Neither addition is made if 'no_translate' is set to a true value.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    Page directories without a manifest (such as those created only because
    another page referred to them as a parent or owner) are ignored.
    """

    page_dir = join(pages_dir, pageid)
    manifest = join(page_dir, "manifest")
    pagetitle = join(page_dir, "pagetitle")
    children = join(page_dir, "children")
    comments = join(page_dir, "comments")

    # Without a manifest there are no revisions to process.

    if not exists(manifest):
        return

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Prepare any extra content once, since it is the same for every revision.

    extra_sections = []

    if not no_translate:

        # Add child page information to the content. One line is recorded for
        # each processed revision of a child, so duplicates are removed.

        if exists(children):
            child_page_names = sorted(set([x for x in read(children).split("\n") if x]))
            child_pages = [" * [[%s]]" % x for x in child_page_names]
            extra_sections.append(child_page_section % "\n".join(child_pages))

        # Add comments to the content.

        if exists(comments) and title:
            extra_sections.append(comment_section % title)

    # Read the manifest lines and order them by version number.

    with codecs.open(manifest, "r", encoding="utf-8") as f:
        lines = [x.split("|") for x in f.readlines()]

    lines.sort(key=lambda x: int(x[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []

    for version, _addrevision, filename, old_title, username, comment in lines:

        # Replace title information with the information already present.

        if title is not None:
            new_title = title
        else:
            new_title = old_title

        # The version is omitted now that the manifest is ordered. The comment
        # retains the newline that terminated the original line.

        result.append("|".join((_addrevision, filename, new_title, username, comment)))

        for section in extra_sections:
            append(filename, section)

    s = "".join(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)
paul@3 354
paul@28 355
# Template for the child page section added to the end of page content: a
# horizontal rule followed by the list of child pages.

child_page_section = (
    "\n"
    "----\n"
    "\n"
    "%s\n"
    )

# Template for the comments section added to the end of page content: a
# horizontal rule followed by an inclusion of the subpages of the given page.

comment_section = (
    "\n"
    "----\n"
    "\n"
    "<<Include(\"^%s/\")>>\n"
    )
paul@32 370
paul@28 371
# Main program.
paul@28 372
paul@0 373
if __name__ == "__main__":

    # Obtain the dump filename and workspace name, permitting the translation
    # option to appear anywhere amongst the arguments.

    no_translate = "--no-translate" in sys.argv
    args = [arg for arg in sys.argv[1:] if arg != "--no-translate"]

    try:
        filename = args[0]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = args[1]
    except IndexError:
        sys.stderr.write("Please specify an XML file containing Wiki data and a workspace name.\n")
        sys.stderr.write("For example: com_entities.xml COM\n")
        sys.exit(1)

    # Refuse to overwrite existing output.

    if exists(space):
        sys.stderr.write("Directory exists for space %s. Please choose another or remove its contents.\n" % space)
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        sys.stderr.write("Page package exists. Please remove or rename it: %s\n" % package_zip)
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is required for archives and avoids any
    # newline translation of the data.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    for pageid in listdir(pages_dir):
        sort_comments(pages_dir, pageid)

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        sort_manifest(pages_dir, pageid, output_manifest, no_translate)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()
paul@3 459
paul@0 460
# vim: tabstop=4 expandtab shiftwidth=4