ConfluenceConverter

Annotated convert.py

20:702a040785d7
2012-06-05 Paul Boddie Added some documentation; added command line argument handling in the converter.
paul@0 1
#!/usr/bin/env python
paul@0 2
paul@8 3
"""
paul@8 4
Confluence XML dump conversion to a MoinMoin-compatible representation.
paul@8 5
paul@8 6
Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>
paul@8 7
paul@8 8
This software is free software; you can redistribute it and/or
paul@8 9
modify it under the terms of the GNU General Public License as
paul@8 10
published by the Free Software Foundation; either version 2 of
paul@8 11
the License, or (at your option) any later version.
paul@8 12
paul@8 13
This software is distributed in the hope that it will be useful,
paul@8 14
but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8 15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
paul@8 16
GNU General Public License for more details.
paul@8 17
paul@8 18
You should have received a copy of the GNU General Public
paul@8 19
License along with this library; see the file LICENCE.txt
paul@8 20
If not, write to the Free Software Foundation, Inc.,
paul@8 21
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
paul@8 22
"""
paul@8 23
paul@3 24
from os import listdir, mkdir, makedirs
paul@1 25
from os.path import exists, extsep, join, splitext
paul@0 26
from zipfile import ZipFile
paul@0 27
from cStringIO import StringIO
paul@0 28
import codecs
paul@0 29
import xmlread
paul@11 30
import parser
paul@0 31
paul@0 32
class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The handler methods share the signature (name, elements, attributes,
    all_text, text). Properties and collections are accumulated in
    'self.content' until the enclosing object is handled, at which point the
    accumulated details are written out beneath the 'space' directory and the
    accumulator is reset.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler to write output beneath the 'space' directory.
        If 'no_translate' is set to a true value, page bodies are written
        verbatim instead of being translated.
        """

        self.content = {}           # properties/collections of the current object
        self.elements = []          # (class, text) pairs of the current collection
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type, where the type is the "class"
        attribute of the current (last) entry in 'attributes' and the stripped
        'text' is used as the object identifier (presumably the value promoted
        by handle_id; this depends on the parser supplying the text).

        Page, Comment and BlogPost objects add a revision line to the manifest
        of their page; BodyContent objects cause a revision body to be written.
        All other types are ignored. The accumulated content is always reset
        once the object has been handled.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            # Qualify non-empty titles with the space name.

            title = content["title"]
            if title:
                title = "%s/%s" % (self.space, title)

            # Manifest line format:
            # version|AddRevision|versionfile|title|modifier|comment

            append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                title or content["version"], # comment titles will incorporate the version
                content["lastModifierName"],
                content["versionComment"]))

            # Write comments as subpages.

            if "comments" in content:

                # Define a page directory for each comment, and write the page
                # title in a special file for later processing.

                for _comment, commentid in content["comments"]:
                    mkdirs(join(pages_dir, commentid))
                    append(join(pages_dir, commentid, "pagetitle"), title)

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # Use the handler's own setting rather than relying on a global
            # of the same name happening to exist in the module.

            if self.no_translate:
                fn = write
            else:
                fn = translate

            fn(join(versions_dir, content["content"]), body)

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        attribute of the current element as the key and the stripped 'text' as
        the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text' to
        the second-last entry of 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary under the "name"
        attribute of the current element, then start a new, empty collection.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, where
        the class is the "class" attribute of the current element and the text
        is the stripped 'text'.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0 145
paul@0 146
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. A directory that
    already exists is accepted silently; any other failure (for example, a
    plain file occupying the path, or insufficient permissions) is raised as
    the original OSError.
    """

    from os.path import isdir

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the error if the directory is actually there.

        if not isdir(name):
            raise
paul@0 154
paul@0 155
def append(filename, s):

    """
    Add the string 's' to the end of the file with the given 'filename' by
    delegating to 'write' with appending enabled.
    """

    write(filename, s, append=True)
paul@0 160
paul@0 161
def write(filename, s, append=False):

    """
    Store the string 's' in the file with the given 'filename' using the UTF-8
    encoding. The file is overwritten unless the optional 'append' parameter is
    set to a true value, in which case 's' is added to the end of the file.
    """

    mode = "w"
    if append:
        mode = "a"

    out = codecs.open(filename, mode, encoding="utf-8")
    try:
        out.write(s)
    finally:
        out.close()
paul@0 173
paul@9 174
def read(filename):

    """
    Return the entire contents of the file with the given 'filename' as a
    string, decoding the file as UTF-8.
    """

    stream = codecs.open(filename, encoding="utf-8")
    try:
        text = stream.read()
    finally:
        stream.close()

    return text
paul@3 186
paul@11 187
def translate(filename, body):

    """
    Translate the given 'body' using the parser module, sending the result to
    the file with the given 'filename', which is opened for writing as UTF-8.
    """

    target = codecs.open(filename, "w", encoding="utf-8")
    try:
        parser.parse(body, target)
    finally:
        target.close()
paul@11 199
paul@10 200
def sort_manifest(filename, pagetitle, output=None):

    """
    Sort the manifest given in 'filename' according to revision. If a
    'pagetitle' file exists, the title column in the manifest will be augmented
    with the contents of that file. If 'output' is given, the manifest details
    will be appended to the file having that filename instead of being rewritten
    to the original manifest file.

    Each manifest line is expected to start with an integer revision column
    followed by "|"-separated fields; the revision column is used only for
    sorting and is removed from the lines that are written out.
    """

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Read the manifest, closing it before doing any further work.

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order the lines by numeric revision. The sort is stable, so lines
    # sharing a revision keep their original relative order.

    lines.sort(key=lambda x: int(x[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []
    for x in lines:
        if title is not None:
            x[3] = "%s/%s" % (title, x[3])

        # Omit the leading revision column.

        result.append("|".join(x[1:]))

    s = "".join(result)

    if output is None:
        write(filename, s)
    else:
        append(output, s)
paul@3 236
paul@0 237
if __name__ == "__main__":
    import sys

    # Obtain the dump filename and the workspace (space) name, which are the
    # first two command line arguments.

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    # Page bodies are written verbatim if this option appears anywhere in the
    # arguments.

    no_translate = "--no-translate" in sys.argv

    # Refuse to overwrite the results of an earlier conversion.

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    # Register the handler methods for the elements of interest.

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is required for archives and avoids any
    # newline translation of the data.

    f = open(filename, "rb")

    try:

        # For archives, extract the dump into memory.

        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        manifest = join(pages_dir, pageid, "manifest")
        pagetitle = join(pages_dir, pageid, "pagetitle")
        sort_manifest(manifest, pagetitle, output_manifest)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()
paul@3 322
paul@0 323
# vim: tabstop=4 expandtab shiftwidth=4