ConfluenceConverter

Annotated convert.py

3:8c5fc57d4fbe
2012-03-31 Paul Boddie Changed the name of the manifest files and added support for sorting their contents.
paul@0 1
#!/usr/bin/env python
paul@0 2
paul@3 3
from os import listdir, mkdir, makedirs
paul@1 4
from os.path import exists, extsep, join, splitext
paul@0 5
from zipfile import ZipFile
paul@0 6
from cStringIO import StringIO
paul@0 7
import codecs
paul@0 8
import xmlread
paul@0 9
paul@0 10
class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The methods of this class are intended to be registered as callbacks on
    the parser, one for each kind of XML element of interest. Properties and
    collections are accumulated until the enclosing object is handled, at
    which point page manifests and revision files are written beneath the
    output directory.
    """

    def __init__(self, directory):

        """
        Initialise the handler to write its output beneath 'directory'.

        The 'content' dictionary holds the properties and collections recorded
        since the last object was handled. The 'elements' list holds the
        members recorded since the last collection was handled.
        """

        self.content = {}
        self.elements = []
        self.directory = directory

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        The type is taken from the "class" entry of the last item in
        'attributes' and the identifier from the stripped 'text'.

        For "Page" objects, a line describing the revision is appended to the
        manifest of the page to which the revision belongs. For "BodyContent"
        objects, the recorded body is written to the versions directory under
        the name given by the recorded "content" property. Other types are
        ignored.

        In all cases, the recorded content is discarded afterwards. A KeyError
        is raised if a property needed for the object type was not recorded.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.directory, "pages")
        versions_dir = join(self.directory, "versions")

        # Handle particular types.

        if objecttype == "Page":

            # Handle pages and revisions, adding revisions to the page manifest.
            # A revision refers to its page using "originalVersion"; an object
            # without that property is filed under its own identifier.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # NOTE(review): fields are written unescaped, so a "|" or newline
            # in a title or comment would corrupt the manifest line - confirm
            # whether the consumer of the manifest needs escaping.

            mkdirs(join(pages_dir, pageid))
            append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                content["title"],
                content["lastModifierName"],
                content["versionComment"]))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            write(join(versions_dir, content["content"]), content["body"])

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        entry of the last item in 'attributes' as the key and the stripped
        'text' as the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text'
        to the penultimate entry of 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, storing the
        elements gathered so far under the "name" entry of the last item in
        'attributes', and then start a new, empty list of elements.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, where
        the class is the "class" entry of the last item in 'attributes' and
        the text is the stripped 'text'.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0 91
paul@0 92
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth.

    An already existing directory is left untouched. Any other failure, such
    as insufficient permissions or a regular file occupying the path, is
    raised as OSError.
    """

    import errno
    from os.path import isdir

    try:
        makedirs(name)
    except OSError as exc:

        # Only an existing directory is harmless. Everything else would
        # otherwise surface later as a confusing failure to write a file.

        if exc.errno != errno.EEXIST or not isdir(name):
            raise
paul@0 100
paul@0 101
def append(filename, s):

    "Add the string 's' to the end of the file with the given 'filename'."

    write(filename, s, append=True)
paul@0 106
paul@0 107
def write(filename, s, append=False):

    """
    Store the string 's' in the file with the given 'filename', encoding it
    as UTF-8. Existing content is replaced unless the optional 'append'
    parameter is set to a true value, in which case 's' is added to the end
    of the file.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    out = codecs.open(filename, mode, encoding="utf-8")
    try:
        out.write(s)
    finally:
        out.close()
paul@0 119
paul@3 120
def sort_manifest(filename):

    """
    Sort the manifest given in 'filename' according to revision.

    Each line is expected to start with an integer revision followed by "|".
    The lines are ordered by that revision, the revision field is removed
    from each of them, and the file is rewritten in place. Blank lines are
    discarded. A ValueError is raised if a line does not start with an
    integer field.
    """

    # Read through the same UTF-8 codec used when writing, so that non-ASCII
    # titles and comments survive being written back.

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        text = f.read()
    finally:
        f.close()

    # Split on "\n" only: that is the terminator used when entries are
    # appended, and other Unicode line boundaries may occur inside a field.
    # Only the first "|" matters because the rest of the line is kept as is.

    entries = [line.split("|", 1) for line in text.split("\n") if line.strip()]

    # The sort is stable, so entries with equal revisions keep their order.

    entries.sort(key=lambda entry: int(entry[0]))

    # Entries consisting of nothing but a revision have no remainder to keep.

    write(filename, "".join([entry[1] + "\n" for entry in entries if len(entry) > 1]))
paul@3 133
paul@0 134
if __name__ == "__main__":
    import sys

    # Usage: convert.py <XML dump or zip archive> <new output directory>

    if len(sys.argv) < 3:
        sys.stderr.write("Usage: %s <dump file> <output directory>\n" % sys.argv[0])
        sys.exit(1)

    filename = sys.argv[1]
    is_zipfile = splitext(filename)[-1] == extsep + "zip"

    directory = sys.argv[2]

    # Refuse to write into an existing location.

    if exists(directory):
        sys.stderr.write("Directory exists. Please choose another or remove its contents.\n")
        sys.exit(1)

    pages_dir = join(directory, "pages")

    mkdir(directory)
    mkdirs(pages_dir)
    mkdirs(join(directory, "versions"))

    # Register the handler methods for the elements of interest.

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(directory)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is required for zip archives and leaves
    # the decoding of plain XML to the parser.

    f = open(filename, "rb")

    # Parse the data, closing the file even if the archive cannot be read.

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    for pageid in listdir(pages_dir):
        manifest = join(pages_dir, pageid, "manifest")
        sort_manifest(manifest)
paul@3 184
paul@0 185
# vim: tabstop=4 expandtab shiftwidth=4