An attempt to convert Confluence Wiki content into a form suitable for import into MoinMoin.

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/convert.py	Sat Mar 31 01:34:31 2012 +0200
     1.3 @@ -0,0 +1,170 @@
     1.4 +#!/usr/bin/env python
     1.5 +
     1.6 +from os import mkdir, makedirs, symlink
     1.7 +from os.path import extsep, join, splitext
     1.8 +from zipfile import ZipFile
     1.9 +from cStringIO import StringIO
    1.10 +import codecs
    1.11 +import xmlread
    1.12 +
    1.13 +class ConfluenceHandler:
    1.14 +
    1.15 +    "Handle content from a Confluence Wiki dump."
    1.16 +
    1.17 +    def __init__(self, directory):
    1.18 +        self.content = {}
    1.19 +        self.elements = []
    1.20 +        self.directory = directory
    1.21 +
    1.22 +    def handle_object(self, name, elements, attributes, all_text, text):
    1.23 +
    1.24 +        "Handle objects according to type."
    1.25 +
    1.26 +        objecttype = attributes[-1]["class"]
    1.27 +        identifier = text.strip()
    1.28 +        content = self.content
    1.29 +
    1.30 +        pages_dir = join(self.directory, "pages")
    1.31 +        versions_dir = join(self.directory, "versions")
    1.32 +
    1.33 +        # Handle particular types.
    1.34 +
    1.35 +        if objecttype == "Page":
    1.36 +
    1.37 +            # Handle pages.
    1.38 +
    1.39 +            if content.has_key("historicalVersions"):
    1.40 +                versions = content["historicalVersions"]
    1.41 +
    1.42 +                # Make a page directory and links from the versions to separate
    1.43 +                # files.
    1.44 +
    1.45 +                page_dir = join(pages_dir, identifier)
    1.46 +                revisions_dir = join(page_dir, "revisions")
    1.47 +
    1.48 +                mkdir(page_dir)
    1.49 +                mkdir(revisions_dir)
    1.50 +
    1.51 +                # Historical versions are stored separately.
    1.52 +
    1.53 +                for n, (cls, version) in enumerate(versions):
    1.54 +                    if cls == "Page":
    1.55 +                        symlink(join("..", "..", "..", "..", versions_dir, version), join(revisions_dir, str(n+1)))
    1.56 +
    1.57 +                # The page always seems to hold the current version itself.
    1.58 +
    1.59 +                symlink(join("..", "..", "..", "..", versions_dir, identifier), join(revisions_dir, str(len(versions)+1)))
    1.60 +
    1.61 +            # Handle pages and revisions, adding revisions to the page manifest.
    1.62 +
    1.63 +            if content.has_key("originalVersion"):
    1.64 +                pageid = content["originalVersion"]
    1.65 +            else:
    1.66 +                pageid = identifier
    1.67 +
    1.68 +            versionfile = join(versions_dir, identifier)
    1.69 +
    1.70 +            # Note page metadata, not necessarily in the correct order.
    1.71 +
    1.72 +            mkdirs(join(pages_dir, pageid))
    1.73 +
    1.74 +            append(join(pages_dir, pageid, "import"), "%s|AddRevision|%s|%s|%s|%s\n" % (
    1.75 +                content["version"],
    1.76 +                versionfile,
    1.77 +                content["title"],
    1.78 +                content["lastModifierName"],
    1.79 +                content["versionComment"]))
    1.80 +
    1.81 +            # Some metadata is not particularly relevant. For example,
    1.82 +            # ancestors, children, parent are navigation-related.
    1.83 +
    1.84 +            # Other metadata could be added to the page content itself.
    1.85 +            # For example, labelling could be converted to categories.
    1.86 +
    1.87 +        # Handle revisions.
    1.88 +
    1.89 +        elif objecttype == "BodyContent":
    1.90 +            write(join(versions_dir, content["content"]), content["body"])
    1.91 +
    1.92 +        self.content = {}
    1.93 +
    1.94 +    def handle_property(self, name, elements, attributes, all_text, text):
    1.95 +
    1.96 +        "Record properties in the current content dictionary."
    1.97 +
    1.98 +        self.content[attributes[-1]["name"]] = text.strip()
    1.99 +
   1.100 +    def handle_id(self, name, elements, attributes, all_text, text):
   1.101 +
   1.102 +        "Promote identifiers to the parent element's text."
   1.103 +
   1.104 +        all_text[-2].append(text)
   1.105 +
   1.106 +    def handle_collection(self, name, elements, attributes, all_text, text):
   1.107 +
   1.108 +        "Record collections in the current content dictionary."
   1.109 +
   1.110 +        self.content[attributes[-1]["name"]] = self.elements
   1.111 +        self.elements = []
   1.112 +
   1.113 +    def handle_element(self, name, elements, attributes, all_text, text):
   1.114 +
   1.115 +        "Add elements to the current collection."
   1.116 +
   1.117 +        self.elements.append((attributes[-1]["class"], text.strip()))
   1.118 +
   1.119 +def mkdirs(name):
   1.120 +    try:
   1.121 +        makedirs(name)
   1.122 +    except OSError:
   1.123 +        pass
   1.124 +
   1.125 +def append(filename, s):
   1.126 +    write(filename, s, True)
   1.127 +
   1.128 +def write(filename, s, append=False):
   1.129 +    f = codecs.open(filename, append and "ab" or "wb", encoding="utf-8")
   1.130 +    try:
   1.131 +        f.write(s)
   1.132 +    finally:
   1.133 +        f.close()
   1.134 +
   1.135 +if __name__ == "__main__":
   1.136 +    import sys
   1.137 +
   1.138 +    filename = sys.argv[1]
   1.139 +    is_zipfile = splitext(filename)[-1] == extsep + "zip"
   1.140 +
   1.141 +    directory = sys.argv[2]
   1.142 +
   1.143 +    if exists(directory):
   1.144 +        print >>sys.stderr, "Directory exists. Please choose another or remove its contents."
   1.145 +        sys.exit(1)
   1.146 +
   1.147 +    mkdir(directory)
   1.148 +    mkdirs(join(directory, "pages"))
   1.149 +    mkdirs(join(directory, "versions"))
   1.150 +
   1.151 +    p = xmlread.ConfigurableParser()
   1.152 +    handler = ConfluenceHandler(directory)
   1.153 +
   1.154 +    p["object"] = handler.handle_object
   1.155 +    p["property"] = handler.handle_property
   1.156 +    p["id"] = handler.handle_id
   1.157 +    p["collection"] = handler.handle_collection
   1.158 +    p["element"] = handler.handle_element
   1.159 +
   1.160 +    f = open(filename)
   1.161 +
   1.162 +    if is_zipfile:
   1.163 +        zf = ZipFile(f)
   1.164 +        ff = StringIO(zf.read("entities.xml"))
   1.165 +    else:
   1.166 +        ff = f
   1.167 +
   1.168 +    try:
   1.169 +        p.parse(ff)
   1.170 +    finally:
   1.171 +        f.close()
   1.172 +
   1.173 +# vim: tabstop=4 expandtab shiftwidth=4
2012-03-31	Paul Boddie	raw files shortlog changelog graph	An attempt to convert Confluence Wiki content into a form suitable for import into MoinMoin.
			convert.py (file)