1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/convert.py Sat Mar 31 01:34:31 2012 +0200
1.3 @@ -0,0 +1,170 @@
1.4 +#!/usr/bin/env python
1.5 +
import codecs
from cStringIO import StringIO
from os import mkdir, makedirs, symlink
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile

import xmlread
1.12 +
class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, directory):

        """
        Initialise the handler to write its output beneath 'directory'.

        Pages are recorded under the "pages" subdirectory and revision bodies
        under the "versions" subdirectory of 'directory'. The "versions"
        subdirectory is not created by this class and must already exist.
        """

        # Properties and collections gathered for the object being read.
        self.content = {}

        # (class, text) pairs gathered for the collection being read.
        self.elements = []

        self.directory = directory

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        The object type is taken from the "class" attribute of the last entry
        in 'attributes' and the object identifier from the stripped 'text'.
        (NOTE(review): the remaining parameters are supplied by the xmlread
        parser and are not used here - confirm their meaning against xmlread.)

        For "Page" objects, links to the page's revisions are created where
        historical versions are listed, and a line describing the revision is
        appended to the page's "import" manifest. For "BodyContent" objects,
        the body is written to the versions directory. In all cases, the
        collected content is discarded afterwards.

        A KeyError is raised if a "Page" lacks one of the "version", "title",
        "lastModifierName" or "versionComment" properties, or if a
        "BodyContent" lacks "content" or "body".
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.directory, "pages")
        versions_dir = join(self.directory, "versions")

        # Handle particular types.

        if objecttype == "Page":

            # Handle pages.

            if "historicalVersions" in content:
                versions = content["historicalVersions"]

                # Make a page directory and links from the versions to separate
                # files. The page directory may already exist if one of the
                # page's historical revisions was seen first, so an existing
                # directory must be tolerated here.

                revisions_dir = join(pages_dir, identifier, "revisions")
                mkdirs(revisions_dir)

                # Historical versions are stored separately. Revision numbers
                # follow the position in the collection, even where non-page
                # entries are skipped.

                for n, (cls, version) in enumerate(versions):
                    if cls == "Page":
                        self._link_revision(revisions_dir, version, n + 1)

                # The page always seems to hold the current version itself.

                self._link_revision(revisions_dir, identifier, len(versions) + 1)

            # Handle pages and revisions, adding revisions to the page manifest.
            # Historical revisions refer to their page using "originalVersion".

            pageid = content.get("originalVersion", identifier)
            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.

            mkdirs(join(pages_dir, pageid))

            append(join(pages_dir, pageid, "import"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                content["title"],
                content["lastModifierName"],
                content["versionComment"]))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            write(join(versions_dir, content["content"]), content["body"])

        # Start afresh for the next object.

        self.content = {}

    def _link_revision(self, revisions_dir, version, number):

        """
        Link revision 'number' in 'revisions_dir' to the stored 'version'.

        A relative symbolic link is resolved from the directory holding the
        link. Since 'revisions_dir' is <directory>/pages/<page>/revisions,
        three levels up is the output directory itself, making the link valid
        however the output directory was specified.
        """

        symlink(join("..", "..", "..", "versions", version),
                join(revisions_dir, str(number)))

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary.

        The property name is the "name" attribute of the last entry in
        'attributes'; the value is the stripped 'text'.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text.

        The unstripped 'text' is appended to the penultimate entry of
        'all_text'. (NOTE(review): presumably this is the list of text
        fragments for the enclosing element - confirm against xmlread.)
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary.

        The elements gathered so far are stored under the collection's "name"
        attribute and a new, empty element list is started.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection.

        Each element is recorded as a (class, text) tuple using the "class"
        attribute of the last entry in 'attributes' and the stripped 'text'.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))
1.118 +
def mkdirs(name):

    """
    Make the directory 'name' along with any missing parent directories.

    An existing directory is tolerated. Any other failure, such as a lack of
    permissions or 'name' existing as a regular file, is raised as OSError
    instead of being silently ignored.
    """

    try:
        makedirs(name)
    except OSError:

        # Only the "already there" case is harmless: anything else would just
        # cause a more confusing failure later when writing into the directory.

        if not isdir(name):
            raise
1.124 +
def append(filename, s):

    "Add the text 's' to the end of the file 'filename' using write."

    write(filename, s, append=True)
1.127 +
def write(filename, s, append=False):

    """
    Write the text 's' to the file 'filename' encoded as UTF-8, replacing any
    existing content unless 'append' is true, in which case the text is added
    to the end of the file.
    """

    mode = "ab" if append else "wb"

    with codecs.open(filename, mode, encoding="utf-8") as out:
        out.write(s)
1.134 +
# Convert a Confluence export into a directory of pages and versions.
#
# Usage: convert.py <filename> <directory>
#
# The filename is either an XML document or, if it ends in ".zip", an archive
# containing "entities.xml". The directory must not already exist.

if __name__ == "__main__":
    import sys

    # Give a usage message rather than an IndexError for missing arguments.

    if len(sys.argv) < 3:
        sys.stderr.write("Usage: %s <filename> <directory>\n" % sys.argv[0])
        sys.exit(1)

    filename = sys.argv[1]
    is_zipfile = splitext(filename)[-1] == extsep + "zip"

    directory = sys.argv[2]

    # Refuse to write into an existing location so that output from separate
    # runs is never mixed together.

    if exists(directory):
        sys.stderr.write("Directory exists. Please choose another or remove its contents.\n")
        sys.exit(1)

    mkdir(directory)
    mkdirs(join(directory, "pages"))
    mkdirs(join(directory, "versions"))

    # Register the handler's methods for the element names of interest.

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(directory)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Archives are binary data, so the input must not be opened in text mode.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        p.parse(ff)
    finally:
        f.close()
1.172 +
1.173 +# vim: tabstop=4 expandtab shiftwidth=4