paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@3 | 3 | from os import listdir, mkdir, makedirs |
paul@1 | 4 | from os.path import exists, extsep, join, splitext |
paul@0 | 5 | from zipfile import ZipFile |
paul@0 | 6 | from cStringIO import StringIO |
paul@0 | 7 | import codecs |
paul@0 | 8 | import xmlread |
paul@0 | 9 | |
class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The methods of this class are meant to be registered as element handlers
    with a parser. Each handler receives the same five arguments: the element
    'name', plus the 'elements', 'attributes' and 'all_text' collections and
    the 'text' of the element. The code below relies on 'attributes[-1]' being
    the attribute mapping of the element being handled and on 'all_text[-2]'
    being the list of text fragments of its parent (presumably these are
    stacks maintained by the parser; confirm against the parser module).

    Properties and collections are accumulated in 'self.content' until the
    enclosing object is handled, at which point the page manifest or the
    revision body is written below 'self.directory' and the accumulated
    content is discarded.
    """

    def __init__(self, directory):

        "Initialise the handler to write its output below 'directory'."

        # Properties and collections gathered for the object being read.
        self.content = {}

        # Elements gathered for the collection being read.
        self.elements = []

        # Root of the output tree; "pages" and "versions" are used below it.
        self.directory = directory

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        A "Page" object adds a revision line to the manifest of the page it
        belongs to; a "BodyContent" object writes the revision body to the
        versions directory. Objects of any other type produce no output.
        In all cases the accumulated content is reset afterwards.

        A KeyError is raised if a property needed for the output has not been
        recorded for the object.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.directory, "pages")
        versions_dir = join(self.directory, "versions")

        # Handle particular types.

        if objecttype == "Page":

            # Handle pages and revisions, adding revisions to the page manifest.
            # A page with an "originalVersion" property is filed under that
            # identifier; otherwise it is filed under its own identifier.

            pageid = content.get("originalVersion", identifier)

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # The leading version number allows the manifest to be sorted
            # later.

            mkdirs(join(pages_dir, pageid))

            append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                content["title"],
                content["lastModifierName"],
                content["versionComment"]))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":

            # The "content" property names the version file to receive the
            # body text.

            write(join(versions_dir, content["content"]), content["body"])

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        attribute as the key and the stripped element text as the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text'
        to the parent's entry in 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, storing the
        elements gathered so far under the "name" attribute and starting a new
        element list.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, where
        the class is the "class" attribute and the text is stripped.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0 | 91 | |
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth.

    It is not an error for the directory to exist already. Any other failure
    to create it (for example, insufficient permissions or a non-directory
    occupying part of the path) is raised as OSError.
    """

    from os.path import isdir

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the directory already being present. Other failures
        # would otherwise be hidden until a later write into the directory.

        if not isdir(name):
            raise
paul@0 | 100 | |
def append(filename, s):

    "Add the string 's' to the end of the file with the given 'filename'."

    write(filename, s, append=True)
paul@0 | 106 | |
def write(filename, s, append=False):

    """
    Store the string 's' in the file with the given 'filename', encoding it as
    UTF-8. The file is replaced unless the optional 'append' parameter is set
    to a true value, in which case 's' is added to the end of the file.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    out = codecs.open(filename, mode, encoding="utf-8")
    try:
        out.write(s)
    finally:
        out.close()
paul@0 | 119 | |
def sort_manifest(filename):

    """
    Sort the manifest given in 'filename' according to revision.

    Each line of the manifest is expected to start with an integer revision
    number followed by "|". The lines are rewritten in ascending revision
    order with that leading field removed. Lines with equal revision numbers
    keep their relative order. A ValueError is raised if a line does not start
    with an integer field.
    """

    # Read the manifest as UTF-8 text, since it is written back through the
    # UTF-8 encoding writer: passing undecoded bytes containing non-ASCII
    # characters to that writer would fail.

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        text = f.read()
    finally:
        f.close()

    # Split on "\n" only, keeping the terminators, so that other characters
    # regarded as line boundaries in Unicode text cannot split a record.

    pieces = text.split("\n")
    records = [piece + "\n" for piece in pieces[:-1]]
    if pieces[-1]:
        records.append(pieces[-1])

    # Separate the revision number from the remainder of each record.

    entries = [record.split("|", 1) for record in records]
    entries.sort(key=lambda entry: int(entry[0]))

    write(filename, "".join(["".join(entry[1:]) for entry in entries]))
paul@3 | 133 | |
if __name__ == "__main__":
    import sys

    # Expect the dump filename (an XML file or a zip archive containing
    # "entities.xml") and the output directory to be created.

    if len(sys.argv) < 3:
        print >>sys.stderr, "Usage: %s <dump file> <output directory>" % sys.argv[0]
        sys.exit(1)

    filename = sys.argv[1]
    is_zipfile = splitext(filename)[-1] == extsep + "zip"

    directory = sys.argv[2]

    # Refuse to write into an existing location so that manifests from
    # separate runs are never mixed.

    if exists(directory):
        print >>sys.stderr, "Directory exists. Please choose another or remove its contents."
        sys.exit(1)

    mkdir(directory)
    mkdirs(join(directory, "pages"))
    mkdirs(join(directory, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(directory)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is needed for zip archives to be read
    # reliably on platforms that distinguish text and binary files.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

    # Close the dump even if the archive cannot be read or parsing fails.

    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(directory, "pages")

    for pageid in listdir(pages_dir):
        sort_manifest(join(pages_dir, pageid, "manifest"))
paul@3 | 184 | |
paul@0 | 185 | # vim: tabstop=4 expandtab shiftwidth=4 |