#!/usr/bin/env python

"""
Unpack a Confluence Wiki XML dump into a directory of pages and versions.

Usage: <script> <entities.xml or archive.zip> <output directory>

The output directory must not already exist. When the input filename ends in
".zip", the member "entities.xml" is read from the archive.
"""

from os import mkdir, makedirs, symlink
from os.path import exists, extsep, isdir, join, relpath, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import xmlread


class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The handler methods are registered with an xmlread.ConfigurableParser and
    write the following layout below the given directory:

      versions/<id>                 body text of each "BodyContent" object
      pages/<pageid>/import         one "AddRevision" manifest line per revision
      pages/<id>/revisions/<n>      symlinks to the numbered version files

    NOTE(review): the meaning of the handler arguments (name, elements,
    attributes, all_text, text) is defined by xmlread, which is not visible
    here; the code only relies on attributes[-1] being the attributes of the
    element being handled and all_text[-2] being the parent's text list.
    """

    def __init__(self, directory):
        """Initialise the handler to write output below 'directory'."""

        # Properties collected for the object currently being parsed.
        self.content = {}

        # (class, text) pairs collected for the collection being parsed.
        self.elements = []

        self.directory = directory

    def handle_object(self, name, elements, attributes, all_text, text):
        """
        Handle a completed object according to its "class" attribute.

        "Page" objects get a manifest entry (and, where historical versions
        are listed, a directory of revision symlinks). "BodyContent" objects
        have their body written to the versions directory. Other object types
        are ignored. The collected properties are discarded afterwards.

        Raises KeyError if an expected property is missing from the object and
        OSError/IOError if files or links cannot be created.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.directory, "pages")
        versions_dir = join(self.directory, "versions")

        if objecttype == "Page":

            if "historicalVersions" in content:
                versions = content["historicalVersions"]

                # Make a page directory and links from the versions to
                # separate files. The page directory may already exist if a
                # revision of this page was handled first, so tolerate that.

                revisions_dir = join(pages_dir, identifier, "revisions")
                mkdirs(revisions_dir)

                # Historical versions are stored separately.

                for n, (cls, version) in enumerate(versions):
                    if cls == "Page":
                        _link_revision(versions_dir, version, revisions_dir, n + 1)

                # The page always seems to hold the current version itself.

                _link_revision(versions_dir, identifier, revisions_dir,
                               len(versions) + 1)

            # Revisions refer to their page using "originalVersion"; the page
            # itself has no such property and is filed under its own
            # identifier.

            pageid = content.get("originalVersion", identifier)
            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.

            mkdirs(join(pages_dir, pageid))

            append(join(pages_dir, pageid, "import"),
                   "%s|AddRevision|%s|%s|%s|%s\n" % (
                       content["version"],
                       versionfile,
                       content["title"],
                       content["lastModifierName"],
                       content["versionComment"]))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        elif objecttype == "BodyContent":

            # The "content" property names the version that this body
            # belongs to.

            write(join(versions_dir, content["content"]), content["body"])

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):
        """Record a property in the current content dictionary by name."""

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):
        """Promote an identifier to the parent element's text."""

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):
        """
        Record the collected elements in the current content dictionary under
        the collection's name, then start a new, empty collection.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):
        """Add a (class, text) pair to the current collection."""

        self.elements.append((attributes[-1]["class"], text.strip()))


def _link_revision(versions_dir, version, revisions_dir, number):
    """
    Create the symlink 'number' in 'revisions_dir' pointing at the file for
    'version' in 'versions_dir'.

    The link target is expressed relative to 'revisions_dir' so that it
    resolves regardless of how many path components the output directory has.
    """

    target = relpath(join(versions_dir, version), revisions_dir)
    symlink(target, join(revisions_dir, str(number)))


def mkdirs(name):
    """
    Create the directory 'name' together with any missing parents.

    An already existing directory is not an error; any other failure is
    re-raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:
        if not isdir(name):
            raise


def append(filename, s):
    """Append the text 's' to 'filename', encoded as UTF-8."""

    write(filename, s, True)


def write(filename, s, append=False):
    """
    Write the text 's' to 'filename', encoded as UTF-8.

    The file is replaced unless 'append' is true, in which case the text is
    added to the end of any existing content.
    """

    mode = "ab" if append else "wb"
    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()


if __name__ == "__main__":
    import sys

    filename = sys.argv[1]
    is_zipfile = splitext(filename)[-1] == extsep + "zip"

    directory = sys.argv[2]

    if exists(directory):
        print >>sys.stderr, "Directory exists. Please choose another or remove its contents."
        sys.exit(1)

    mkdir(directory)
    mkdirs(join(directory, "pages"))
    mkdirs(join(directory, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(directory)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Binary mode is required for zip archives and harmless for plain XML.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        p.parse(ff)
    finally:
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4