#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, join, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import xmlread
import parser

class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The handler methods share the callback signature
    (name, elements, attributes, all_text, text). Only 'attributes',
    'all_text' and 'text' are used here: 'attributes[-1]' is read as the
    attribute mapping of the element being handled and 'all_text[-2]' as the
    text fragment list of its parent element.
    NOTE(review): the exact callback contract is defined by the xmlread module,
    which is not visible here - confirm against that module.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', which is used both as the
        output directory name and as the prefix of page titles. If
        'no_translate' is set to a true value, page bodies are written verbatim
        instead of being passed through the translator.
        """

        # Properties collected for the object currently being read.
        self.content = {}

        # Elements collected for the collection currently being read.
        self.elements = []

        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type, using the properties collected since
        the previous object. Pages, comments and blog posts add a revision
        entry to a page manifest; body content is written (or translated) to a
        file in the versions directory. The collected properties are discarded
        afterwards regardless of the object type.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]
            if title:
                title = "%s/%s" % (self.space, title)

            # The leading version column is only used for sorting and is
            # removed again by sort_manifest.

            append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                title or content["version"], # comment titles will incorporate the version
                content["lastModifierName"],
                content["versionComment"]))

            # Write comments as subpages.

            if "comments" in content:

                # Define a page directory for each comment, and write the page
                # title in a special file for later processing.

                for _comment, commentid in content["comments"]:
                    mkdirs(join(pages_dir, commentid))
                    append(join(pages_dir, commentid, "pagetitle"), title)

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # Use the setting given to the constructor rather than relying on
            # a module-level global that only exists when run as a script.

            if self.no_translate:
                fn = write
            else:
                fn = translate

            fn(join(versions_dir, content["content"]), body)

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, keyed by the
        element's "name" attribute and holding the stripped element text.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, keyed by the
        element's "name" attribute, and start a new, empty collection.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, where
        the text has surrounding whitespace removed.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    path is not treated as an error, but any other failure to create the
    directory is raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the failure if the path is actually there; otherwise
        # the problem (such as missing permissions) would merely resurface
        # later as a less obvious error when writing files.

        if not exists(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is written using the UTF-8 encoding.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is decoded as UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The translation is produced by the parser module, which writes
    directly to the UTF-8 encoded output file.
    """

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        parser.parse(body, out)
    finally:
        out.close()

def sort_manifest(filename, pagetitle, output=None):

    """
    Sort the manifest given in 'filename' according to revision. If a
    'pagetitle' file exists, the title column in the manifest will be augmented
    with the contents of that file. If 'output' is given, the manifest details
    will be appended to the file having that filename instead of being rewritten
    to the original manifest file.

    The leading revision column of each manifest line is interpreted as an
    integer for sorting and is omitted from the result.
    """

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order the entries numerically by revision.

    lines.sort(key=lambda x: int(x[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []
    for x in lines:
        if title is not None:
            x[3] = "%s/%s" % (title, x[3])

        # Drop the revision column, which was only needed for sorting.

        result.append("|".join(x[1:]))

    s = "".join(result)

    if output is None:
        write(filename, s)
    else:
        append(output, s)

if __name__ == "__main__":
    import sys

    filename = sys.argv[1]
    is_zipfile = splitext(filename)[-1] == extsep + "zip"
    space = sys.argv[2]

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is required for zip archives and avoids
    # any newline translation of the input.

    f = open(filename, "rb")

    try:
        if is_zipfile:

            # Extract the dump from the archive into memory, releasing the
            # archive as soon as the data has been obtained.

            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        manifest = join(pages_dir, pageid, "manifest")
        pagetitle = join(pages_dir, pageid, "pagetitle")
        sort_manifest(manifest, pagetitle, output_manifest)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()

# vim: tabstop=4 expandtab shiftwidth=4