#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import xmlread
import parser
import sys

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    Properties and collections are accumulated in 'content' as the parser
    reports them, and are consumed (and reset) when the enclosing object is
    reported. Output is written beneath the "pages" and "versions"
    subdirectories of the space directory.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', which names the directory
        beneath which output is written. If 'no_translate' is set to a true
        value, page bodies are written verbatim instead of being translated.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. The type is read from the "class"
        attribute in the last entry of 'attributes'. The accumulated content
        dictionary is reset once the object has been handled.
        """

        objecttype = attributes[-1]["class"]

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Any identifier is stored as the object's textual content.

            self._handle_page(text.strip())

        elif objecttype == "BodyContent":
            self._handle_body()

        # Properties and collections recorded so far belonged to this object.

        self.content = {}

    def _handle_page(self, identifier):

        "Record manifest, comment and child details for the page revision 'identifier'."

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle pages and revisions, adding revisions to the page manifest.
        # The original version is used as a unifying identifier for all the
        # different revisions (each of which being defined by a Page
        # element). Although "original" implies the first identifier used,
        # it actually appears to be the latest and will have the highest
        # version number.

        pageid = content.get("originalVersion", identifier)
        versionfile = join(versions_dir, identifier)

        # Note page metadata, not necessarily in the correct order.
        # For comments, the title will need to be rewritten, since they
        # should be defined in terms of their owner page.

        mkdirs(join(pages_dir, pageid))

        # Limit the title to a "safe" number of characters in order to avoid
        # filesystem issues.

        title = content["title"][:MAX_TITLE_LENGTH]

        if title:
            title = "%s/%s" % (self.space, title)

        # See sort_manifest for access to this data.
        # NOTE(review): fields are "|"-separated and newline-terminated, so a
        # NOTE(review): "|" or newline inside a title or version comment would
        # NOTE(review): presumably break sort_manifest - confirm against data.

        append(join(pages_dir, pageid, "manifest"),
            "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                title or content["version"], # comment titles will incorporate the version
                content["lastModifierName"],
                content["versionComment"]
                ))

        # Write comments as subpages.

        if "comments" in content:

            # Define a page directory for each comment, and write the page
            # title in a special file for later processing.

            for _comment, commentid in content["comments"]:
                mkdirs(join(pages_dir, commentid))
                append(join(pages_dir, commentid, "pagetitle"), title)

        # Add information to parent pages for child page lists.

        if "parent" in content:
            parentid = content["parent"]
            mkdirs(join(pages_dir, parentid))
            append(join(pages_dir, parentid, "children"), title + "\n")

        # Some metadata is not particularly relevant. For example,
        # ancestors, children, parent are navigation-related.

        # Other metadata could be added to the page content itself.
        # For example, labelling could be converted to categories.

    def _handle_body(self):

        "Write the body of a revision to its file in the versions directory."

        content = self.content

        body = content["body"]
        if not body:
            body = "## Empty page."

        # NOTE: Very simple technique employed for guessing the format.

        # Use the handler's own setting: a module-level 'no_translate' only
        # exists when this file is run as a script.

        if self.no_translate:
            fn = write
        elif body.startswith("<"):
            fn = xmltranslate
        else:
            fn = translate

        try:
            fn(join(self.space, "versions", content["content"]), body)
        except Exception:

            # Show the offending body before letting the error propagate.

            print >>sys.stderr, "Error parsing..."
            print >>sys.stderr, body
            raise

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        attribute as the key and the stripped 'text' as the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text' to
        the penultimate entry of 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, storing the
        elements gathered so far under the "name" attribute and starting a new
        element list.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, where
        the class is the "class" attribute and the text is stripped.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    directory is tolerated; any other failure is raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only the "already exists" case is harmless.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is decoded as UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is called with the body and the open output
    stream; if omitted, parser.parse is used.
    """

    fn = fn or parser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' produced by parser.xmlparse.
    """

    translate(filename, body, parser.xmlparse)

def sort_manifest(pageid, output=None, directory=None):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    If 'directory' is given, it names the directory containing the page
    directories. Otherwise, the module-level 'pages_dir' name is used, which
    is only defined when this file is run as a script.
    """

    if directory is None:
        directory = pages_dir

    manifest = join(directory, pageid, "manifest")
    pagetitle = join(directory, pageid, "pagetitle")
    children = join(directory, pageid, "children")

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Prepare child page information once, since it is the same for every
    # revision of the page.

    if exists(children):
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()
        child_pages = [" * [[%s]]" % x for x in child_page_names]
        child_section = child_page_section % "\n".join(child_pages)
    else:
        child_section = None

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order by the numeric version in the first column.

    lines.sort(key=lambda x: int(x[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []

    for line in lines:

        # The final column retains the newline terminating the line.

        version, _addrevision, filename, old_title, username, comment = line

        # Add title information to the information already present.

        if title is not None:
            new_title = "%s/%s" % (title, old_title)
        else:
            new_title = old_title

        # The version is omitted now that the manifest is ordered.

        line = _addrevision, filename, new_title, username, comment
        result.append("|".join(line))

        # Add child page information to the content.

        if child_section is not None:
            append(filename, child_section)

    s = "".join(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

# Template for child page information.

child_page_section = """
----

%s
"""

# Main program.

if __name__ == "__main__":
    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is needed for zip archives to be read
    # reliably on all platforms.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        sort_manifest(pageid, output_manifest, pages_dir)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()

# vim: tabstop=4 expandtab shiftwidth=4