#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import chdir, curdir, getcwd, listdir, mkdir, makedirs, walk
from os.path import exists, extsep, join, split, splitext
from zipfile import ZipFile
from cStringIO import StringIO
from MoinMoin import wikiutil
import codecs
import xmlread
import wikiparser, xmlparser
import sys

from common import get_page_title

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', this being used both as
        the output directory name and as the prefix of page titles. If
        'no_translate' is set to a true value, page bodies are written out
        without being translated.
        """

        # Properties and collections gathered for the object being parsed.

        self.content = {}

        # Elements gathered for the collection being parsed.

        self.elements = []

        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        The last entry in 'attributes' is consulted for the "class" of the
        object. The 'name', 'elements' and 'all_text' parameters are not used
        here.

        The content dictionary is reset once the object has been handled.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            # NOTE: This only makes the current title available to comments.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = get_page_title(title)

            if title:
                title = "%s/%s" % (self.space, title)
                write(join(pages_dir, pageid, "pagetitle"), title)

            # See sort_manifest for access to this data.

            append(join(pages_dir, pageid, "manifest"),
                "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment
                    content["version"],
                    versionfile,
                    title, # comment titles will incorporate the comment's position
                    content["lastModifierName"],
                    content["versionComment"]
                    ))

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Add creation details for comments to the owner page.
            # Since comments can be versioned, the date of the original version
            # is used, and only this "original" version has the owner property.

            if objecttype == "Comment" and "owner" in content:
                ownerid = content["owner"]
                mkdirs(join(pages_dir, ownerid))
                append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # NOTE: Very simple technique employed for guessing the format.

            # The handler's own setting is consulted here rather than any
            # module-level name, so that the handler also works when this
            # module is imported instead of being run as a program.

            if self.no_translate:
                fn = write
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            # Report the affected content identifier before propagating any
            # error.

            try:
                fn(join(versions_dir, content["content"]), body)
            except:
                err = codecs.getwriter("utf-8")(sys.stderr)
                print >>err, "Error parsing", content["content"]
                raise

        # Handle attachments.

        elif objecttype == "Attachment":
            pageid = content["content"]
            version = content["attachmentVersion"]

            if "originalVersion" in content:
                attachid = content["originalVersion"]
            else:
                attachid = identifier

            append(join(pages_dir, pageid, "attachments"),
                "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (
                    version,
                    # Have to "taint" archive filenames, although Moin will
                    # probably handle package script filename tainting.
                    wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                    wikiutil.taintfilename(content["fileName"]),
                    "", # pagename is substituted later
                    content["lastModifierName"],
                    content["comment"]
                    ))

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. Any OSError, such as
    that raised when the directory already exists, is ignored.
    """

    try:
        makedirs(name)
    except OSError:
        pass

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is written using the UTF-8 encoding.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is read using the UTF-8 encoding.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. If the optional 'fn' is given, it is called with the body and the
    output stream in order to perform the translation; otherwise, the wiki
    markup parser is used.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        # Record the output filename at the top of the translated content.

        print >>out, "#pragma page-filename", filename
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', employing the XML markup parser.
    """

    translate(filename, body, xmlparser.parse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    # Each line provides a creation date and a comment page identifier.

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def _sort_manifest(manifest, title):

    """
    Open the given 'manifest' and sort it according to revision so that it will
    be added to MoinMoin in the correct order.

    If a 'title' is provided, the title column in the manifest will be augmented
    with that information. This is typically done for comments and is necessary
    for attachments.

    A list of manifest entries is returned.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]

        # Order the entries numerically by version. The sort is stable, so
        # entries with equal versions retain their order in the file.

        lines.sort(key=lambda x: int(x[0]))

        # Reconstruct the lines, optionally changing the titles.

        result = []

        for line in lines:
            version, _action, _archive_filename, filename, old_title, username, comment = line

            # Replace title information with the information already present.

            if not old_title:
                new_title = title
            else:
                new_title = old_title

            # The version is omitted now that the manifest is ordered.

            line = _action, _archive_filename, filename, new_title, username, comment
            result.append(line)

        return result

    finally:
        f.close()

def serialise_manifest(manifest):

    """
    Process the 'manifest' consisting of entries, removing superfluous columns.
    A string containing the serialised entries is returned.
    """

    result = []

    for columns in manifest:
        action = columns[0]

        # Revisions carry a placeholder column only for consistency with
        # attachment entries.

        if action == "AddRevision":
            columns = list(columns)
            del columns[1]

        result.append("|".join(columns))

    # The final column of each entry retains its line ending.

    return "".join(result)

def sort_manifest(pages_dir, pageid, output=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    If 'no_translate' is set to a true value, the revision content is left
    untouched, with neither child page lists nor comments being added.
    """

    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    if exists(pagetitle):
        title = read(pagetitle)
        space, _page_name = get_space_and_name(title)
    else:
        title = space = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Prepare child page information once, since it is the same for every
    # revision.

    child_section = None

    if exists(children) and not no_translate:
        child_pages = []
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()

        # Produce links which hide the space prefix.

        for child_page_name in child_page_names:
            child_space, page_name = get_space_and_name(child_page_name)
            if child_space == space:
                child_page_label = page_name
            else:
                child_page_label = child_page_name

            child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))

        child_section = child_page_section % "\n".join(child_pages)

    add_comments = exists(comments) and title and not no_translate

    for _action, _archive_filename, filename, _new_title, _username, _comment in result:

        # Add child page information to the content.

        if child_section is not None:
            append(filename, child_section)

        # Add comments to the content.

        if add_comments:
            append(filename, comment_section % title)

    # Add the attachments to the manifest.

    if exists(attachments):
        result += _sort_manifest(attachments, title)

    # Serialise the manifest.

    s = serialise_manifest(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

def get_space_and_name(page_name):

    """
    Split the given 'page_name' at the first "/", returning a tuple of the form
    (space, name). Where 'page_name' contains no "/", the space is given as None
    and the name is the complete 'page_name'.
    """

    # Splitting a name without a separator yields a single value which cannot
    # be unpacked as a pair, so test for the separator explicitly.

    if "/" in page_name:
        space, name = page_name.split("/", 1)
        return space, name
    else:
        return None, page_name

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<Include("^%s/")>>
"""

# Main program.

if __name__ == "__main__":
    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
        if len(sys.argv) > 3 and sys.argv[3]:
            attachments = sys.argv[3]
        else:
            attachments = None
    except IndexError:
        print >>sys.stderr, """
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

com_entities.xml COM '' --no-translate
"""
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is used because the dump may be a zip
    # archive, whose contents must not be subject to text mode conversions.

    f = open(filename, "rb")

    if is_zipfile:
        zf = ZipFile(f)
        ff = StringIO(zf.read("entities.xml"))
    else:
        ff = f

    # Parse the data.

    try:
        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        for pageid in listdir(pages_dir):
            sort_manifest(pages_dir, pageid, output_manifest, no_translate)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:
                attachments_parent, attachments_name = split(attachments)

                # Visit the attachments relative to their parent directory so
                # that archive names begin with the attachments directory name.
                # A bare directory name has an empty parent, for which the
                # current directory must be used since an empty path cannot be
                # entered.

                cwd = getcwd()
                chdir(attachments_parent or curdir)
                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_filename in filenames:
                            # Have to "taint" archive filenames.
                            attachment_path = join(path, attachment_filename)
                            page_package.write(attachment_path, wikiutil.taintfilename(attachment_path))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member in zf.namelist():
                    if member.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member), zf.read(member))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4