#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import chdir, getcwd, listdir, mkdir, makedirs, walk
from os.path import exists, extsep, join, split, splitext
from zipfile import ZipFile
from cStringIO import StringIO
from MoinMoin import wikiutil
import codecs
import xmlread
import wikiparser, xmlparser
import sys
import time, calendar

from common import get_page_title

def date_to_seconds(s):

    """
    Convert the date string 's' of the form "YYYY-MM-DD HH:MM:SS", optionally
    followed by "." and further text which is ignored, to a number of seconds
    as computed by calendar.timegm.
    """

    return calendar.timegm(time.strptime(s.split(".", 1)[0], "%Y-%m-%d %H:%M:%S"))

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space' (used as the output
        directory and as the page title prefix). If 'no_translate' is set to a
        true value, body content is written out untranslated.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            # NOTE: This only makes the current title available to comments.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = get_page_title(title)

            if title:
                title = "%s/%s" % (self.space, title)
                write(join(pages_dir, pageid, "pagetitle"), title)

            # Note the type of the page.

            write(join(pages_dir, pageid, "pagetype"), objecttype)

            # See sort_manifest for access to this data.
            # NOTE(review): fields are written unescaped, so a "|" or newline
            # NOTE(review): inside a field would break the later split("|")
            # NOTE(review): into exactly eight columns - confirm the inputs.

            append(join(pages_dir, pageid, "manifest"),
                "%s|AddRevision|_|%s|%s|%s|%s|%d\n" % ( # blank added for consistency with AddAttachment
                    content["version"],
                    versionfile,
                    title, # comment titles will incorporate the comment's position
                    content["lastModifierName"],
                    content["versionComment"],
                    date_to_seconds(content["lastModificationDate"])
                    ))

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Add creation details for comments to the owner page.
            # Since comments can be versioned, the date of the original version
            # is used, and only this "original" version has the owner property.

            if objecttype == "Comment" and "owner" in content:
                ownerid = content["owner"]
                mkdirs(join(pages_dir, ownerid))
                append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            is_comment_page = content.get("content:class") == "Comment"

            # NOTE: Very simple technique employed for guessing the format.
            # The handler's own setting is consulted rather than a module-level
            # global, so that the class also works when imported.

            if self.no_translate:
                fn = notranslate
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            # Report which content failed before propagating any error.

            try:
                fn(join(versions_dir, content["content"]), body, is_comment_page)
            except:
                err = codecs.getwriter("utf-8")(sys.stderr)
                print >>err, "Error parsing", content["content"]
                raise

        # Handle attachments.

        elif objecttype == "Attachment":
            pageid = content["content"]
            version = content["attachmentVersion"]

            if "originalVersion" in content:
                attachid = content["originalVersion"]
            else:
                attachid = identifier

            # The owning page may not have been seen yet, so make sure that
            # its directory exists before appending to a file inside it.

            mkdirs(join(pages_dir, pageid))

            append(join(pages_dir, pageid, "attachments"),
                "%s|AddAttachment|%s|%s|%s|%s|%s|%d\n" % (
                    version,
                    # Have to "taint" archive filenames, although Moin will
                    # probably handle package script filename tainting.
                    wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                    wikiutil.taintfilename(content["fileName"]),
                    "", # pagename is substituted later
                    content["lastModifierName"],
                    content["comment"],
                    date_to_seconds(content["lastModificationDate"])
                    ))

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        property_name = attributes[-1]["name"]
        self.content[property_name] = text.strip()

        # Any class attribute is recorded under "<property name>:class".

        property_class = attributes[-1].get("class")
        if property_class:
            self.content["%s:%s" % (property_name, "class")] = property_class.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    path is tolerated; any other failure to create the directory is raised as
    OSError.
    """

    try:
        makedirs(name)
    except OSError:
        if not exists(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is written using the UTF-8 encoding.
    """

    f = codecs.open(filename, append and "a" or "w", encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is read using the UTF-8 encoding.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, is_comment_page, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', preceded by a "#pragma page-filename" line. The optional 'fn' is
    the parser function to be called with the body, the output stream and
    'is_comment_page'; wikiparser.parse is used if it is omitted.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        print >>out, "#pragma page-filename", filename
        fn(body, out, is_comment_page)
    finally:
        out.close()

def xmltranslate(filename, body, is_comment_page):

    "Translate the given 'body' to 'filename' using xmlparser.parse."

    translate(filename, body, is_comment_page, xmlparser.parse)

def notranslate(filename, body, is_comment_page):

    "Write the given 'body' to 'filename' unchanged, ignoring 'is_comment_page'."

    write(filename, body)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    # Each line has the form "<creation date>|<comment page id>".

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def _manifest_to_mapping(manifest, output_mapping):

    """
    Open the given 'manifest' and write a mapping from version identifiers to
    page names/titles to the file with the given 'output_mapping' filename.
    Entries without a title are omitted and the mapping is appended to any
    existing file content.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        mapping = []

        for columns in [x.split("|") for x in f.readlines()]:
            version, _action, _archive_filename, filename, title, username, comment, mtime = columns
            if title:

                # The final path component of the version file is the version
                # identifier.

                mapping.append((split(filename)[-1], title))

        append(output_mapping, "".join([("\t".join(x) + "\n") for x in mapping]))

    finally:
        f.close()

def _sort_manifest(manifest, title):

    """
    Open the given 'manifest' and sort it according to revision so that it will
    be added to MoinMoin in the correct order.

    If a 'title' is provided, the title column in the manifest will be augmented
    with that information. This is typically done for comments and is necessary
    for attachments.

    A list of manifest entries is returned.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.rstrip("\n").split("|") for x in f.readlines()]

        # Order numerically by the version column.

        lines.sort(key=lambda x: int(x[0]))

        # Reconstruct the lines, optionally changing the titles.

        result = []

        for line in lines:
            version, _action, _archive_filename, filename, old_title, username, comment, mtime = line

            # Replace title information with the information already present.

            if not old_title:
                new_title = title
            else:
                new_title = old_title

            # The version is omitted now that the manifest is ordered.

            line = _action, _archive_filename, filename, new_title, username, comment, mtime
            result.append(line)

        return result

    finally:
        f.close()

def serialise_manifest(manifest):

    """
    Process the 'manifest' consisting of entries, removing superfluous columns.
    A string containing one "|"-separated line per entry is returned.
    """

    result = []

    for columns in manifest:
        action = columns[0]
        if action == "AddRevision":

            # Drop the second column from revision entries.

            columns = list(columns)
            del columns[1]
        result.append("|".join(columns) + "\n")

    return "".join(result)

def sort_manifest(pages_dir, pageid, output_mapping=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that historical operations
    such as page renaming can be detected.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output_mapping' is given, a mapping from version identifiers to page
    titles will be appended to the file having that filename.

    If 'no_translate' is set to a true value, child page lists and the comments
    section are not added to the revision content.

    A list of manifest entries, including any attachment entries, is returned.
    """

    pagetype = join(pages_dir, pageid, "pagetype")
    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    page_type = exists(pagetype) and read(pagetype) or None

    if exists(pagetitle):
        title = read(pagetitle)
        space, _page_name = get_space_and_name(title)
    else:
        title = space = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Output a mapping of identifiers to page names.

    if output_mapping:
        _manifest_to_mapping(manifest, output_mapping)

    # Prepare the child page listing once, since it is the same for every
    # revision of the page.

    child_listing = None

    if exists(children) and not no_translate:
        child_pages = []
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()

        # Produce links which hide the space prefix.

        for child_page_name in child_page_names:
            child_space, page_name = get_space_and_name(child_page_name)
            if child_space == space:
                child_page_label = page_name
            else:
                child_page_label = child_page_name

            child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))

        child_listing = child_page_section % "\n".join(child_pages)

    include_comments = exists(comments) and title and not no_translate

    # Modify the content to include child pages and comments.

    last_title = None
    final_result = []

    for details in result:
        _action, _archive_filename, filename, new_title, username, comment, mtime = details

        # Detect renamed pages and add a redirect revision.

        if last_title and last_title != new_title and _action == "AddRevision":
            renaming_versionfile = filename + ".rename"
            final_result.append((_action, "_", renaming_versionfile, last_title, username, "Page renamed to %s" % new_title, mtime))
            write(renaming_versionfile, "#REDIRECT %s" % new_title)

        last_title = new_title

        # Add this revision to the manifest.

        final_result.append(details)

        # Obtain the text only if modifications are to be made.

        text = None

        # Add an ACL to comment pages so that people cannot change other
        # people's comments.
        # NOTE: This should match the PostComment action.

        if page_type == "Comment":
            text = """\
#acl %s:read,write,delete,revert All:read
#pragma comment-owner %s
%s""" % (username, username, text or read(filename))

        # Add child page information to the content.

        if child_listing is not None:
            text = (text or read(filename)) + child_listing

        # Add comments to the content.

        if include_comments:
            text = (text or read(filename)) + comment_section

        # Rewrite the file if necessary.

        if text:
            write(filename, text)

    # Add the attachments to the manifest.

    if exists(attachments):
        final_result += _sort_manifest(attachments, title)

    return final_result

def sort_final_manifest(entries, output):

    """
    Sort the manifest 'entries' by last modified time and serialise it.
    The manifest details will be appended to the file named by 'output'.
    The 'entries' list is sorted in place.
    """

    # The final entry in each element is the mtime.

    entries.sort(key=lambda x: int(x[-1]))

    # Serialise the manifest.

    s = serialise_manifest(entries)
    append(output, s)

def get_space_and_name(page_name):

    """
    Split the given 'page_name' at the first "/" and return a (space, name)
    pair. Where 'page_name' contains no "/", the space is None and the name is
    the complete 'page_name'.
    """

    # Splitting never raises an exception for a missing separator: it merely
    # yields a single part, so the number of parts must be tested.

    parts = page_name.split("/", 1)

    if len(parts) == 2:
        return parts[0], parts[1]

    return None, page_name

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<IncludeComments>>
"""

# Main program.

if __name__ == "__main__":
    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
        if len(sys.argv) > 3 and sys.argv[3]:
            attachments = sys.argv[3]
        else:
            attachments = None
    except IndexError:
        print >>sys.stderr, """
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

%(progname)s com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

%(progname)s com_entities.xml COM '' --no-translate

An archive can be used instead of the XML file, and since this may include
attachments, no additional attachments directory needs to be specified:

%(progname)s COM-123456-789012.zip COM
""" % {"progname" : split(sys.argv[0])[-1]}

        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is used because the file may be a zip
    # archive.

    f = open(filename, "rb")

    if is_zipfile:
        zf = ZipFile(f)
        ff = StringIO(zf.read("entities.xml"))
    else:
        ff = f

    # Parse the data.

    try:
        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_mapping = join(space, "MAPPING")

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        entries = []

        for pageid in listdir(pages_dir):
            entries += sort_manifest(pages_dir, pageid, output_mapping, no_translate)

        sort_final_manifest(entries, output_manifest)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:

                # Separate the attachments directory name from its parent,
                # tolerating a trailing path separator.

                attachments_parent, attachments_name = split(attachments)
                if not attachments_name:
                    attachments_parent, attachments_name = split(attachments_parent)

                cwd = getcwd()

                # Only change directory where there is a parent to change to:
                # an empty path cannot be given to chdir.

                if attachments_parent:
                    chdir(attachments_parent)
                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_filename in filenames:
                            # Have to "taint" archive filenames.
                            page_package.write(join(path, attachment_filename), wikiutil.taintfilename(join(path, attachment_filename)))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member in zf.namelist():
                    if member.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member), zf.read(member))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4