#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

from os import chdir, getcwd, listdir, mkdir, makedirs, walk
from os.path import exists, extsep, isdir, join, split, splitext
from zipfile import ZipFile
from cStringIO import StringIO
from MoinMoin import wikiutil
import codecs
import xmlread
import wikiparser, xmlparser
import sys
import time, calendar

from common import get_page_title

def date_to_seconds(s):

    """
    Convert the date string 's' of the form "YYYY-MM-DD HH:MM:SS", optionally
    followed by a fractional part introduced by ".", to seconds since the
    epoch. The fractional part is discarded and the time is interpreted as UTC.
    """

    return calendar.timegm(time.strptime(s.split(".", 1)[0], "%Y-%m-%d %H:%M:%S"))

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', whose name is also used
        as the directory beneath which output is written. If 'no_translate' is
        set to a true value, body content is written out without translation.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        Once the object has been handled, the content dictionary is reset for
        the next object.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):
            self._handle_page(objecttype, identifier)

        elif objecttype == "BodyContent":
            self._handle_body()

        elif objecttype == "Attachment":
            self._handle_attachment(identifier)

        self.content = {}

    def _handle_page(self, objecttype, identifier):

        """
        Record metadata for the page, comment or blog post revision having the
        given 'objecttype' and 'identifier', using the current content
        dictionary.
        """

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle pages and revisions, adding revisions to the page manifest.
        # The original version is used as a unifying identifier for all the
        # different revisions (each of which being defined by a Page
        # element). Although "original" implies the first identifier used,
        # it actually appears to be the latest and will have the highest
        # version number.

        pageid = content.get("originalVersion", identifier)

        versionfile = join(versions_dir, identifier)

        # Note page metadata, not necessarily in the correct order.
        # For comments, the title will need to be rewritten, since they
        # should be defined in terms of their owner page.

        # NOTE: This only makes the current title available to comments.

        mkdirs(join(pages_dir, pageid))

        # Limit the title to a "safe" number of characters in order to avoid
        # filesystem issues.

        title = get_page_title(content["title"])

        if title:
            title = "%s/%s" % (self.space, title)
            write(join(pages_dir, pageid, "pagetitle"), title)

        # Note the type of the page.

        write(join(pages_dir, pageid, "pagetype"), objecttype)

        # See sort_manifest for access to this data.
        # NOTE(review): Values containing "|" or newlines are not escaped and
        # NOTE(review): would upset the later splitting of manifest lines.

        append(join(pages_dir, pageid, "manifest"),
            "%s|AddRevision|_|%s|%s|%s|%s|%d\n" % ( # blank added for consistency with AddAttachment
                content["version"],
                versionfile,
                title, # comment titles will incorporate the comment's position
                content["lastModifierName"],
                content["versionComment"],
                date_to_seconds(content["lastModificationDate"])
            ))

        # Add information to parent pages for child page lists.

        if "parent" in content:
            parentid = content["parent"]
            mkdirs(join(pages_dir, parentid))
            append(join(pages_dir, parentid, "children"), title + "\n")

        # Add creation details for comments to the owner page.
        # Since comments can be versioned, the date of the original version
        # is used, and only this "original" version has the owner property.

        if objecttype == "Comment" and "owner" in content:
            ownerid = content["owner"]
            mkdirs(join(pages_dir, ownerid))
            append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

        # Some metadata is not particularly relevant. For example,
        # ancestors, children, parent are navigation-related.

        # Other metadata could be added to the page content itself.
        # For example, labelling could be converted to categories.

    def _handle_body(self):

        """
        Write the body held in the current content dictionary to the version
        file named by the "content" property, translating it unless translation
        has been disabled for this handler.
        """

        content = self.content
        versions_dir = join(self.space, "versions")

        body = content["body"]
        if not body:
            body = "## Empty page."

        # NOTE: Very simple technique employed for guessing the format.

        # The handler's own setting is used here: relying on a module-level
        # name would only work when this file is run as a script.

        if self.no_translate:
            fn = write
        elif body.startswith("<"):
            fn = xmltranslate
        else:
            fn = translate

        try:
            fn(join(versions_dir, content["content"]), body)
        except:
            # Report the affected content before letting the error propagate.
            err = codecs.getwriter("utf-8")(sys.stderr)
            err.write("Error parsing %s\n" % content["content"])
            raise

    def _handle_attachment(self, identifier):

        """
        Record the attachment revision having the given 'identifier' in the
        "attachments" manifest of the page to which it belongs.
        """

        content = self.content
        pages_dir = join(self.space, "pages")

        pageid = content["content"]
        version = content["attachmentVersion"]

        attachid = content.get("originalVersion", identifier)

        # The page directory may not exist yet if the attachment is
        # encountered before the page itself.

        mkdirs(join(pages_dir, pageid))

        append(join(pages_dir, pageid, "attachments"),
            "%s|AddAttachment|%s|%s|%s|%s|%s|%d\n" % (
                version,
                # Have to "taint" archive filenames, although Moin will
                # probably handle package script filename tainting.
                wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                wikiutil.taintfilename(content["fileName"]),
                "", # pagename is substituted later
                content["lastModifierName"],
                content["comment"],
                date_to_seconds(content["lastModificationDate"])
            ))

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is not an error, but any other failure is raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:
        # Only tolerate the case where the directory is already present.
        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    f = codecs.open(filename, append and "a" or "w", encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is decoded as UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is called with the body and the output stream to
    perform the translation, with the wiki markup parser used by default. A
    pragma recording the filename is written before the translated content.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        out.write("#pragma page-filename %s\n" % filename)
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' using the XML parser.
    """

    translate(filename, body, xmlparser.parse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    # Each line holds a creation date and a comment page identifier, so that
    # sorting the split lines orders the comments by date.

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def _manifest_to_mapping(manifest, output_mapping):

    """
    Open the given 'manifest' and write a mapping from version identifiers to
    page names/titles to the file with the given 'output_mapping' filename.
    Entries without a title are omitted.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        mapping = []

        for line in f.readlines():
            version, _action, _archive_filename, filename, title, username, comment, mtime = line.split("|")
            if title:
                mapping.append((split(filename)[-1], title))

        append(output_mapping, "".join([("\t".join(x) + "\n") for x in mapping]))

    finally:
        f.close()

def _sort_manifest(manifest, title):

    """
    Open the given 'manifest' and sort it according to revision so that it will
    be added to MoinMoin in the correct order.

    If a 'title' is provided, the title column in the manifest will be augmented
    with that information. This is typically done for comments and is necessary
    for attachments.

    A list of manifest entries is returned, each being a tuple of the form
    (action, archive filename, filename, title, username, comment, mtime).
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.rstrip("\n").split("|") for x in f.readlines()]

        # The first column is the version number, compared numerically.

        lines.sort(key=lambda columns: int(columns[0]))

        # Reconstruct the lines, optionally changing the titles.

        result = []

        for line in lines:
            version, _action, _archive_filename, filename, old_title, username, comment, mtime = line

            # Replace title information only where none is already present.

            new_title = old_title or title

            # The version is omitted now that the manifest is ordered.

            result.append((_action, _archive_filename, filename, new_title, username, comment, mtime))

        return result

    finally:
        f.close()

def serialise_manifest(manifest):

    """
    Process the 'manifest' consisting of entries, removing superfluous columns,
    and return the entries as a single string with one line per entry. For
    revisions, the unused archive filename column is removed.
    """

    result = []

    for columns in manifest:
        action = columns[0]
        if action == "AddRevision":
            columns = list(columns)
            del columns[1]
        result.append("|".join(columns) + "\n")

    return "".join(result)

def sort_manifest(pages_dir, pageid, output_mapping=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that historical operations
    such as page renaming can be detected.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content, unless 'no_translate' is set.

    If 'output_mapping' is given, a mapping from version identifiers to page
    titles will be appended to the file having that filename.

    A list of manifest entries for the page's revisions and attachments is
    returned.
    """

    pagetype = join(pages_dir, pageid, "pagetype")
    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    page_type = exists(pagetype) and read(pagetype) or None

    if exists(pagetitle):
        title = read(pagetitle)
        space, _page_name = get_space_and_name(title)
    else:
        title = space = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Output a mapping of identifiers to page names.

    if output_mapping:
        _manifest_to_mapping(manifest, output_mapping)

    # The child page list is the same for every revision, so prepare it once.

    child_text = None

    if exists(children) and not no_translate:
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()

        # Produce links which hide the space prefix.

        child_pages = []

        for child_page_name in child_page_names:
            child_space, page_name = get_space_and_name(child_page_name)
            if child_space == space:
                child_page_label = page_name
            else:
                child_page_label = child_page_name

            child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))

        child_text = child_page_section % "\n".join(child_pages)

    add_comments = exists(comments) and title and not no_translate

    # Modify the content to include child pages and comments.

    last_title = None
    final_result = []

    for details in result:
        _action, _archive_filename, filename, new_title, username, comment, mtime = details

        # Detect renamed pages and add a redirect revision.

        if last_title and last_title != new_title and _action == "AddRevision":
            renaming_versionfile = filename + ".rename"
            final_result.append((_action, "_", renaming_versionfile, last_title, username, "Page renamed to %s" % new_title, mtime))
            write(renaming_versionfile, "#REDIRECT %s" % new_title)

        last_title = new_title

        # Add this revision to the manifest.

        final_result.append(details)

        # Obtain the text only if modifications are to be made.

        text = None

        # Add an ACL to comment pages so that people cannot change other
        # people's comments.
        # NOTE: This should match the PostComment action.

        if page_type == "Comment":
            text = """\
#acl %s:read,write,delete,revert All:read
#pragma comment-owner %s
%s""" % (username, username, text or read(filename))

        # Add child page information to the content.

        if child_text is not None:
            text = (text or read(filename)) + child_text

        # Add comments to the content.

        if add_comments:
            text = (text or read(filename)) + comment_section

        # Rewrite the file if necessary.

        if text:
            write(filename, text)

    # Add the attachments to the manifest.

    if exists(attachments):
        final_result += _sort_manifest(attachments, title)

    return final_result

def sort_final_manifest(entries, output):

    """
    Sort the manifest 'entries' by last modified time and serialise it.
    The manifest details will be appended to the file named by 'output'.
    The 'entries' list is sorted in place.
    """

    # The final entry in each element is the mtime.

    entries.sort(key=lambda columns: int(columns[-1]))

    # Serialise the manifest.

    s = serialise_manifest(entries)
    append(output, s)

def get_space_and_name(page_name):

    """
    Split the given 'page_name' at the first "/" and return a tuple containing
    the space and the remaining name. Where no "/" is present, the space is
    given as None and the name is the complete 'page_name'.
    """

    # Splitting never fails: a name without a separator yields one part only.

    parts = page_name.split("/", 1)

    if len(parts) == 2:
        return parts[0], parts[1]
    else:
        return None, page_name

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<IncludeComments>>
"""

# Main program.
563 564 if __name__ == "__main__": 565 try: 566 filename = sys.argv[1] 567 is_zipfile = splitext(filename)[-1] == extsep + "zip" 568 space = sys.argv[2] 569 if len(sys.argv) > 3 and sys.argv[3]: 570 attachments = sys.argv[3] 571 else: 572 attachments = None 573 except IndexError: 574 print >>sys.stderr, """ 575 Please specify an XML file containing Wiki data, a workspace name, and an 576 optional attachments directory location. For example: 577 578 %(progname)s com_entities.xml COM attachments 579 580 Adding --no-translate will unpack the Wiki but not translate the content. 581 When doing so without an attachments directory, add an empty argument as 582 follows: 583 584 %(progname)s com_entities.xml COM '' --no-translate 585 586 An archive can be used instead of the XML file, and since this may include 587 attachments, no additional attachments directory needs to be specified: 588 589 %(progname)s COM-123456-789012.zip COM 590 """ % {"progname" : split(sys.argv[0])[-1]} 591 592 sys.exit(1) 593 594 no_translate = "--no-translate" in sys.argv 595 596 if exists(space): 597 print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space 598 sys.exit(1) 599 600 package_zip = space + extsep + "zip" 601 602 if exists(package_zip): 603 print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip 604 sys.exit(1) 605 606 mkdir(space) 607 mkdirs(join(space, "pages")) 608 mkdirs(join(space, "versions")) 609 610 p = xmlread.ConfigurableParser() 611 handler = ConfluenceHandler(space, no_translate) 612 613 # Register handlers in the parser for different elements. 614 615 p["object"] = handler.handle_object 616 p["property"] = handler.handle_property 617 p["id"] = handler.handle_id 618 p["collection"] = handler.handle_collection 619 p["element"] = handler.handle_element 620 621 # Open the XML dump. 
622 623 f = open(filename) 624 625 if is_zipfile: 626 zf = ZipFile(f) 627 ff = StringIO(zf.read("entities.xml")) 628 else: 629 ff = f 630 631 # Parse the data. 632 633 try: 634 p.parse(ff) 635 636 # Tidy up the import manifests, sorting each of them by revision and 637 # finalising them. 638 639 pages_dir = join(space, "pages") 640 641 for pageid in listdir(pages_dir): 642 sort_comments(pages_dir, pageid) 643 644 output_mapping = join(space, "MAPPING") 645 646 output_manifest = join(space, "MOIN_PACKAGE") 647 append(output_manifest, "MoinMoinPackage|1\n") 648 649 entries = [] 650 651 for pageid in listdir(pages_dir): 652 entries += sort_manifest(pages_dir, pageid, output_mapping, no_translate) 653 654 sort_final_manifest(entries, output_manifest) 655 656 # Write the page package. 657 658 page_package = ZipFile(package_zip, "w") 659 660 try: 661 # Include the page revisions. 662 663 versions_dir = join(space, "versions") 664 665 for versionid in listdir(versions_dir): 666 page_package.write(join(versions_dir, versionid)) 667 668 # Include the attachments. 669 670 if attachments: 671 cwd = getcwd() 672 chdir(split(attachments)[0]) 673 try: 674 for path, dirnames, filenames in walk(split(attachments)[1]): 675 for filename in filenames: 676 # Have to "taint" archive filenames. 677 page_package.write(join(path, filename), wikiutil.taintfilename(join(path, filename))) 678 finally: 679 chdir(cwd) 680 elif is_zipfile: 681 for filename in zf.namelist(): 682 if filename.startswith("attachments"): 683 # Have to "taint" archive filenames. 684 page_package.writestr(wikiutil.taintfilename(filename), zf.read(filename)) 685 686 # Include only the top-level manifest. 687 688 page_package.write(output_manifest, "MOIN_PACKAGE") 689 690 finally: 691 page_package.close() 692 693 finally: 694 f.close() 695 696 # vim: tabstop=4 expandtab shiftwidth=4