#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import chdir, getcwd, listdir, mkdir, makedirs, walk
from os.path import exists, extsep, join, split, splitext
from zipfile import ZipFile
from cStringIO import StringIO
from MoinMoin import wikiutil
import codecs
import xmlread
import wikiparser, xmlparser
import sys
import time, calendar

from common import get_page_title

def date_to_seconds(s):

    """
    Convert the timestamp string 's', of the form "YYYY-MM-DD HH:MM:SS" with an
    optional fractional part following a ".", to a number of seconds since the
    epoch. The fractional part is discarded and the remaining fields are passed
    to calendar.timegm, which treats them as UTC.
    """

    return calendar.timegm(time.strptime(s.split(".", 1)[0], "%Y-%m-%d %H:%M:%S"))

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', which names the output
        directory and is used as the prefix of page titles. If 'no_translate'
        is set to a true value, body content is written without translation.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        The content dictionary is reset once the object has been handled.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            # NOTE: This only makes the current title available to comments.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = get_page_title(title)

            if title:
                title = "%s/%s" % (self.space, title)
                write(join(pages_dir, pageid, "pagetitle"), title)

            # Note the type of the page.

            write(join(pages_dir, pageid, "pagetype"), objecttype)

            # See sort_manifest for access to this data.

            append(join(pages_dir, pageid, "manifest"),
                "%s|AddRevision|_|%s|%s|%s|%s|%d\n" % ( # blank added for consistency with AddAttachment
                    content["version"],
                    versionfile,
                    title, # comment titles will incorporate the comment's position
                    content["lastModifierName"],
                    content["versionComment"],
                    date_to_seconds(content["lastModificationDate"])
                    ))

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Add creation details for comments to the owner page.
            # Since comments can be versioned, the date of the original version
            # is used, and only this "original" version has the owner property.

            if objecttype == "Comment" and "owner" in content:
                ownerid = content["owner"]
                mkdirs(join(pages_dir, ownerid))
                append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # NOTE: Very simple technique employed for guessing the format.

            # The handler's own setting is consulted: a module-level
            # 'no_translate' only exists when this file is run as a script.

            if self.no_translate:
                fn = write
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            try:
                fn(join(versions_dir, content["content"]), body)
            except:
                # Report which content failed and then propagate the error.
                err = codecs.getwriter("utf-8")(sys.stderr)
                err.write("Error parsing %s\n" % content["content"])
                raise

        # Handle attachments.

        elif objecttype == "Attachment":
            pageid = content["content"]
            version = content["attachmentVersion"]

            if "originalVersion" in content:
                attachid = content["originalVersion"]
            else:
                attachid = identifier

            # The page directory may not exist yet if the attachment is
            # encountered before the page to which it belongs.

            mkdirs(join(pages_dir, pageid))

            append(join(pages_dir, pageid, "attachments"),
                "%s|AddAttachment|%s|%s|%s|%s|%s|%d\n" % (
                    version,
                    # Have to "taint" archive filenames, although Moin will
                    # probably handle package script filename tainting.
                    wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                    wikiutil.taintfilename(content["fileName"]),
                    "", # pagename is substituted later
                    content["lastModifierName"],
                    content["comment"],
                    date_to_seconds(content["lastModificationDate"])
                    ))

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    'name' is tolerated; any other failure to create it is raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only ignore the error when the path is already present; permission
        # and similar problems must not be silently discarded.

        if not exists(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is written using the UTF-8 encoding.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is decoded as UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is the parser function called with the body and
    the output stream; wikiparser.parse is used if it is omitted.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        # Record the output filename at the top of the file.
        out.write("#pragma page-filename %s\n" % filename)
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' produced using xmlparser.parse.
    """

    translate(filename, body, xmlparser.parse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    # Each line has the form "creationDate|commentid", so sorting the split
    # lines orders the comments by their creation date strings.

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def _manifest_to_mapping(manifest, output_mapping):

    """
    Open the given 'manifest' and write a mapping from version identifiers to
    page names/titles to the file with the given 'output_mapping' filename.
    Entries without a title are omitted, and the mapping is appended to any
    existing content as tab-separated lines.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        mapping = []

        for record in f.readlines():
            version, _action, _archive_filename, filename, title, username, comment, mtime = record.split("|")

            # The version identifier is the final component of the version
            # file's path.

            if title:
                mapping.append((split(filename)[-1], title))

        append(output_mapping, "".join(["\t".join(x) + "\n" for x in mapping]))

    finally:
        f.close()

def _sort_manifest(manifest, title):

    """
    Open the given 'manifest' and sort it according to revision so that it will
    be added to MoinMoin in the correct order.

    If a 'title' is provided, the title column in the manifest will be augmented
    with that information. This is typically done for comments and is necessary
    for attachments.

    A list of manifest entries is returned.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.rstrip("\n").split("|") for x in f.readlines()]

        # The first column is the numeric version; the sort is stable, so
        # entries with equal versions retain their file order.

        lines.sort(key=lambda columns: int(columns[0]))

        # Reconstruct the lines, optionally changing the titles.

        result = []

        for line in lines:
            version, _action, _archive_filename, filename, old_title, username, comment, mtime = line

            # Replace title information with the information already present.

            if not old_title:
                new_title = title
            else:
                new_title = old_title

            # The version is omitted now that the manifest is ordered.

            line = _action, _archive_filename, filename, new_title, username, comment, mtime
            result.append(line)

        return result

    finally:
        f.close()

def serialise_manifest(manifest):

    """
    Process the 'manifest' consisting of entries, removing superfluous columns.
    For "AddRevision" entries, the second column is removed. A string is
    returned containing one "|"-separated line for each entry.
    """

    result = []

    for columns in manifest:
        action = columns[0]
        if action == "AddRevision":
            columns = list(columns)
            del columns[1]
        result.append("|".join(columns) + "\n")

    return "".join(result)

def sort_manifest(pages_dir, pageid, output_mapping=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that historical operations
    such as page renaming can be detected.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output_mapping' is given, a mapping from version identifiers to page
    titles will be appended to the file having that filename.

    If 'no_translate' is set to a true value, child page and comment sections
    are not added to the content.

    A list of manifest entries for the page's revisions, followed by those for
    its attachments, is returned.
    """

    pagetype = join(pages_dir, pageid, "pagetype")
    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    if exists(pagetype):
        page_type = read(pagetype)
    else:
        page_type = None

    if exists(pagetitle):
        title = read(pagetitle)
        space, _page_name = get_space_and_name(title)
    else:
        title = space = None

    # Sort the revision manifest. A page directory may exist without a
    # manifest if it was only created to record children, comments or
    # attachments, in which case there are no revisions to process.

    if exists(manifest):
        result = _sort_manifest(manifest, title)

        # Output a mapping of identifiers to page names.

        if output_mapping:
            _manifest_to_mapping(manifest, output_mapping)
    else:
        result = []

    # Modify the content to include child pages and comments.

    last_title = None
    final_result = []

    for details in result:
        _action, _archive_filename, filename, new_title, username, comment, mtime = details

        # Detect renamed pages and add a redirect revision.

        if last_title and last_title != new_title and _action == "AddRevision":
            renaming_versionfile = filename + ".rename"
            final_result.append((_action, "_", renaming_versionfile, last_title, username, "Page renamed to %s" % new_title, mtime))
            write(renaming_versionfile, "#REDIRECT %s" % new_title)

        last_title = new_title

        # Add this revision to the manifest.

        final_result.append(details)

        # Obtain the text only if modifications are to be made.

        text = None

        # Add an ACL to comment pages so that people cannot change other
        # people's comments.

        if page_type == "Comment":
            text = "#acl %s:read,write,delete,revert All:read\n%s" % (username, read(filename))

        # Add child page information to the content.

        if exists(children) and not no_translate:
            child_pages = []
            child_page_names = [x for x in read(children).split("\n") if x]
            child_page_names.sort()

            # Produce links which hide the space prefix.

            for child_page_name in child_page_names:
                child_space, page_name = get_space_and_name(child_page_name)
                if child_space == space:
                    child_page_label = page_name
                else:
                    child_page_label = child_page_name

                child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))

            if text is None:
                text = read(filename)
            text = text + child_page_section % "\n".join(child_pages)

        # Add comments to the content.

        if exists(comments) and title and not no_translate:
            if text is None:
                text = read(filename)
            text = text + comment_section

        # Rewrite the file if necessary.

        if text is not None:
            write(filename, text)

    # Add the attachments to the manifest.

    if exists(attachments):
        final_result += _sort_manifest(attachments, title)

    return final_result

def sort_final_manifest(entries, output):

    """
    Sort the manifest 'entries' by last modified time and serialise it.
    The manifest details will be appended to the file named by 'output'.
    The 'entries' list is sorted in place.
    """

    # The final entry in each element is the mtime.

    entries.sort(key=lambda entry: int(entry[-1]))

    # Serialise the manifest.

    s = serialise_manifest(entries)
    append(output, s)

def get_space_and_name(page_name):

    """
    Split the given 'page_name' at the first "/" into a space and a name,
    returning them as a pair. Where 'page_name' contains no "/", the space is
    returned as None together with the unchanged 'page_name'.
    """

    parts = page_name.split("/", 1)

    # Splitting never fails: a name without a separator merely yields a single
    # part, which must be detected explicitly.

    if len(parts) == 2:
        return parts[0], parts[1]

    return None, page_name

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<IncludeComments>>
"""

# Main program.

if __name__ == "__main__":

    # Imported here since it is only needed by the script itself.

    from os.path import normpath

    # Interpret the arguments: the dump filename, the space name and an
    # optional attachments directory.

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]

        # An option given in place of the attachments directory must not be
        # mistaken for a directory name.

        if len(sys.argv) > 3 and sys.argv[3] and not sys.argv[3].startswith("--"):
            attachments = sys.argv[3]
        else:
            attachments = None
    except IndexError:
        sys.stderr.write("""
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

%(progname)s com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

%(progname)s com_entities.xml COM '' --no-translate

An archive can be used instead of the XML file, and since this may include
attachments, no additional attachments directory needs to be specified:

%(progname)s COM-123456-789012.zip COM
""" % {"progname" : split(sys.argv[0])[-1]})
        sys.stderr.write("\n")

        sys.exit(1)

    # NOTE: This is kept as a module-level name since other code in this file
    # NOTE: may refer to it.

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        sys.stderr.write("Directory exists for space %s. Please choose another or remove its contents.\n" % space)
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        sys.stderr.write("Page package exists. Please remove or rename it: %s\n" % package_zip)
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is used since the file may be an archive.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_mapping = join(space, "MAPPING")

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        entries = []

        for pageid in listdir(pages_dir):
            entries += sort_manifest(pages_dir, pageid, output_mapping, no_translate)

        sort_final_manifest(entries, output_manifest)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:

                # Normalise the path so that a trailing separator does not
                # produce an empty directory name, and visit the directory
                # from its parent so that archive names start with the
                # directory's own name. A path without a parent component is
                # taken relative to the current directory.

                parent, leaf = split(normpath(attachments))

                cwd = getcwd()
                chdir(parent or ".")
                try:
                    for path, dirnames, filenames in walk(leaf):
                        for attachment_name in filenames:
                            # Have to "taint" archive filenames.
                            member = join(path, attachment_name)
                            page_package.write(member, wikiutil.taintfilename(member))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member in zf.namelist():
                    if member.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member), zf.read(member))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4