#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import chdir, curdir, getcwd, listdir, mkdir, makedirs, walk
from os.path import exists, extsep, isdir, join, normpath, split, splitext
from zipfile import ZipFile
from cStringIO import StringIO
from MoinMoin import wikiutil
import codecs
import xmlread
import wikiparser, xmlparser
import sys

from common import get_page_title

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space' (used both as the output
        directory name and as the page title prefix). If 'no_translate' is set
        to a true value, page bodies are written out without being translated.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            # NOTE: This only makes the current title available to comments.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = get_page_title(title)

            if title:
                title = "%s/%s" % (self.space, title)
                write(join(pages_dir, pageid, "pagetitle"), title)

            # Note the type of the page.

            write(join(pages_dir, pageid, "pagetype"), objecttype)

            # See sort_manifest for access to this data.

            append(join(pages_dir, pageid, "manifest"),
                "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment
                    content["version"],
                    versionfile,
                    title, # comment titles will incorporate the comment's position
                    content["lastModifierName"],
                    content["versionComment"]
                    ))

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Add creation details for comments to the owner page.
            # Since comments can be versioned, the date of the original version
            # is used, and only this "original" version has the owner property.

            if objecttype == "Comment" and "owner" in content:
                ownerid = content["owner"]
                mkdirs(join(pages_dir, ownerid))
                append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # NOTE: Very simple technique employed for guessing the format.

            # The handler's own setting is consulted rather than a module-level
            # global so that the class also works when imported elsewhere.

            if self.no_translate:
                fn = write
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            try:
                fn(join(versions_dir, content["content"]), body)
            except Exception:

                # Report which version failed before propagating the error.

                err = codecs.getwriter("utf-8")(sys.stderr)
                err.write("Error parsing %s\n" % content["content"])
                raise

        # Handle attachments.

        elif objecttype == "Attachment":
            pageid = content["content"]
            version = content["attachmentVersion"]

            if "originalVersion" in content:
                attachid = content["originalVersion"]
            else:
                attachid = identifier

            append(join(pages_dir, pageid, "attachments"),
                "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (
                    version,
                    # Have to "taint" archive filenames, although Moin will
                    # probably handle package script filename tainting.
                    wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                    wikiutil.taintfilename(content["fileName"]),
                    "", # pagename is substituted later
                    content["lastModifierName"],
                    content["comment"]
                    ))

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    directory is not an error; any other failure is propagated to the caller.
    """

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the failure if the directory is actually there.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents, decoded from UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is the parser function to be called with the body
    and the output stream; it defaults to the wiki markup parser.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        out.write("#pragma page-filename %s\n" % filename)
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' using the XML parser.
    """

    translate(filename, body, xmlparser.parse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    # Each line has the form "creationDate|commentid", so sorting the split
    # lines orders the comments by their creation date text.

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def _manifest_to_mapping(manifest, output_mapping):

    """
    Open the given 'manifest' and write a mapping from version identifiers to
    page names/titles to the file with the given 'output_mapping' filename.
    Entries without a title are omitted from the mapping.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        mapping = []

        for text_line in f.readlines():
            _version, _action, _archive_filename, filename, title, _username, _comment = \
                text_line.rstrip("\n").split("|")

            # The version identifier is the final component of the filename.

            if title:
                mapping.append((split(filename)[-1], title))

        append(output_mapping, "".join([("\t".join(x) + "\n") for x in mapping]))

    finally:
        f.close()

def _sort_manifest(manifest, title):

    """
    Open the given 'manifest' and sort it according to revision so that it will
    be added to MoinMoin in the correct order.

    If a 'title' is provided, the title column in the manifest will be augmented
    with that information. This is typically done for comments and is necessary
    for attachments.

    A list of manifest entries is returned.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.rstrip("\n").split("|") for x in f.readlines()]

        # Order numerically by the version held in the first column.

        lines.sort(key=lambda columns: int(columns[0]))

        # Reconstruct the lines, optionally changing the titles.

        result = []

        for line in lines:
            _version, _action, _archive_filename, filename, old_title, username, comment = line

            # Replace title information with the information already present.

            if not old_title:
                new_title = title
            else:
                new_title = old_title

            # The version is omitted now that the manifest is ordered.

            result.append((_action, _archive_filename, filename, new_title, username, comment))

        return result

    finally:
        f.close()

def serialise_manifest(manifest):

    """
    Process the 'manifest' consisting of entries, removing superfluous columns,
    and return the entries as a single string with one entry per line.
    """

    result = []

    for columns in manifest:
        action = columns[0]

        # Revisions carry a placeholder second column which is dropped here.

        if action == "AddRevision":
            columns = list(columns)
            del columns[1]

        result.append("|".join(columns) + "\n")

    return "".join(result)

def sort_manifest(pages_dir, pageid, output=None, output_mapping=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    If 'output_mapping' is given, a mapping from version identifiers to page
    titles will be appended to the file having that filename.

    If 'no_translate' is set to a true value, child page lists and comment
    sections are not added to the revision content.
    """

    pagetype = join(pages_dir, pageid, "pagetype")
    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    if exists(pagetype):
        page_kind = read(pagetype)
    else:
        page_kind = None

    if exists(pagetitle):
        title = read(pagetitle)
        space, _page_name = get_space_and_name(title)
    else:
        title = space = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Output a mapping of identifiers to page names.

    if output_mapping:
        _manifest_to_mapping(manifest, output_mapping)

    # Modify the content to include child pages and comments.

    last_title = None
    final_result = []

    for details in result:
        _action, _archive_filename, filename, new_title, username, comment = details

        # Detect renamed pages and add a redirect revision.

        if last_title and last_title != new_title and _action == "AddRevision":
            renaming_versionfile = filename + ".rename"
            final_result.append((_action, "_", renaming_versionfile, last_title, username, "Page renamed to %s" % new_title))
            write(renaming_versionfile, "#REDIRECT %s" % new_title)

        last_title = new_title

        # Add this revision to the manifest.

        final_result.append(details)

        # Obtain the text only if modifications are to be made.

        text = None

        # Add an ACL to comment pages so that people cannot change other
        # people's comments.

        if page_kind == "Comment":
            text = "#acl %s:read,write,delete,revert All:read\n%s" % (username, text or read(filename))

        # Add child page information to the content.

        if exists(children) and not no_translate:
            child_pages = []
            child_page_names = [x for x in read(children).split("\n") if x]
            child_page_names.sort()

            # Produce links which hide the space prefix.

            for child_page_name in child_page_names:
                child_space, page_name = get_space_and_name(child_page_name)
                if child_space == space:
                    child_page_label = page_name
                else:
                    child_page_label = child_page_name

                child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))

            text = (text or read(filename)) + child_page_section % "\n".join(child_pages)

        # Add comments to the content.

        if exists(comments) and title and not no_translate:
            text = (text or read(filename)) + comment_section

        # Rewrite the file if necessary.

        if text:
            write(filename, text)

    # Add the attachments to the manifest.

    if exists(attachments):
        final_result += _sort_manifest(attachments, title)

    # Serialise the manifest.

    s = serialise_manifest(final_result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

def get_space_and_name(page_name):

    """
    Split the given 'page_name' at the first "/" and return a (space, name)
    tuple. Where no "/" is present, the space is given as None and the name is
    the complete 'page_name'.
    """

    # NOTE: str.split never raises IndexError, so the number of parts must be
    # NOTE: tested explicitly to detect names without a space prefix.

    parts = page_name.split("/", 1)

    if len(parts) == 2:
        return parts[0], parts[1]
    else:
        return None, page_name

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<IncludeComments>>
"""

# Main program.

if __name__ == "__main__":

    # Separate the option from the positional arguments so that it can never
    # be mistaken for the attachments directory.

    no_translate = "--no-translate" in sys.argv
    args = [arg for arg in sys.argv[1:] if arg != "--no-translate"]

    try:
        filename = args[0]
        space = args[1]
    except IndexError:
        sys.stderr.write("""
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

%(progname)s com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

%(progname)s com_entities.xml COM '' --no-translate

An archive can be used instead of the XML file, and since this may include
attachments, no additional attachments directory needs to be specified:

%(progname)s COM-123456-789012.zip COM
""" % {"progname" : split(sys.argv[0])[-1]} + "\n")

        sys.exit(1)

    is_zipfile = splitext(filename)[-1] == extsep + "zip"

    if len(args) > 2 and args[2]:
        attachments = args[2]
    else:
        attachments = None

    if exists(space):
        sys.stderr.write("Directory exists for space %s. Please choose another or remove its contents.\n" % space)
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        sys.stderr.write("Page package exists. Please remove or rename it: %s\n" % package_zip)
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Archives must be read as binary data.

    if is_zipfile:
        f = open(filename, "rb")
    else:
        f = open(filename)

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_mapping = join(space, "MAPPING")

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        for pageid in listdir(pages_dir):
            sort_manifest(pages_dir, pageid, output_manifest, output_mapping, no_translate)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:

                # Normalise the location so that a trailing separator does not
                # leave an empty directory name, and fall back to the current
                # directory where the location has no parent component, since
                # changing to an empty directory name is an error.

                attachments_parent, attachments_name = split(normpath(attachments))

                cwd = getcwd()
                chdir(attachments_parent or curdir)
                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_filename in filenames:
                            # Have to "taint" archive filenames.
                            attachment_path = join(path, attachment_filename)
                            page_package.write(attachment_path, wikiutil.taintfilename(attachment_path))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member_name in zf.namelist():
                    if member_name.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member_name), zf.read(member_name))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4