#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import chdir, getcwd, listdir, mkdir, makedirs, walk
from os.path import exists, extsep, isdir, join, normpath, split, splitext
from zipfile import ZipFile
from cStringIO import StringIO
from MoinMoin import wikiutil
import codecs
import xmlread
import wikiparser, xmlparser
import sys

from common import get_page_title

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the workspace directory 'space'. If
        'no_translate' is set to a true value, page bodies are written out
        verbatim instead of being passed through a parser.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        Once the object has been processed, the content dictionary is reset
        ready for the next object.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        # Handle particular types; other types are ignored.

        if objecttype in ("Page", "Comment", "BlogPost"):
            self._handle_page(objecttype, identifier, content)
        elif objecttype == "BodyContent":
            self._handle_body_content(content)
        elif objecttype == "Attachment":
            self._handle_attachment(identifier, content)

        self.content = {}

    def _handle_page(self, objecttype, identifier, content):

        """
        Record a revision of a page, comment or blog post of the given
        'objecttype' having the given 'identifier' and 'content' dictionary,
        writing title, type and manifest details to the page's directory.
        """

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle pages and revisions, adding revisions to the page manifest.
        # The original version is used as a unifying identifier for all the
        # different revisions (each of which being defined by a Page
        # element). Although "original" implies the first identifier used,
        # it actually appears to be the latest and will have the highest
        # version number.

        if "originalVersion" in content:
            pageid = content["originalVersion"]
        else:
            pageid = identifier

        versionfile = join(versions_dir, identifier)

        # Note page metadata, not necessarily in the correct order.
        # For comments, the title will need to be rewritten, since they
        # should be defined in terms of their owner page.

        # NOTE: This only makes the current title available to comments.

        mkdirs(join(pages_dir, pageid))

        # Limit the title to a "safe" number of characters in order to avoid
        # filesystem issues.

        title = get_page_title(content["title"])

        if title:
            title = "%s/%s" % (self.space, title)
            write(join(pages_dir, pageid, "pagetitle"), title)

        # Note the type of the page.

        write(join(pages_dir, pageid, "pagetype"), objecttype)

        # See sort_manifest for access to this data.

        append(join(pages_dir, pageid, "manifest"),
            "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment
                content["version"],
                versionfile,
                title, # comment titles will incorporate the comment's position
                content["lastModifierName"],
                content["versionComment"]
                ))

        # Add information to parent pages for child page lists.

        if "parent" in content:
            parentid = content["parent"]
            mkdirs(join(pages_dir, parentid))
            append(join(pages_dir, parentid, "children"), title + "\n")

        # Add creation details for comments to the owner page.
        # Since comments can be versioned, the date of the original version
        # is used, and only this "original" version has the owner property.

        if objecttype == "Comment" and "owner" in content:
            ownerid = content["owner"]
            mkdirs(join(pages_dir, ownerid))
            append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

        # Some metadata is not particularly relevant. For example,
        # ancestors, children, parent are navigation-related.

        # Other metadata could be added to the page content itself.
        # For example, labelling could be converted to categories.

    def _handle_body_content(self, content):

        """
        Write the body of a revision described by the 'content' dictionary to
        the versions directory, translating it unless translation is disabled.
        """

        versions_dir = join(self.space, "versions")

        body = content["body"]
        if not body:
            body = "## Empty page."

        # NOTE: Very simple technique employed for guessing the format.

        # The handler's own setting is consulted so that the class also works
        # when this module is imported rather than run as a script.

        if self.no_translate:
            fn = write
        elif body.startswith("<"):
            fn = xmltranslate
        else:
            fn = translate

        try:
            fn(join(versions_dir, content["content"]), body)
        except Exception:

            # Report the offending revision before propagating the error.

            err = codecs.getwriter("utf-8")(sys.stderr)
            print >>err, "Error parsing", content["content"]
            raise

    def _handle_attachment(self, identifier, content):

        """
        Record an attachment having the given 'identifier' and 'content'
        dictionary in the attachments manifest of the page it belongs to.
        """

        pages_dir = join(self.space, "pages")

        pageid = content["content"]
        version = content["attachmentVersion"]

        if "originalVersion" in content:
            attachid = content["originalVersion"]
        else:
            attachid = identifier

        # The page may not have been seen yet, so make sure its directory
        # exists before appending to a file within it.

        mkdirs(join(pages_dir, pageid))

        append(join(pages_dir, pageid, "attachments"),
            "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (
                version,
                # Have to "taint" archive filenames, although Moin will
                # probably handle package script filename tainting.
                wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                wikiutil.taintfilename(content["fileName"]),
                "", # pagename is substituted later
                content["lastModifierName"],
                content["comment"]
                ))

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, using the
        elements gathered so far, and start a new list of elements.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection as (class, text) pairs."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    directory is not an error, but any other failure is propagated.
    """

    try:
        makedirs(name)
    except OSError:
        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    f = codecs.open(filename, append and "a" or "w", encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the UTF-8 encoded file with the given 'filename', returning a
    string containing its contents.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is the parser function called with the body and
    the output stream; the wiki markup parser is used if it is omitted.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        print >>out, "#pragma page-filename", filename
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' using the XML parser.
    """

    translate(filename, body, xmlparser.parse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def _manifest_to_mapping(manifest, output_mapping):

    """
    Open the given 'manifest' and write a mapping from version identifiers to
    page names/titles to the file with the given 'output_mapping' filename.
    Entries without a title are omitted.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        mapping = []

        for line in f.readlines():
            version, _action, _archive_filename, filename, title, username, comment = line.split("|")
            if title:
                mapping.append((split(filename)[-1], title))

        append(output_mapping, "".join([("\t".join(x) + "\n") for x in mapping]))

    finally:
        f.close()

def _sort_manifest(manifest, title):

    """
    Open the given 'manifest' and sort it according to revision so that it will
    be added to MoinMoin in the correct order.

    If a 'title' is provided, the title column in the manifest will be augmented
    with that information. This is typically done for comments and is necessary
    for attachments.

    A list of manifest entries is returned.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]

        # Order numerically by version; the sort is stable, so entries with
        # the same version keep their relative order.

        lines.sort(key=lambda columns: int(columns[0]))

        # Reconstruct the lines, optionally changing the titles.

        result = []

        for version, _action, _archive_filename, filename, old_title, username, comment in lines:

            # Only supply the given title where none is already present.

            new_title = old_title or title

            # The version is omitted now that the manifest is ordered.

            result.append((_action, _archive_filename, filename, new_title, username, comment))

        return result

    finally:
        f.close()

def serialise_manifest(manifest):

    """
    Process the 'manifest' consisting of entries, removing superfluous columns,
    and return the entries as a single string.
    """

    result = []

    for columns in manifest:
        action = columns[0]

        # Revisions carry a placeholder archive filename column which is not
        # wanted in the output.

        if action == "AddRevision":
            columns = list(columns)
            del columns[1]
        result.append("|".join(columns))

    return "".join(result)

def sort_manifest(pages_dir, pageid, output=None, output_mapping=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    If 'output_mapping' is given, a mapping from version identifiers to page
    titles will be appended to the file having that filename.

    If 'no_translate' is set to a true value, neither child page lists nor
    comment inclusions are added to the revision content.
    """

    pagetype = join(pages_dir, pageid, "pagetype")
    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    page_type = exists(pagetype) and read(pagetype) or None

    if exists(pagetitle):
        title = read(pagetitle)
        space, _page_name = get_space_and_name(title)
    else:
        title = space = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Output a mapping of identifiers to page names.

    if output_mapping:
        _manifest_to_mapping(manifest, output_mapping)

    # Prepare the sections to be added to every revision. These do not depend
    # on the individual revision and so are produced only once.

    trailer = ""

    if not no_translate:

        # Add child page information to the content.

        if exists(children):
            child_pages = []
            child_page_names = [x for x in read(children).split("\n") if x]
            child_page_names.sort()

            # Produce links which hide the space prefix.

            for child_page_name in child_page_names:
                child_space, page_name = get_space_and_name(child_page_name)
                if child_space == space:
                    child_page_label = page_name
                else:
                    child_page_label = child_page_name

                child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))

            trailer += child_page_section % "\n".join(child_pages)

        # Add comments to the content.

        if exists(comments) and title:
            trailer += comment_section % title

    # Modify the content to include child pages and comments.

    for _action, _archive_filename, filename, new_title, username, comment in result:
        text = read(filename)

        # Add an ACL to comment pages so that people cannot change other
        # people's comments.

        if page_type == "Comment":
            text = "#acl %s:read,write,delete,revert All:read\n%s" % (username, text)

        # Rewrite the file.

        write(filename, text + trailer)

    # Add the attachments to the manifest.

    if exists(attachments):
        result += _sort_manifest(attachments, title)

    # Serialise the manifest.

    s = serialise_manifest(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

def get_space_and_name(page_name):

    """
    Split the given 'page_name' at the first "/" and return the space and the
    remaining name. Where no "/" is present, the space is None and the name is
    the 'page_name' itself.
    """

    parts = page_name.split("/", 1)

    # Splitting never raises an exception for a missing separator: it merely
    # yields a single part, which must be tested for explicitly.

    if len(parts) == 2:
        return parts
    return None, page_name

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<Include("^%s/")>>
"""

# Main program.

if __name__ == "__main__":

    # The option may appear anywhere, so it is kept apart from the positional
    # arguments and cannot be mistaken for the attachments directory.

    no_translate = "--no-translate" in sys.argv
    args = [arg for arg in sys.argv[1:] if arg != "--no-translate"]

    try:
        filename = args[0]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = args[1]
        if len(args) > 2 and args[2]:
            attachments = args[2]
        else:
            attachments = None
    except IndexError:
        print >>sys.stderr, """
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

%(progname)s com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

%(progname)s com_entities.xml COM '' --no-translate

An archive can be used instead of the XML file, and since this may include
attachments, no additional attachments directory needs to be specified:

%(progname)s COM-123456-789012.zip COM
""" % {"progname" : split(sys.argv[0])[-1]}

        sys.exit(1)

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Archives are binary data and must be opened as such.

    f = open(filename, is_zipfile and "rb" or "r")

    if is_zipfile:
        zf = ZipFile(f)
        ff = StringIO(zf.read("entities.xml"))
    else:
        ff = f

    # Parse the data.

    try:
        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_mapping = join(space, "MAPPING")

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        for pageid in listdir(pages_dir):
            sort_manifest(pages_dir, pageid, output_manifest, output_mapping, no_translate)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:

                # Work relative to the directory containing the attachments
                # directory so that archive names start with its name. The
                # path is normalised to tolerate a trailing separator, and a
                # bare directory name is taken to reside in the current
                # directory.

                attachments_parent, attachments_name = split(normpath(attachments))

                cwd = getcwd()
                chdir(attachments_parent or ".")
                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_filename in filenames:
                            # Have to "taint" archive filenames.
                            page_package.write(join(path, attachment_filename),
                                wikiutil.taintfilename(join(path, attachment_filename)))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member_name in zf.namelist():
                    if member_name.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member_name), zf.read(member_name))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4