#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

from os import chdir, getcwd, listdir, mkdir, makedirs, walk
from os.path import exists, extsep, isdir, join, normpath, split, splitext
from zipfile import ZipFile
from cStringIO import StringIO
from MoinMoin import wikiutil
import codecs
import xmlread
import wikiparser, xmlparser
import sys

from common import get_page_title

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space' (used both as the output
        directory and as the page title prefix). If 'no_translate' is set to a
        true value, body content is written out without being translated.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        Objects of unrecognised types are ignored. In all cases, the content
        dictionary is reset once the object has been handled.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):
            self._handle_page(objecttype, identifier, content)

        # Handle revisions.

        elif objecttype == "BodyContent":
            self._handle_body(content)

        # Handle attachments.

        elif objecttype == "Attachment":
            self._handle_attachment(identifier, content)

        self.content = {}

    def _handle_page(self, objecttype, identifier, content):

        """
        Record the page, comment or blog post revision of the given
        'objecttype' having the given 'identifier' and 'content' dictionary,
        adding the revision to the page manifest and noting relationships with
        parent and owner pages.
        """

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle pages and revisions, adding revisions to the page manifest.
        # The original version is used as a unifying identifier for all the
        # different revisions (each of which being defined by a Page
        # element). Although "original" implies the first identifier used,
        # it actually appears to be the latest and will have the highest
        # version number.

        if "originalVersion" in content:
            pageid = content["originalVersion"]
        else:
            pageid = identifier

        versionfile = join(versions_dir, identifier)

        # Note page metadata, not necessarily in the correct order.
        # For comments, the title will need to be rewritten, since they
        # should be defined in terms of their owner page.

        # NOTE: This only makes the current title available to comments.

        mkdirs(join(pages_dir, pageid))

        # Limit the title to a "safe" number of characters in order to avoid
        # filesystem issues.

        title = get_page_title(content["title"])

        if title:
            title = "%s/%s" % (self.space, title)
            write(join(pages_dir, pageid, "pagetitle"), title)

        # See sort_manifest for access to this data.

        append(join(pages_dir, pageid, "manifest"),
            "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment
                content["version"],
                versionfile,
                title, # comment titles will incorporate the comment's position
                content["lastModifierName"],
                content["versionComment"]
                ))

        # Add information to parent pages for child page lists.

        if "parent" in content:
            parentid = content["parent"]
            mkdirs(join(pages_dir, parentid))
            append(join(pages_dir, parentid, "children"), title + "\n")

        # Add creation details for comments to the owner page.
        # Since comments can be versioned, the date of the original version
        # is used, and only this "original" version has the owner property.

        if objecttype == "Comment" and "owner" in content:
            ownerid = content["owner"]
            mkdirs(join(pages_dir, ownerid))
            append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

        # Some metadata is not particularly relevant. For example,
        # ancestors, children, parent are navigation-related.

        # Other metadata could be added to the page content itself.
        # For example, labelling could be converted to categories.

    def _handle_body(self, content):

        """
        Write the body held by the given 'content' dictionary to the version
        file named by the dictionary's "content" entry, translating it unless
        translation has been disabled for this handler.
        """

        versions_dir = join(self.space, "versions")

        body = content["body"]
        if not body:
            body = "## Empty page."

        # NOTE: Very simple technique employed for guessing the format.

        # The handler's own setting is consulted here: relying on a module
        # global of the same name would only work when run as a script.

        if self.no_translate:
            fn = write
        elif body.startswith("<"):
            fn = xmltranslate
        else:
            fn = translate

        try:
            fn(join(versions_dir, content["content"]), body)
        except Exception:

            # Identify the offending content before propagating the error.

            err = codecs.getwriter("utf-8")(sys.stderr)
            print >>err, "Error parsing", content["content"]
            raise

    def _handle_attachment(self, identifier, content):

        """
        Record the attachment having the given 'identifier' and 'content'
        dictionary in the attachments list of the page owning it.
        """

        pages_dir = join(self.space, "pages")

        pageid = content["content"]
        version = content["attachmentVersion"]

        if "originalVersion" in content:
            attachid = content["originalVersion"]
        else:
            attachid = identifier

        # The owning page's directory is created here, as is done for parent
        # and owner pages, so that appending to the attachments list does not
        # depend on the page having been seen first.

        mkdirs(join(pages_dir, pageid))

        append(join(pages_dir, pageid, "attachments"),
            "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (
                version,
                # Have to "taint" archive filenames, although Moin will
                # probably handle package script filename tainting.
                wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                wikiutil.taintfilename(content["fileName"]),
                "", # pagename is substituted later
                content["lastModifierName"],
                content["comment"]
                ))

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is tolerated; any other failure to create the directory is
    raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only the "already exists" case is expected and harmless.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents decoded from UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The translation is performed by 'fn', which is called with the body
    and the output stream, and which defaults to the wiki markup parser.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        print >>out, "#pragma page-filename", filename
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' using the XML parser.
    """

    translate(filename, body, xmlparser.parse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    # Each line has the form "creationDate|commentid".

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def _manifest_to_mapping(manifest, output_mapping):

    """
    Open the given 'manifest' and write a mapping from version identifiers to
    page names/titles to the file with the given 'output_mapping' filename.
    Entries without a title are omitted from the mapping.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        mapping = []

        lines = [x.split("|") for x in f.readlines()]
        for line in lines:
            version, _action, _archive_filename, filename, title, username, comment = line
            if title:

                # The final path component of the version file is the version
                # identifier.

                mapping.append((split(filename)[-1], title))

        append(output_mapping, "".join([("\t".join(x) + "\n") for x in mapping]))

    finally:
        f.close()

def _sort_manifest(manifest, title):

    """
    Open the given 'manifest' and sort it according to revision so that it will
    be added to MoinMoin in the correct order.

    If a 'title' is provided, the title column in the manifest will be augmented
    with that information. This is typically done for comments and is necessary
    for attachments.

    A list of manifest entries is returned, each being a tuple of the form
    (action, archive filename, filename, title, username, comment), with the
    comment retaining the line ending read from the manifest.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]

        # Order numerically by version; the sort is stable, so entries with
        # equal versions keep their original relative order.

        lines.sort(key=lambda x: int(x[0]))

        # Reconstruct the lines, optionally changing the titles.

        result = []

        for line in lines:
            version, _action, _archive_filename, filename, old_title, username, comment = line

            # Replace title information with the information already present.

            if not old_title:
                new_title = title
            else:
                new_title = old_title

            # The version is omitted now that the manifest is ordered.

            line = _action, _archive_filename, filename, new_title, username, comment
            result.append(line)

        return result

    finally:
        f.close()

def serialise_manifest(manifest):

    """
    Process the 'manifest' consisting of entries, removing superfluous columns,
    and return the entries as a single string. For "AddRevision" entries, the
    placeholder column following the action is removed. Entries are joined
    without any separator since each is expected to carry its own line ending
    in its final column.
    """

    result = []

    for columns in manifest:
        action = columns[0]
        if action == "AddRevision":
            columns = list(columns)
            del columns[1]
        result.append("|".join(columns))

    return "".join(result)

def sort_manifest(pages_dir, pageid, output=None, output_mapping=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    If 'output_mapping' is given, a mapping from version identifiers to page
    titles will be appended to the file having that filename.

    If 'no_translate' is set to a true value, revision content is left
    untouched: neither child page lists nor comment inclusions are added.
    """

    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    if exists(pagetitle):
        title = read(pagetitle)
        space, _page_name = get_space_and_name(title)
    else:
        title = space = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Output a mapping of identifiers to page names.

    if output_mapping:
        _manifest_to_mapping(manifest, output_mapping)

    # Prepare the child page information once, since it is the same for every
    # revision of the page.

    child_section = None

    if exists(children) and not no_translate:
        child_pages = []
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()

        # Produce links which hide the space prefix.

        for child_page_name in child_page_names:
            child_space, page_name = get_space_and_name(child_page_name)
            if child_space == space:
                child_page_label = page_name
            else:
                child_page_label = child_page_name

            child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))

        child_section = child_page_section % "\n".join(child_pages)

    add_comments = exists(comments) and title and not no_translate

    # Modify the content to include child pages and comments.

    for _action, _archive_filename, filename, new_title, username, comment in result:

        # Add child page information to the content.

        if child_section is not None:
            append(filename, child_section)

        # Add comments to the content.

        if add_comments:
            append(filename, comment_section % title)

    # Add the attachments to the manifest.

    if exists(attachments):
        result += _sort_manifest(attachments, title)

    # Serialise the manifest.

    s = serialise_manifest(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

def get_space_and_name(page_name):

    """
    Split the given 'page_name' at the first "/" and return a tuple of the form
    (space, name). Where 'page_name' contains no "/", the space is given as
    None and the name is the unmodified 'page_name'.
    """

    # Splitting never raises an exception when the separator is absent: it
    # yields a single part, so the number of parts must be tested explicitly.

    parts = page_name.split("/", 1)

    if len(parts) == 2:
        return tuple(parts)

    return None, page_name

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<Include("^%s/")>>
"""

# Main program.

def main():

    """
    Convert the dump indicated by the program arguments into a page package:
    unpack the dump into a directory named after the space, finalise the
    manifests, and write a zip archive named after the space.
    """

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
        if len(sys.argv) > 3 and sys.argv[3]:
            attachments = sys.argv[3]
        else:
            attachments = None
    except IndexError:
        print >>sys.stderr, """
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

com_entities.xml COM '' --no-translate
"""
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is used since the file may be a zip
    # archive, which must not be subject to any newline translation.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_mapping = join(space, "MAPPING")

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        for pageid in listdir(pages_dir):
            sort_manifest(pages_dir, pageid, output_manifest, output_mapping, no_translate)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:

                # Normalise the location so that a trailing separator does not
                # produce an empty directory name, and tolerate a location
                # without any parent directory component (for which changing
                # to an empty directory name would fail).

                attachments_parent, attachments_name = split(normpath(attachments))

                cwd = getcwd()
                chdir(attachments_parent or ".")
                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_filename in filenames:
                            # Have to "taint" archive filenames.
                            attachment_path = join(path, attachment_filename)
                            page_package.write(attachment_path, wikiutil.taintfilename(attachment_path))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member in zf.namelist():
                    if member.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member), zf.read(member))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()

if __name__ == "__main__":
    main()

# vim: tabstop=4 expandtab shiftwidth=4