paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@8 | 3 | """ |
paul@8 | 4 | Confluence XML dump conversion to a MoinMoin-compatible representation. |
paul@8 | 5 | |
paul@33 | 6 | Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk> |
paul@8 | 7 | |
paul@8 | 8 | This software is free software; you can redistribute it and/or |
paul@8 | 9 | modify it under the terms of the GNU General Public License as |
paul@8 | 10 | published by the Free Software Foundation; either version 2 of |
paul@8 | 11 | the License, or (at your option) any later version. |
paul@8 | 12 | |
paul@8 | 13 | This software is distributed in the hope that it will be useful, |
paul@8 | 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
paul@8 | 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
paul@8 | 16 | GNU General Public License for more details. |
paul@8 | 17 | |
paul@8 | 18 | You should have received a copy of the GNU General Public |
paul@8 | 19 | License along with this library; see the file LICENCE.txt |
paul@8 | 20 | If not, write to the Free Software Foundation, Inc., |
paul@8 | 21 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
paul@8 | 22 | """ |
paul@8 | 23 | |
paul@3 | 24 | from os import listdir, mkdir, makedirs |
paul@1 | 25 | from os.path import exists, extsep, join, splitext |
paul@0 | 26 | from zipfile import ZipFile |
paul@0 | 27 | from cStringIO import StringIO |
paul@0 | 28 | import codecs |
paul@0 | 29 | import xmlread |
paul@11 | 30 | import parser |
paul@25 | 31 | import sys |
paul@0 | 32 | |
# Maximum number of characters retained from a page title. Longer titles are
# truncated to this length in order to avoid filesystem issues.

MAX_TITLE_LENGTH = 120
paul@23 | 34 | |
class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The handler methods are registered with the XML parser for particular
    element names. Properties, collections and collection elements are
    accumulated until the enclosing object is complete, at which point
    'handle_object' writes the details to the filesystem and resets the
    accumulated content.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', this being the name of
        the directory under which "pages" and "versions" are written and also
        the prefix applied to page titles. If 'no_translate' is set to a true
        value, page bodies are written verbatim instead of being translated.
        """

        # Properties and collections of the object currently being read.

        self.content = {}

        # Elements of the collection currently being read.

        self.elements = []

        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. The type is taken from the "class"
        attribute of the current element (the last entry in 'attributes') and
        the object's identifier from its 'text'. The remaining parameters are
        supplied by the parser and are not used here.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = content["title"][:MAX_TITLE_LENGTH]

            if title:
                title = "%s/%s" % (self.space, title)
                write(join(pages_dir, pageid, "pagetitle"), title)

            # See sort_manifest for access to this data.

            append(join(pages_dir, pageid, "manifest"),
                "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                title, # comment titles will incorporate the comment's position
                content["lastModifierName"],
                content["versionComment"]
                ))

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Add creation details for comments to the owner page.
            # Since comments can be versioned, the date of the original version
            # is used, and only this "original" version has the owner property.

            if objecttype == "Comment" and "owner" in content:
                ownerid = content["owner"]
                mkdirs(join(pages_dir, ownerid))
                append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # NOTE: Very simple technique employed for guessing the format.
            # NOTE: The handler's own setting is consulted here rather than any
            # NOTE: module-level name, so that the class also works when it is
            # NOTE: used from another program.

            if self.no_translate:
                fn = write
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            # Report the offending body before propagating any error.

            try:
                fn(join(versions_dir, content["content"]), body)
            except:
                print >>sys.stderr, "Error parsing..."
                print >>sys.stderr, body
                raise

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        attribute of the current element as the key and the stripped 'text' as
        the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text' to
        the penultimate entry in 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary under the "name"
        attribute of the current element, and start a new, empty collection.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection, each being recorded as a tuple
        of the element's "class" attribute and its stripped 'text'.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0 | 180 | |
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. Where the directory
    already exists, nothing is done. Any other failure to create the directory
    (for example, insufficient permissions or a path component being a plain
    file) is raised as an OSError instead of being silently ignored.
    """

    # Imported here since only this function needs it.

    from os.path import isdir

    try:
        makedirs(name)
    except OSError:

        # Only an existing directory is an acceptable reason for failure.

        if not isdir(name):
            raise
paul@0 | 189 | |
def append(filename, s):

    """
    Add the string 's' to the end of the file with the given 'filename',
    delegating to 'write' with appending enabled.
    """

    write(filename, s, append=True)
paul@0 | 195 | |
def write(filename, s, append=False):

    """
    Store the string 's' in the file with the given 'filename' using the UTF-8
    encoding. Where the optional 'append' parameter is set to a true value, 's'
    is added to the end of the file; otherwise, any existing content is
    replaced.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    target = codecs.open(filename, mode, encoding="utf-8")

    # Make sure that the file is closed even if writing fails.

    try:
        target.write(s)
    finally:
        target.close()
paul@0 | 208 | |
def read(filename):

    """
    Return a string containing the entire contents of the file with the given
    'filename', decoded using the UTF-8 encoding.
    """

    source = codecs.open(filename, "r", encoding="utf-8")

    # Make sure that the file is closed even if reading fails.

    try:
        text = source.read()
    finally:
        source.close()

    return text
paul@3 | 221 | |
def translate(filename, body, fn=None):

    """
    Translate the given 'body', writing the result to the file with the given
    'filename' using the UTF-8 encoding. The optional 'fn' is the function
    performing the translation, called with the body and the open output file;
    where it is omitted, parser.parse is used.
    """

    # Choose the translation function before creating the output file.

    convert = fn
    if not convert:
        convert = parser.parse

    target = codecs.open(filename, "w", encoding="utf-8")

    # Make sure that the file is closed even if translation fails.

    try:
        convert(body, target)
    finally:
        target.close()
paul@11 | 236 | |
def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', using parser.xmlparse as the translation function.
    """

    translate(filename, body, fn=parser.xmlparse)
paul@25 | 239 | |
def sort_comments(pages_dir, pageid):

    """
    For the page with the given 'pageid' found within 'pages_dir', order any
    comments recorded in the page's "comments" file chronologically, and give
    each comment page a title made from the owner page's title and the
    comment's position in that ordering. The title is recorded in a "pagetitle"
    file in each comment page's directory.

    Nothing is done for pages without a "comments" file.
    """

    page_dir = join(pages_dir, pageid)
    comments_file = join(page_dir, "comments")

    if exists(comments_file):
        owner_title = read(join(page_dir, "pagetitle"))

        # Each record holds a creation date and a comment page identifier, and
        # so sorting the records orders them by date.

        records = sorted([
            record.split("|")
            for record in read(comments_file).split("\n")
            if record
            ])

        # Write the sorted comments list for testing purposes.

        write(comments_file, "\n".join(["|".join(x) for x in records]))

        # Define comments as subpages by setting their titles using this
        # page's name/title and their position in the comments collection.
        # The title is written to a special file in the page directory of each
        # comment for later processing.

        position = 0

        for _created, commentid in records:
            comment_title = "%s/%04d" % (owner_title, position)
            write(join(pages_dir, commentid, "pagetitle"), comment_title)
            position += 1
paul@31 | 272 | |
def sort_manifest(pages_dir, pageid, output=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page within
    'pages_dir' and any page title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order. The version column is omitted from the
    resulting manifest lines.

    If a "pagetitle" file exists, the title column in the manifest will be
    replaced with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If a "comments" file exists and the page has a title, an inclusion of the
    comment pages will be added to the end of each revision's content.

    If 'no_translate' is set to a true value, no revision content is modified.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.
    """

    manifest = join(pages_dir, pageid, "manifest")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Read the manifest, closing it before any other files are written.

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order the revisions numerically by version.

    lines.sort(key=lambda x: int(x[0]))

    # Prepare the child page information once, since it is the same for every
    # revision of the page.

    if exists(children) and not no_translate:
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()
        child_pages = [" * [[%s]]" % x for x in child_page_names]
        child_pages_text = child_page_section % "\n".join(child_pages)
    else:
        child_pages_text = None

    # Prepare the comments inclusion once for the same reason.

    if exists(comments) and title and not no_translate:
        comments_text = comment_section % title
    else:
        comments_text = None

    # Reconstruct the lines, optionally changing the titles.

    result = []

    for line in lines:
        version, _addrevision, filename, old_title, username, comment = line

        # Replace title information with the information already present.

        if title is not None:
            new_title = title
        else:
            new_title = old_title

        # The version is omitted now that the manifest is ordered. The comment
        # retains the line ending from the original manifest line.

        line = _addrevision, filename, new_title, username, comment
        result.append("|".join(line))

        # Add child page information to the content.

        if child_pages_text is not None:
            append(filename, child_pages_text)

        # Add comments to the content.

        if comments_text is not None:
            append(filename, comments_text)

    s = "".join(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)
paul@3 | 354 | |
# Template for child page information: a horizontal rule followed by the list
# of child pages.

child_page_section = (
    "\n"
    "----\n"
    "\n"
    "%s\n"
    )

# Template for comments: a horizontal rule followed by an inclusion of all
# pages found beneath the given page title.

comment_section = (
    "\n"
    "----\n"
    "\n"
    "<<Include(\"^%s/\")>>\n"
    )
paul@32 | 370 | |
paul@28 | 371 | # Main program. |
paul@28 | 372 | |
if __name__ == "__main__":

    # Obtain the dump filename and the workspace name from the command line.

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    # Refuse to overwrite the results of any previous conversion.

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is used since the file may be a ZIP
    # archive, whose contents must not be subject to newline translation.

    f = open(filename, "rb")

    # Parse the data, making sure that the file is closed even if the archive
    # cannot be read.

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    # Comment titles must be established before the manifests are finalised,
    # since the manifests incorporate the titles.

    for pageid in listdir(pages_dir):
        sort_comments(pages_dir, pageid)

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        sort_manifest(pages_dir, pageid, output_manifest, no_translate)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()
paul@3 | 459 | |
paul@0 | 460 | # vim: tabstop=4 expandtab shiftwidth=4 |