ConfluenceConverter (file convert.py at 484cacbd9436)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import chdir, getcwd, listdir, mkdir, makedirs, walk    25 from os.path import exists, extsep, join, split, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 from MoinMoin import wikiutil    29 import codecs    30 import xmlread    31 import wikiparser, xmlparser    32 import sys    33 import time, calendar    34     35 from common import get_page_title    36     37 def date_to_seconds(s):    38     return calendar.timegm(time.strptime(s.split(".", 1)[0], "%Y-%m-%d %H:%M:%S"))    39     40 class ConfluenceHandler:    41     42     "Handle content from a Confluence Wiki dump."    43     44     def __init__(self, space, no_translate=False):    45         self.content = {}    46         self.elements = []    47         self.space = space    48         self.no_translate = no_translate    49     50     def handle_object(self, name, elements, attributes, all_text, text):    51     52         """    53         Handle objects according to type. Objects appear as follows:    54     55         <object class="Page" package="...">    56         <id name="id">...</id>    57         ...    58         </object>    59     60         Within objects, one finds things like properties and collections, which    61         are handled by their own methods but which are stored in the content    62         dictionary associated with the current object.    63     64         By the time this method is called, the contents of the object will have    65         been gathered and the properties and collections populated in the    66         content dictionary. Any identifier will have been assigned to the    67         textual content of the object element and will be available in the    68         'text' parameter.    69         """    70     71         objecttype = attributes[-1]["class"]    72     73         # Any identifier is stored as the object's textual content.    74     75         identifier = text.strip()    76     77         # The content is a dictionary mapping names to properties and    78         # collections.    79     80         content = self.content    81     82         pages_dir = join(self.space, "pages")    83         versions_dir = join(self.space, "versions")    84     85         # Handle particular types.    86     87         if objecttype in ("Page", "Comment", "BlogPost"):    88     89             # Handle pages and revisions, adding revisions to the page manifest.    90             # The original version is used as a unifying identifier for all the    91             # different revisions (each of which being defined by a Page    92             # element). Although "original" implies the first identifier used,    93             # it actually appears to be the latest and will have the highest    94             # version number.    95     96             if content.has_key("originalVersion"):    97                 pageid = content["originalVersion"]    98             else:    99                 pageid = identifier   100    101             versionfile = join(versions_dir, identifier)   102    103             # Note page metadata, not necessarily in the correct order.   104             # For comments, the title will need to be rewritten, since they   105             # should be defined in terms of their owner page.   106    107             # NOTE: This only makes the current title available to comments.   108    109             mkdirs(join(pages_dir, pageid))   110    111             title = content["title"]   112    113             # Limit the title to a "safe" number of characters in order to avoid   114             # filesystem issues.   115    116             title = get_page_title(title)   117    118             if title:   119                 title = "%s/%s" % (self.space, title)   120                 write(join(pages_dir, pageid, "pagetitle"), title)   121    122             # Note the type of the page.   123    124             write(join(pages_dir, pageid, "pagetype"), objecttype)   125    126             # See sort_manifest for access to this data.   127    128             append(join(pages_dir, pageid, "manifest"),   129                 "%s|AddRevision|_|%s|%s|%s|%s|%d\n" % ( # blank added for consistency with AddAttachment   130                     content["version"],   131                     versionfile,   132                     title, # comment titles will incorporate the comment's position   133                     content["lastModifierName"],   134                     content["versionComment"],   135                     date_to_seconds(content["lastModificationDate"])   136                 ))   137    138             # Add information to parent pages for child page lists.   139    140             if content.has_key("parent"):   141                 parentid = content["parent"]   142                 mkdirs(join(pages_dir, parentid))   143                 append(join(pages_dir, parentid, "children"), title + "\n")   144    145             # Add creation details for comments to the owner page.   146             # Since comments can be versioned, the date of the original version   147             # is used, and only this "original" version has the owner property.   148    149             if objecttype == "Comment" and content.has_key("owner"):   150                 ownerid = content["owner"]   151                 mkdirs(join(pages_dir, ownerid))   152                 append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))   153    154             # Some metadata is not particularly relevant. For example,   155             # ancestors, children, parent are navigation-related.   156    157             # Other metadata could be added to the page content itself.   158             # For example, labelling could be converted to categories.   159    160         # Handle revisions.   161    162         elif objecttype == "BodyContent":   163             body = content["body"]   164             if not body:   165                 body = "## Empty page."   166    167             # NOTE: Very simple technique employed for guessing the format.   168    169             if no_translate:   170                 fn = write   171             elif body.startswith("<"):   172                 fn = xmltranslate   173             else:   174                 fn = translate   175    176             try:   177                 fn(join(versions_dir, content["content"]), body)   178             except:   179                 err = codecs.getwriter("utf-8")(sys.stderr)   180                 print >>err, "Error parsing", content["content"]   181                 raise   182    183         # Handle attachments.   184    185         elif objecttype == "Attachment":   186             pageid = content["content"]   187             version = content["attachmentVersion"]   188    189             if content.has_key("originalVersion"):   190                 attachid = content["originalVersion"]   191             else:   192                 attachid = identifier   193    194             append(join(pages_dir, pageid, "attachments"),   195                 "%s|AddAttachment|%s|%s|%s|%s|%s|%d\n" % (   196                     version,   197                     # Have to "taint" archive filenames, although Moin will   198                     # probably handle package script filename tainting.   199                     wikiutil.taintfilename(join("attachments", pageid, attachid, version)),   200                     wikiutil.taintfilename(content["fileName"]),   201                     "", # pagename is substituted later   202                     content["lastModifierName"],   203                     content["comment"],   204                     date_to_seconds(content["lastModificationDate"])   205                 ))   206    207         self.content = {}   208    209     def handle_property(self, name, elements, attributes, all_text, text):   210    211         "Record properties in the current content dictionary."   212    213         self.content[attributes[-1]["name"]] = text.strip()   214    215     def handle_id(self, name, elements, attributes, all_text, text):   216    217         "Promote identifiers to the parent element's text."   218    219         all_text[-2].append(text)   220    221     def handle_collection(self, name, elements, attributes, all_text, text):   222    223         "Record collections in the current content dictionary."   224    225         self.content[attributes[-1]["name"]] = self.elements   226         self.elements = []   227    228     def handle_element(self, name, elements, attributes, all_text, text):   229    230         "Add elements to the current collection."   231    232         self.elements.append((attributes[-1]["class"], text.strip()))   233    234 def mkdirs(name):   235    236     "Make the directory with the given 'name' at any depth."   237    238     try:   239         makedirs(name)   240     except OSError:   241         pass   242    243 def append(filename, s):   244    245     "Append to the file with the given 'filename' the string 's'."   246    247     write(filename, s, True)   248    249 def write(filename, s, append=False):   250    251     """   252     Write to the file with the given 'filename' the string 's'. If the optional   253     'append' parameter is set to a true value, 's' will be appended to the file.   254     """   255    256     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   257     try:   258         f.write(s)   259     finally:   260         f.close()   261    262 def read(filename):   263    264     """   265     Read from the file with the given 'filename', returning a string containing   266     its contents.   267     """   268    269     f = codecs.open(filename, encoding="utf-8")   270     try:   271         return f.read()   272     finally:   273         f.close()   274    275 def translate(filename, body, fn=None):   276    277     """   278     Write to the file with the given 'filename' a translation of the given   279     'body'.   280     """   281    282     fn = fn or wikiparser.parse   283    284     out = codecs.open(filename, "w", encoding="utf-8")   285     try:   286         print >>out, "#pragma page-filename", filename   287         fn(body, out)   288     finally:   289         out.close()   290    291 def xmltranslate(filename, body):   292     translate(filename, body, xmlparser.parse)   293    294 def sort_comments(pages_dir, pageid):   295    296     """   297     Where 'pageid' has comments associated with it, sort them chronologically   298     and label the comment pages with the owner page's title and comment's   299     position in the chronological sequence. Such labelling is done by writing   300     a "pagetitle" file in each comment page's directory.   301     """   302    303     comments = join(pages_dir, pageid, "comments")   304    305     if not exists(comments):   306         return   307    308     title = read(join(pages_dir, pageid, "pagetitle"))   309    310     details = [line.split("|") for line in read(comments).split("\n") if line]   311     details.sort()   312    313     # Write the sorted comments list for testing purposes.   314    315     write(comments, "\n".join(["|".join(x) for x in details]))   316    317     # Define comments as subpages by setting their titles using this   318     # page's name/title and their position in the comments collection.   319    320     for position, (_lastmodified, commentid) in enumerate(details):   321    322         # In the page directory for each comment, write the page title in a   323         # special file for later processing.   324    325         write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))   326    327 def _manifest_to_mapping(manifest, output_mapping):   328    329     """   330     Open the given 'manifest' and write a mapping from version identifiers to   331     page names/titles to the file with the given 'output_mapping' filename.   332     """   333    334     f = codecs.open(manifest, "r", encoding="utf-8")   335     try:   336         mapping = []   337    338         lines = [x.split("|") for x in f.readlines()]   339         for line in lines:   340             version, _action, _archive_filename, filename, title, username, comment, mtime = line   341             if title:   342                 mapping.append((split(filename)[-1], title))   343    344         append(output_mapping, "".join([("\t".join(x) + "\n") for x in mapping]))   345    346     finally:   347         f.close()   348    349 def _sort_manifest(manifest, title):   350    351     """   352     Open the given 'manifest' and sort it according to revision so that it will   353     be added to MoinMoin in the correct order.   354    355     If a 'title' is provided, the title column in the manifest will be augmented   356     with that information. This is typically done for comments and is necessary   357     for attachments.   358    359     A list of manifest entries is returned.   360     """   361    362     f = codecs.open(manifest, "r", encoding="utf-8")   363     try:   364         lines = [x.rstrip("\n").split("|") for x in f.readlines()]   365         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   366    367         # Reconstruct the lines, optionally changing the titles.   368    369         result = []   370    371         for line in lines:   372             version, _action, _archive_filename, filename, old_title, username, comment, mtime = line   373    374             # Replace title information with the information already present.   375    376             if not old_title:   377                 new_title = title   378             else:   379                 new_title = old_title   380    381             # The version is omitted now that the manifest is ordered.   382    383             line = _action, _archive_filename, filename, new_title, username, comment, mtime   384             result.append(line)   385    386         return result   387    388     finally:   389         f.close()   390    391 def serialise_manifest(manifest):   392    393     """   394     Process the 'manifest' consisting of entries, removing superfluous columns.   395     """   396    397     result = []   398    399     for columns in manifest:   400         action = columns[0]   401         if action == "AddRevision":   402             columns = list(columns)   403             del columns[1]   404         result.append("|".join(columns) + "\n")   405    406     return "".join(result)   407                408 def sort_manifest(pages_dir, pageid, output_mapping=None, no_translate=False):   409    410     """   411     Using the given 'pageid', locate the manifest for the page and any page   412     title information written to a "pagetitle" file.   413    414     Then sort the manifest according to revision so that historical operations   415     such as page renaming can be detected.   416    417     If a "pagetitle" file exists, the title column in the manifest will be   418     augmented with the contents of that file. This is typically done for   419     comments.   420    421     If a "children" file exists, the pages in that file will be added as a list   422     to the end of each revision's content.   423    424     If 'output_mapping' is given, a mapping from version identifiers to page   425     titles will be appended to the file having that filename.   426     """   427    428     pagetype = join(pages_dir, pageid, "pagetype")   429     manifest = join(pages_dir, pageid, "manifest")   430     attachments = join(pages_dir, pageid, "attachments")   431     pagetitle = join(pages_dir, pageid, "pagetitle")   432     children = join(pages_dir, pageid, "children")   433     comments = join(pages_dir, pageid, "comments")   434    435     type = exists(pagetype) and read(pagetype) or None   436    437     if exists(pagetitle):   438         title = read(pagetitle)   439         space, _page_name = get_space_and_name(title)   440     else:   441         title = space = None   442    443     # Sort the revision manifest.   444    445     result = _sort_manifest(manifest, title)   446    447     # Output a mapping of identifiers to page names.   448    449     if output_mapping:   450         _manifest_to_mapping(manifest, output_mapping)   451    452     # Modify the content to include child pages and comments.   453    454     last_title = None   455     final_result = []   456    457     for details in result:   458         _action, _archive_filename, filename, new_title, username, comment, mtime = details   459    460         # Detect renamed pages and add a redirect revision.   461    462         if last_title and last_title != new_title and _action == "AddRevision":   463             renaming_versionfile = filename + ".rename"   464             final_result.append((_action, "_", renaming_versionfile, last_title, username, "Page renamed to %s" % new_title, mtime))   465             write(renaming_versionfile, "#REDIRECT %s" % new_title)   466    467         last_title = new_title   468    469         # Add this revision to the manifest.   470    471         final_result.append(details)   472    473         # Obtain the text only if modifications are to be made.   474    475         text = None   476    477         # Add an ACL to comment pages so that people cannot change other   478         # people's comments.   479         # NOTE: This should match the PostComment action.   480    481         if type == "Comment":   482             text = """\   483 #acl %s:read,write,delete,revert All:read   484 #pragma comment-owner %s   485 %s""" % (username, username, text or read(filename))   486    487         # Add child page information to the content.   488    489         if exists(children) and not no_translate:   490             child_pages = []   491             child_page_names = [x for x in read(children).split("\n") if x]   492             child_page_names.sort()   493    494             # Produce links which hide the space prefix.   495    496             for child_page_name in child_page_names:   497                 child_space, page_name = get_space_and_name(child_page_name)   498                 if child_space == space:   499                     child_page_label = page_name   500                 else:   501                     child_page_label = child_page_name   502    503                 child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))   504    505             text = (text or read(filename)) + child_page_section % "\n".join(child_pages)   506    507         # Add comments to the content.   508    509         if exists(comments) and title and not no_translate:   510             text = (text or read(filename)) + comment_section   511    512         # Rewrite the file if necessary.   513    514         if text:   515             write(filename, text)   516    517     # Add the attachments to the manifest.   518    519     if exists(attachments):   520         final_result += _sort_manifest(attachments, title)   521    522     return final_result   523    524 def sort_final_manifest(entries, output):   525    526     """   527     Sort the manifest 'entries' by last modified time and serialise it.   528     The manifest details will be appended to the file named by 'output'.   529     """   530    531     # The final entry in each element is the mtime.   532    533     entries.sort(cmp=lambda x, y: cmp(int(x[-1]), int(y[-1])))   534    535     # Serialise the manifest.   536    537     s = serialise_manifest(entries)   538     append(output, s)   539    540 def get_space_and_name(page_name):   541     try:   542         return page_name.split("/", 1)   543     except IndexError:   544         return None, page_name   545    546 # Template for child page information.   547    548 child_page_section = """   549 ----   550    551 %s   552 """   553    554 # Template for comments.   555    556 comment_section = """   557 ----   558    559 <<IncludeComments>>   560 """   561    562 # Main program.   563    564 if __name__ == "__main__":   565     try:   566         filename = sys.argv[1]   567         is_zipfile = splitext(filename)[-1] == extsep + "zip"   568         space = sys.argv[2]   569         if len(sys.argv) > 3 and sys.argv[3]:   570             attachments = sys.argv[3]   571         else:   572             attachments = None   573     except IndexError:   574         print >>sys.stderr, """   575 Please specify an XML file containing Wiki data, a workspace name, and an   576 optional attachments directory location. For example:   577    578 %(progname)s com_entities.xml COM attachments   579    580 Adding --no-translate will unpack the Wiki but not translate the content.   581 When doing so without an attachments directory, add an empty argument as   582 follows:   583    584 %(progname)s com_entities.xml COM '' --no-translate   585    586 An archive can be used instead of the XML file, and since this may include   587 attachments, no additional attachments directory needs to be specified:   588    589 %(progname)s COM-123456-789012.zip COM   590 """ % {"progname" : split(sys.argv[0])[-1]}   591    592         sys.exit(1)   593    594     no_translate = "--no-translate" in sys.argv   595    596     if exists(space):   597         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   598         sys.exit(1)   599    600     package_zip = space + extsep + "zip"   601    602     if exists(package_zip):   603         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   604         sys.exit(1)   605    606     mkdir(space)   607     mkdirs(join(space, "pages"))   608     mkdirs(join(space, "versions"))   609    610     p = xmlread.ConfigurableParser()   611     handler = ConfluenceHandler(space, no_translate)   612    613     # Register handlers in the parser for different elements.   614    615     p["object"] = handler.handle_object   616     p["property"] = handler.handle_property   617     p["id"] = handler.handle_id   618     p["collection"] = handler.handle_collection   619     p["element"] = handler.handle_element   620    621     # Open the XML dump.   622    623     f = open(filename)   624    625     if is_zipfile:   626         zf = ZipFile(f)   627         ff = StringIO(zf.read("entities.xml"))   628     else:   629         ff = f   630    631     # Parse the data.   632    633     try:   634         p.parse(ff)   635    636         # Tidy up the import manifests, sorting each of them by revision and   637         # finalising them.   638    639         pages_dir = join(space, "pages")   640    641         for pageid in listdir(pages_dir):   642             sort_comments(pages_dir, pageid)   643    644         output_mapping = join(space, "MAPPING")   645    646         output_manifest = join(space, "MOIN_PACKAGE")   647         append(output_manifest, "MoinMoinPackage|1\n")   648    649         entries = []   650    651         for pageid in listdir(pages_dir):   652             entries += sort_manifest(pages_dir, pageid, output_mapping, no_translate)   653    654         sort_final_manifest(entries, output_manifest)   655    656         # Write the page package.   657    658         page_package = ZipFile(package_zip, "w")   659    660         try:   661             # Include the page revisions.   662    663             versions_dir = join(space, "versions")   664    665             for versionid in listdir(versions_dir):   666                 page_package.write(join(versions_dir, versionid))   667    668             # Include the attachments.   669    670             if attachments:   671                 cwd = getcwd()   672                 chdir(split(attachments)[0])   673                 try:   674                     for path, dirnames, filenames in walk(split(attachments)[1]):   675                         for filename in filenames:   676                             # Have to "taint" archive filenames.   677                             page_package.write(join(path, filename), wikiutil.taintfilename(join(path, filename)))   678                 finally:   679                     chdir(cwd)   680             elif is_zipfile:   681                 for filename in zf.namelist():   682                     if filename.startswith("attachments"):   683                         # Have to "taint" archive filenames.   684                         page_package.writestr(wikiutil.taintfilename(filename), zf.read(filename))   685    686             # Include only the top-level manifest.   687    688             page_package.write(output_manifest, "MOIN_PACKAGE")   689    690         finally:   691             page_package.close()   692    693     finally:   694         f.close()   695    696 # vim: tabstop=4 expandtab shiftwidth=4