ConfluenceConverter (file convert.py at 92a814314fda)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import chdir, getcwd, listdir, mkdir, makedirs, walk    25 from os.path import exists, extsep, join, split, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 from MoinMoin import wikiutil    29 import codecs    30 import xmlread    31 import wikiparser, xmlparser    32 import sys    33 import time, calendar    34     35 from common import get_page_title    36     37 def date_to_seconds(s):    38     return calendar.timegm(time.strptime(s.split(".", 1)[0], "%Y-%m-%d %H:%M:%S"))    39     40 class ConfluenceHandler:    41     42     "Handle content from a Confluence Wiki dump."    43     44     def __init__(self, space, no_translate=False):    45         self.content = {}    46         self.elements = []    47         self.space = space    48         self.no_translate = no_translate    49     50     def handle_object(self, name, elements, attributes, all_text, text):    51     52         """    53         Handle objects according to type. Objects appear as follows:    54     55         <object class="Page" package="...">    56         <id name="id">...</id>    57         ...    58         </object>    59     60         Within objects, one finds things like properties and collections, which    61         are handled by their own methods but which are stored in the content    62         dictionary associated with the current object.    63     64         By the time this method is called, the contents of the object will have    65         been gathered and the properties and collections populated in the    66         content dictionary. Any identifier will have been assigned to the    67         textual content of the object element and will be available in the    68         'text' parameter.    69         """    70     71         objecttype = attributes[-1]["class"]    72     73         # Any identifier is stored as the object's textual content.    74     75         identifier = text.strip()    76     77         # The content is a dictionary mapping names to properties and    78         # collections.    79     80         content = self.content    81     82         pages_dir = join(self.space, "pages")    83         versions_dir = join(self.space, "versions")    84     85         # Handle particular types.    86     87         if objecttype in ("Page", "Comment", "BlogPost"):    88     89             # Handle pages and revisions, adding revisions to the page manifest.    90             # The original version is used as a unifying identifier for all the    91             # different revisions (each of which being defined by a Page    92             # element). Although "original" implies the first identifier used,    93             # it actually appears to be the latest and will have the highest    94             # version number.    95     96             if content.has_key("originalVersion"):    97                 pageid = content["originalVersion"]    98             else:    99                 pageid = identifier   100    101             versionfile = join(versions_dir, identifier)   102    103             # Note page metadata, not necessarily in the correct order.   104             # For comments, the title will need to be rewritten, since they   105             # should be defined in terms of their owner page.   106    107             # NOTE: This only makes the current title available to comments.   108    109             mkdirs(join(pages_dir, pageid))   110    111             title = content["title"]   112    113             # Limit the title to a "safe" number of characters in order to avoid   114             # filesystem issues.   115    116             title = get_page_title(title)   117    118             if title:   119                 title = "%s/%s" % (self.space, title)   120                 write(join(pages_dir, pageid, "pagetitle"), title)   121    122             # Note the type of the page.   123    124             write(join(pages_dir, pageid, "pagetype"), objecttype)   125    126             # See sort_manifest for access to this data.   127    128             append(join(pages_dir, pageid, "manifest"),   129                 "%s|AddRevision|_|%s|%s|%s|%s|%d\n" % ( # blank added for consistency with AddAttachment   130                     content["version"],   131                     versionfile,   132                     title, # comment titles will incorporate the comment's position   133                     content["lastModifierName"],   134                     content["versionComment"],   135                     date_to_seconds(content["lastModificationDate"])   136                 ))   137    138             # Add information to parent pages for child page lists.   139    140             if content.has_key("parent"):   141                 parentid = content["parent"]   142                 mkdirs(join(pages_dir, parentid))   143                 append(join(pages_dir, parentid, "children"), title + "\n")   144    145             # Add creation details for comments to the owner page.   146             # Since comments can be versioned, the date of the original version   147             # is used, and only this "original" version has the owner property.   148    149             if objecttype == "Comment" and content.has_key("owner"):   150                 ownerid = content["owner"]   151                 mkdirs(join(pages_dir, ownerid))   152                 append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))   153    154             # Some metadata is not particularly relevant. For example,   155             # ancestors, children, parent are navigation-related.   156    157             # Other metadata could be added to the page content itself.   158             # For example, labelling could be converted to categories.   159    160         # Handle revisions.   161    162         elif objecttype == "BodyContent":   163             body = content["body"]   164             if not body:   165                 body = "## Empty page."   166    167             # NOTE: Very simple technique employed for guessing the format.   168    169             if no_translate:   170                 fn = write   171             elif body.startswith("<"):   172                 fn = xmltranslate   173             else:   174                 fn = translate   175    176             try:   177                 fn(join(versions_dir, content["content"]), body)   178             except:   179                 err = codecs.getwriter("utf-8")(sys.stderr)   180                 print >>err, "Error parsing", content["content"]   181                 raise   182    183         # Handle attachments.   184    185         elif objecttype == "Attachment":   186             pageid = content["content"]   187             version = content["attachmentVersion"]   188    189             if content.has_key("originalVersion"):   190                 attachid = content["originalVersion"]   191             else:   192                 attachid = identifier   193    194             append(join(pages_dir, pageid, "attachments"),   195                 "%s|AddAttachment|%s|%s|%s|%s|%s|%d\n" % (   196                     version,   197                     # Have to "taint" archive filenames, although Moin will   198                     # probably handle package script filename tainting.   199                     wikiutil.taintfilename(join("attachments", pageid, attachid, version)),   200                     wikiutil.taintfilename(content["fileName"]),   201                     "", # pagename is substituted later   202                     content["lastModifierName"],   203                     content["comment"],   204                     date_to_seconds(content["lastModificationDate"])   205                 ))   206    207         self.content = {}   208    209     def handle_property(self, name, elements, attributes, all_text, text):   210    211         "Record properties in the current content dictionary."   212    213         self.content[attributes[-1]["name"]] = text.strip()   214    215     def handle_id(self, name, elements, attributes, all_text, text):   216    217         "Promote identifiers to the parent element's text."   218    219         all_text[-2].append(text)   220    221     def handle_collection(self, name, elements, attributes, all_text, text):   222    223         "Record collections in the current content dictionary."   224    225         self.content[attributes[-1]["name"]] = self.elements   226         self.elements = []   227    228     def handle_element(self, name, elements, attributes, all_text, text):   229    230         "Add elements to the current collection."   231    232         self.elements.append((attributes[-1]["class"], text.strip()))   233    234 def mkdirs(name):   235    236     "Make the directory with the given 'name' at any depth."   237    238     try:   239         makedirs(name)   240     except OSError:   241         pass   242    243 def append(filename, s):   244    245     "Append to the file with the given 'filename' the string 's'."   246    247     write(filename, s, True)   248    249 def write(filename, s, append=False):   250    251     """   252     Write to the file with the given 'filename' the string 's'. If the optional   253     'append' parameter is set to a true value, 's' will be appended to the file.   254     """   255    256     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   257     try:   258         f.write(s)   259     finally:   260         f.close()   261    262 def read(filename):   263    264     """   265     Read from the file with the given 'filename', returning a string containing   266     its contents.   267     """   268    269     f = codecs.open(filename, encoding="utf-8")   270     try:   271         return f.read()   272     finally:   273         f.close()   274    275 def translate(filename, body, fn=None):   276    277     """   278     Write to the file with the given 'filename' a translation of the given   279     'body'.   280     """   281    282     fn = fn or wikiparser.parse   283    284     out = codecs.open(filename, "w", encoding="utf-8")   285     try:   286         print >>out, "#pragma page-filename", filename   287         fn(body, out)   288     finally:   289         out.close()   290    291 def xmltranslate(filename, body):   292     translate(filename, body, xmlparser.parse)   293    294 def sort_comments(pages_dir, pageid):   295    296     """   297     Where 'pageid' has comments associated with it, sort them chronologically   298     and label the comment pages with the owner page's title and comment's   299     position in the chronological sequence. Such labelling is done by writing   300     a "pagetitle" file in each comment page's directory.   301     """   302    303     comments = join(pages_dir, pageid, "comments")   304    305     if not exists(comments):   306         return   307    308     title = read(join(pages_dir, pageid, "pagetitle"))   309    310     details = [line.split("|") for line in read(comments).split("\n") if line]   311     details.sort()   312    313     # Write the sorted comments list for testing purposes.   314    315     write(comments, "\n".join(["|".join(x) for x in details]))   316    317     # Define comments as subpages by setting their titles using this   318     # page's name/title and their position in the comments collection.   319    320     for position, (_lastmodified, commentid) in enumerate(details):   321    322         # In the page directory for each comment, write the page title in a   323         # special file for later processing.   324    325         write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))   326    327 def _manifest_to_mapping(manifest, output_mapping):   328    329     """   330     Open the given 'manifest' and write a mapping from version identifiers to   331     page names/titles to the file with the given 'output_mapping' filename.   332     """   333    334     f = codecs.open(manifest, "r", encoding="utf-8")   335     try:   336         mapping = []   337    338         lines = [x.split("|") for x in f.readlines()]   339         for line in lines:   340             version, _action, _archive_filename, filename, title, username, comment, mtime = line   341             if title:   342                 mapping.append((split(filename)[-1], title))   343    344         append(output_mapping, "".join([("\t".join(x) + "\n") for x in mapping]))   345    346     finally:   347         f.close()   348    349 def _sort_manifest(manifest, title):   350    351     """   352     Open the given 'manifest' and sort it according to revision so that it will   353     be added to MoinMoin in the correct order.   354    355     If a 'title' is provided, the title column in the manifest will be augmented   356     with that information. This is typically done for comments and is necessary   357     for attachments.   358    359     A list of manifest entries is returned.   360     """   361    362     f = codecs.open(manifest, "r", encoding="utf-8")   363     try:   364         lines = [x.rstrip("\n").split("|") for x in f.readlines()]   365         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   366    367         # Reconstruct the lines, optionally changing the titles.   368    369         result = []   370    371         for line in lines:   372             version, _action, _archive_filename, filename, old_title, username, comment, mtime = line   373    374             # Replace title information with the information already present.   375    376             if not old_title:   377                 new_title = title   378             else:   379                 new_title = old_title   380    381             # The version is omitted now that the manifest is ordered.   382    383             line = _action, _archive_filename, filename, new_title, username, comment, mtime   384             result.append(line)   385    386         return result   387    388     finally:   389         f.close()   390    391 def serialise_manifest(manifest):   392    393     """   394     Process the 'manifest' consisting of entries, removing superfluous columns.   395     """   396    397     result = []   398    399     for columns in manifest:   400         action = columns[0]   401         if action == "AddRevision":   402             columns = list(columns)   403             del columns[1]   404         result.append("|".join(columns) + "\n")   405    406     return "".join(result)   407                408 def sort_manifest(pages_dir, pageid, output_mapping=None, no_translate=False):   409    410     """   411     Using the given 'pageid', locate the manifest for the page and any page   412     title information written to a "pagetitle" file.   413    414     Then sort the manifest according to revision so that historical operations   415     such as page renaming can be detected.   416    417     If a "pagetitle" file exists, the title column in the manifest will be   418     augmented with the contents of that file. This is typically done for   419     comments.   420    421     If a "children" file exists, the pages in that file will be added as a list   422     to the end of each revision's content.   423    424     If 'output_mapping' is given, a mapping from version identifiers to page   425     titles will be appended to the file having that filename.   426     """   427    428     pagetype = join(pages_dir, pageid, "pagetype")   429     manifest = join(pages_dir, pageid, "manifest")   430     attachments = join(pages_dir, pageid, "attachments")   431     pagetitle = join(pages_dir, pageid, "pagetitle")   432     children = join(pages_dir, pageid, "children")   433     comments = join(pages_dir, pageid, "comments")   434    435     type = exists(pagetype) and read(pagetype) or None   436    437     if exists(pagetitle):   438         title = read(pagetitle)   439         space, _page_name = get_space_and_name(title)   440     else:   441         title = space = None   442    443     # Sort the revision manifest.   444    445     result = _sort_manifest(manifest, title)   446    447     # Output a mapping of identifiers to page names.   448    449     if output_mapping:   450         _manifest_to_mapping(manifest, output_mapping)   451    452     # Modify the content to include child pages and comments.   453    454     last_title = None   455     final_result = []   456    457     for details in result:   458         _action, _archive_filename, filename, new_title, username, comment, mtime = details   459    460         # Detect renamed pages and add a redirect revision.   461    462         if last_title and last_title != new_title and _action == "AddRevision":   463             renaming_versionfile = filename + ".rename"   464             final_result.append((_action, "_", renaming_versionfile, last_title, username, "Page renamed to %s" % new_title, mtime))   465             write(renaming_versionfile, "#REDIRECT %s" % new_title)   466    467         last_title = new_title   468    469         # Add this revision to the manifest.   470    471         final_result.append(details)   472    473         # Obtain the text only if modifications are to be made.   474    475         text = None   476    477         # Add an ACL to comment pages so that people cannot change other   478         # people's comments.   479    480         if type == "Comment":   481             text = "#acl %s:read,write,delete,revert All:read\n%s" % (username, text or read(filename))   482    483         # Add child page information to the content.   484    485         if exists(children) and not no_translate:   486             child_pages = []   487             child_page_names = [x for x in read(children).split("\n") if x]   488             child_page_names.sort()   489    490             # Produce links which hide the space prefix.   491    492             for child_page_name in child_page_names:   493                 child_space, page_name = get_space_and_name(child_page_name)   494                 if child_space == space:   495                     child_page_label = page_name   496                 else:   497                     child_page_label = child_page_name   498    499                 child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))   500    501             text = (text or read(filename)) + child_page_section % "\n".join(child_pages)   502    503         # Add comments to the content.   504    505         if exists(comments) and title and not no_translate:   506             text = (text or read(filename)) + comment_section   507    508         # Rewrite the file if necessary.   509    510         if text:   511             write(filename, text)   512    513     # Add the attachments to the manifest.   514    515     if exists(attachments):   516         final_result += _sort_manifest(attachments, title)   517    518     return final_result   519    520 def sort_final_manifest(entries, output):   521    522     """   523     Sort the manifest 'entries' by last modified time and serialise it.   524     The manifest details will be appended to the file named by 'output'.   525     """   526    527     # The final entry in each element is the mtime.   528    529     entries.sort(cmp=lambda x, y: cmp(int(x[-1]), int(y[-1])))   530    531     # Serialise the manifest.   532    533     s = serialise_manifest(entries)   534     append(output, s)   535    536 def get_space_and_name(page_name):   537     try:   538         return page_name.split("/", 1)   539     except IndexError:   540         return None, page_name   541    542 # Template for child page information.   543    544 child_page_section = """   545 ----   546    547 %s   548 """   549    550 # Template for comments.   551    552 comment_section = """   553 ----   554    555 <<IncludeComments>>   556 """   557    558 # Main program.   559    560 if __name__ == "__main__":   561     try:   562         filename = sys.argv[1]   563         is_zipfile = splitext(filename)[-1] == extsep + "zip"   564         space = sys.argv[2]   565         if len(sys.argv) > 3 and sys.argv[3]:   566             attachments = sys.argv[3]   567         else:   568             attachments = None   569     except IndexError:   570         print >>sys.stderr, """   571 Please specify an XML file containing Wiki data, a workspace name, and an   572 optional attachments directory location. For example:   573    574 %(progname)s com_entities.xml COM attachments   575    576 Adding --no-translate will unpack the Wiki but not translate the content.   577 When doing so without an attachments directory, add an empty argument as   578 follows:   579    580 %(progname)s com_entities.xml COM '' --no-translate   581    582 An archive can be used instead of the XML file, and since this may include   583 attachments, no additional attachments directory needs to be specified:   584    585 %(progname)s COM-123456-789012.zip COM   586 """ % {"progname" : split(sys.argv[0])[-1]}   587    588         sys.exit(1)   589    590     no_translate = "--no-translate" in sys.argv   591    592     if exists(space):   593         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   594         sys.exit(1)   595    596     package_zip = space + extsep + "zip"   597    598     if exists(package_zip):   599         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   600         sys.exit(1)   601    602     mkdir(space)   603     mkdirs(join(space, "pages"))   604     mkdirs(join(space, "versions"))   605    606     p = xmlread.ConfigurableParser()   607     handler = ConfluenceHandler(space, no_translate)   608    609     # Register handlers in the parser for different elements.   610    611     p["object"] = handler.handle_object   612     p["property"] = handler.handle_property   613     p["id"] = handler.handle_id   614     p["collection"] = handler.handle_collection   615     p["element"] = handler.handle_element   616    617     # Open the XML dump.   618    619     f = open(filename)   620    621     if is_zipfile:   622         zf = ZipFile(f)   623         ff = StringIO(zf.read("entities.xml"))   624     else:   625         ff = f   626    627     # Parse the data.   628    629     try:   630         p.parse(ff)   631    632         # Tidy up the import manifests, sorting each of them by revision and   633         # finalising them.   634    635         pages_dir = join(space, "pages")   636    637         for pageid in listdir(pages_dir):   638             sort_comments(pages_dir, pageid)   639    640         output_mapping = join(space, "MAPPING")   641    642         output_manifest = join(space, "MOIN_PACKAGE")   643         append(output_manifest, "MoinMoinPackage|1\n")   644    645         entries = []   646    647         for pageid in listdir(pages_dir):   648             entries += sort_manifest(pages_dir, pageid, output_mapping, no_translate)   649    650         sort_final_manifest(entries, output_manifest)   651    652         # Write the page package.   653    654         page_package = ZipFile(package_zip, "w")   655    656         try:   657             # Include the page revisions.   658    659             versions_dir = join(space, "versions")   660    661             for versionid in listdir(versions_dir):   662                 page_package.write(join(versions_dir, versionid))   663    664             # Include the attachments.   665    666             if attachments:   667                 cwd = getcwd()   668                 chdir(split(attachments)[0])   669                 try:   670                     for path, dirnames, filenames in walk(split(attachments)[1]):   671                         for filename in filenames:   672                             # Have to "taint" archive filenames.   673                             page_package.write(join(path, filename), wikiutil.taintfilename(join(path, filename)))   674                 finally:   675                     chdir(cwd)   676             elif is_zipfile:   677                 for filename in zf.namelist():   678                     if filename.startswith("attachments"):   679                         # Have to "taint" archive filenames.   680                         page_package.writestr(wikiutil.taintfilename(filename), zf.read(filename))   681    682             # Include only the top-level manifest.   683    684             page_package.write(output_manifest, "MOIN_PACKAGE")   685    686         finally:   687             page_package.close()   688    689     finally:   690         f.close()   691    692 # vim: tabstop=4 expandtab shiftwidth=4