ConfluenceConverter (file convert.py at a79c3559e45c)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import chdir, getcwd, listdir, mkdir, makedirs, walk    25 from os.path import exists, extsep, join, split, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 from MoinMoin import wikiutil    29 import codecs    30 import xmlread    31 import wikiparser, xmlparser    32 import sys    33     34 from common import get_page_title    35     36 class ConfluenceHandler:    37     38     "Handle content from a Confluence Wiki dump."    39     40     def __init__(self, space, no_translate=False):    41         self.content = {}    42         self.elements = []    43         self.space = space    44         self.no_translate = no_translate    45     46     def handle_object(self, name, elements, attributes, all_text, text):    47     48         """    49         Handle objects according to type. Objects appear as follows:    50     51         <object class="Page" package="...">    52         <id name="id">...</id>    53         ...    54         </object>    55     56         Within objects, one finds things like properties and collections, which    57         are handled by their own methods but which are stored in the content    58         dictionary associated with the current object.    59     60         By the time this method is called, the contents of the object will have    61         been gathered and the properties and collections populated in the    62         content dictionary. Any identifier will have been assigned to the    63         textual content of the object element and will be available in the    64         'text' parameter.    65         """    66     67         objecttype = attributes[-1]["class"]    68     69         # Any identifier is stored as the object's textual content.    70     71         identifier = text.strip()    72     73         # The content is a dictionary mapping names to properties and    74         # collections.    75     76         content = self.content    77     78         pages_dir = join(self.space, "pages")    79         versions_dir = join(self.space, "versions")    80     81         # Handle particular types.    82     83         if objecttype in ("Page", "Comment", "BlogPost"):    84     85             # Handle pages and revisions, adding revisions to the page manifest.    86             # The original version is used as a unifying identifier for all the    87             # different revisions (each of which being defined by a Page    88             # element). Although "original" implies the first identifier used,    89             # it actually appears to be the latest and will have the highest    90             # version number.    91     92             if content.has_key("originalVersion"):    93                 pageid = content["originalVersion"]    94             else:    95                 pageid = identifier    96     97             versionfile = join(versions_dir, identifier)    98     99             # Note page metadata, not necessarily in the correct order.   100             # For comments, the title will need to be rewritten, since they   101             # should be defined in terms of their owner page.   102    103             # NOTE: This only makes the current title available to comments.   104    105             mkdirs(join(pages_dir, pageid))   106    107             title = content["title"]   108    109             # Limit the title to a "safe" number of characters in order to avoid   110             # filesystem issues.   111    112             title = get_page_title(title)   113    114             if title:   115                 title = "%s/%s" % (self.space, title)   116                 write(join(pages_dir, pageid, "pagetitle"), title)   117    118             # See sort_manifest for access to this data.   119    120             append(join(pages_dir, pageid, "manifest"),   121                 "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment   122                     content["version"],   123                     versionfile,   124                     title, # comment titles will incorporate the comment's position   125                     content["lastModifierName"],   126                     content["versionComment"]   127                 ))   128    129             # Add information to parent pages for child page lists.   130    131             if content.has_key("parent"):   132                 parentid = content["parent"]   133                 mkdirs(join(pages_dir, parentid))   134                 append(join(pages_dir, parentid, "children"), title + "\n")   135    136             # Add creation details for comments to the owner page.   137             # Since comments can be versioned, the date of the original version   138             # is used, and only this "original" version has the owner property.   139    140             if objecttype == "Comment" and content.has_key("owner"):   141                 ownerid = content["owner"]   142                 mkdirs(join(pages_dir, ownerid))   143                 append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))   144    145             # Some metadata is not particularly relevant. For example,   146             # ancestors, children, parent are navigation-related.   147    148             # Other metadata could be added to the page content itself.   149             # For example, labelling could be converted to categories.   150    151         # Handle revisions.   152    153         elif objecttype == "BodyContent":   154             body = content["body"]   155             if not body:   156                 body = "## Empty page."   157    158             # NOTE: Very simple technique employed for guessing the format.   159    160             if no_translate:   161                 fn = write   162             elif body.startswith("<"):   163                 fn = xmltranslate   164             else:   165                 fn = translate   166    167             try:   168                 fn(join(versions_dir, content["content"]), body)   169             except:   170                 err = codecs.getwriter("utf-8")(sys.stderr)   171                 print >>err, "Error parsing", content["content"]   172                 raise   173    174         # Handle attachments.   175    176         elif objecttype == "Attachment":   177             pageid = content["content"]   178             version = content["attachmentVersion"]   179    180             if content.has_key("originalVersion"):   181                 attachid = content["originalVersion"]   182             else:   183                 attachid = identifier   184    185             append(join(pages_dir, pageid, "attachments"),   186                 "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (   187                     version,   188                     # Have to "taint" archive filenames, although Moin will   189                     # probably handle package script filename tainting.   190                     wikiutil.taintfilename(join("attachments", pageid, attachid, version)),   191                     wikiutil.taintfilename(content["fileName"]),   192                     "", # pagename is substituted later   193                     content["lastModifierName"],   194                     content["comment"]   195                 ))   196    197         self.content = {}   198    199     def handle_property(self, name, elements, attributes, all_text, text):   200    201         "Record properties in the current content dictionary."   202    203         self.content[attributes[-1]["name"]] = text.strip()   204    205     def handle_id(self, name, elements, attributes, all_text, text):   206    207         "Promote identifiers to the parent element's text."   208    209         all_text[-2].append(text)   210    211     def handle_collection(self, name, elements, attributes, all_text, text):   212    213         "Record collections in the current content dictionary."   214    215         self.content[attributes[-1]["name"]] = self.elements   216         self.elements = []   217    218     def handle_element(self, name, elements, attributes, all_text, text):   219    220         "Add elements to the current collection."   221    222         self.elements.append((attributes[-1]["class"], text.strip()))   223    224 def mkdirs(name):   225    226     "Make the directory with the given 'name' at any depth."   227    228     try:   229         makedirs(name)   230     except OSError:   231         pass   232    233 def append(filename, s):   234    235     "Append to the file with the given 'filename' the string 's'."   236    237     write(filename, s, True)   238    239 def write(filename, s, append=False):   240    241     """   242     Write to the file with the given 'filename' the string 's'. If the optional   243     'append' parameter is set to a true value, 's' will be appended to the file.   244     """   245    246     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   247     try:   248         f.write(s)   249     finally:   250         f.close()   251    252 def read(filename):   253    254     """   255     Read from the file with the given 'filename', returning a string containing   256     its contents.   257     """   258    259     f = codecs.open(filename, encoding="utf-8")   260     try:   261         return f.read()   262     finally:   263         f.close()   264    265 def translate(filename, body, fn=None):   266    267     """   268     Write to the file with the given 'filename' a translation of the given   269     'body'.   270     """   271    272     fn = fn or wikiparser.parse   273    274     out = codecs.open(filename, "w", encoding="utf-8")   275     try:   276         print >>out, "#pragma page-filename", filename   277         fn(body, out)   278     finally:   279         out.close()   280    281 def xmltranslate(filename, body):   282     translate(filename, body, xmlparser.parse)   283    284 def sort_comments(pages_dir, pageid):   285    286     """   287     Where 'pageid' has comments associated with it, sort them chronologically   288     and label the comment pages with the owner page's title and comment's   289     position in the chronological sequence. Such labelling is done by writing   290     a "pagetitle" file in each comment page's directory.   291     """   292    293     comments = join(pages_dir, pageid, "comments")   294    295     if not exists(comments):   296         return   297    298     title = read(join(pages_dir, pageid, "pagetitle"))   299    300     details = [line.split("|") for line in read(comments).split("\n") if line]   301     details.sort()   302    303     # Write the sorted comments list for testing purposes.   304    305     write(comments, "\n".join(["|".join(x) for x in details]))   306    307     # Define comments as subpages by setting their titles using this   308     # page's name/title and their position in the comments collection.   309    310     for position, (_lastmodified, commentid) in enumerate(details):   311    312         # In the page directory for each comment, write the page title in a   313         # special file for later processing.   314    315         write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))   316    317 def _sort_manifest(manifest, title):   318    319     """   320     Open the given 'manifest' and sort it according to revision so that it will   321     be added to MoinMoin in the correct order.   322    323     If a 'title' is provided, the title column in the manifest will be augmented   324     with that information. This is typically done for comments and is necessary   325     for attachments.   326    327     A list of manifest entries is returned.   328     """   329    330     f = codecs.open(manifest, "r", encoding="utf-8")   331     try:   332         lines = [x.split("|") for x in f.readlines()]   333         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   334    335         # Reconstruct the lines, optionally changing the titles.   336    337         result = []   338    339         for line in lines:   340             version, _action, _archive_filename, filename, old_title, username, comment = line   341    342             # Replace title information with the information already present.   343    344             if not old_title:   345                 new_title = title   346             else:   347                 new_title = old_title   348    349             # The version is omitted now that the manifest is ordered.   350    351             line = _action, _archive_filename, filename, new_title, username, comment   352             result.append(line)   353    354         return result   355    356     finally:   357         f.close()   358    359 def serialise_manifest(manifest):   360    361     """   362     Process the 'manifest' consisting of entries, removing superfluous columns.   363     """   364    365     result = []   366    367     for columns in manifest:   368         action = columns[0]   369         if action == "AddRevision":   370             columns = list(columns)   371             del columns[1]   372         result.append("|".join(columns))   373    374     return "".join(result)   375                376 def sort_manifest(pages_dir, pageid, output=None, no_translate=False):   377    378     """   379     Using the given 'pageid', locate the manifest for the page and any page   380     title information written to a "pagetitle" file.   381    382     Then sort the manifest according to revision so that it will be added to   383     MoinMoin in the correct order.   384    385     If a "pagetitle" file exists, the title column in the manifest will be   386     augmented with the contents of that file. This is typically done for   387     comments.   388    389     If a "children" file exists, the pages in that file will be added as a list   390     to the end of each revision's content.   391    392     If 'output' is given, the manifest details will be appended to the file   393     having that filename instead of being rewritten to the original manifest   394     file.   395     """   396    397     manifest = join(pages_dir, pageid, "manifest")   398     attachments = join(pages_dir, pageid, "attachments")   399     pagetitle = join(pages_dir, pageid, "pagetitle")   400     children = join(pages_dir, pageid, "children")   401     comments = join(pages_dir, pageid, "comments")   402    403     if exists(pagetitle):   404         title = read(pagetitle)   405         space, _page_name = get_space_and_name(title)   406     else:   407         title = space = None   408    409     # Sort the revision manifest.   410    411     result = _sort_manifest(manifest, title)   412    413     for _action, _archive_filename, filename, new_title, username, comment in result:   414    415         # Add child page information to the content.   416    417         if exists(children) and not no_translate:   418             child_pages = []   419             child_page_names = [x for x in read(children).split("\n") if x]   420             child_page_names.sort()   421    422             # Produce links which hide the space prefix.   423    424             for child_page_name in child_page_names:   425                 child_space, page_name = get_space_and_name(child_page_name)   426                 if child_space == space:   427                     child_page_label = page_name   428                 else:   429                     child_page_label = child_page_name   430    431                 child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))   432    433             append(filename, child_page_section % "\n".join(child_pages))   434    435         # Add comments to the content.   436    437         if exists(comments) and title and not no_translate:   438             append(filename, comment_section % title)   439    440     # Add the attachments to the manifest.   441    442     if exists(attachments):   443         result += _sort_manifest(attachments, title)   444    445     # Serialise the manifest.   446    447     s = serialise_manifest(result)   448    449     if output is None:   450         write(manifest, s)   451     else:   452         append(output, s)   453    454 def get_space_and_name(page_name):   455     try:   456         return page_name.split("/", 1)   457     except IndexError:   458         return None, page_name   459    460 # Template for child page information.   461    462 child_page_section = """   463 ----   464    465 %s   466 """   467    468 # Template for comments.   469    470 comment_section = """   471 ----   472    473 <<Include("^%s/")>>   474 """   475    476 # Main program.   477    478 if __name__ == "__main__":   479     try:   480         filename = sys.argv[1]   481         is_zipfile = splitext(filename)[-1] == extsep + "zip"   482         space = sys.argv[2]   483         if len(sys.argv) > 3 and sys.argv[3]:   484             attachments = sys.argv[3]   485         else:   486             attachments = None   487     except IndexError:   488         print >>sys.stderr, """   489 Please specify an XML file containing Wiki data, a workspace name, and an   490 optional attachments directory location. For example:   491    492 com_entities.xml COM attachments   493    494 Adding --no-translate will unpack the Wiki but not translate the content.   495 When doing so without an attachments directory, add an empty argument as   496 follows:   497    498 com_entities.xml COM '' --no-translate   499 """   500         sys.exit(1)   501    502     no_translate = "--no-translate" in sys.argv   503    504     if exists(space):   505         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   506         sys.exit(1)   507    508     package_zip = space + extsep + "zip"   509    510     if exists(package_zip):   511         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   512         sys.exit(1)   513    514     mkdir(space)   515     mkdirs(join(space, "pages"))   516     mkdirs(join(space, "versions"))   517    518     p = xmlread.ConfigurableParser()   519     handler = ConfluenceHandler(space, no_translate)   520    521     # Register handlers in the parser for different elements.   522    523     p["object"] = handler.handle_object   524     p["property"] = handler.handle_property   525     p["id"] = handler.handle_id   526     p["collection"] = handler.handle_collection   527     p["element"] = handler.handle_element   528    529     # Open the XML dump.   530    531     f = open(filename)   532    533     if is_zipfile:   534         zf = ZipFile(f)   535         ff = StringIO(zf.read("entities.xml"))   536     else:   537         ff = f   538    539     # Parse the data.   540    541     try:   542         p.parse(ff)   543    544         # Tidy up the import manifests, sorting each of them by revision and   545         # finalising them.   546    547         pages_dir = join(space, "pages")   548    549         for pageid in listdir(pages_dir):   550             sort_comments(pages_dir, pageid)   551    552         output_manifest = join(space, "MOIN_PACKAGE")   553         append(output_manifest, "MoinMoinPackage|1\n")   554    555         for pageid in listdir(pages_dir):   556             sort_manifest(pages_dir, pageid, output_manifest, no_translate)   557    558         # Write the page package.   559    560         page_package = ZipFile(package_zip, "w")   561    562         try:   563             # Include the page revisions.   564    565             versions_dir = join(space, "versions")   566    567             for versionid in listdir(versions_dir):   568                 page_package.write(join(versions_dir, versionid))   569    570             # Include the attachments.   571    572             if attachments:   573                 cwd = getcwd()   574                 chdir(split(attachments)[0])   575                 try:   576                     for path, dirnames, filenames in walk(split(attachments)[1]):   577                         for filename in filenames:   578                             # Have to "taint" archive filenames.   579                             page_package.write(join(path, filename), wikiutil.taintfilename(join(path, filename)))   580                 finally:   581                     chdir(cwd)   582             elif is_zipfile:   583                 for filename in zf.namelist():   584                     if filename.startswith("attachments"):   585                         # Have to "taint" archive filenames.   586                         page_package.writestr(wikiutil.taintfilename(filename), zf.read(filename))   587    588             # Include only the top-level manifest.   589    590             page_package.write(output_manifest, "MOIN_PACKAGE")   591    592         finally:   593             page_package.close()   594    595     finally:   596         f.close()   597    598 # vim: tabstop=4 expandtab shiftwidth=4