ConfluenceConverter (file convert.py at 7c7e460c54d7)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import chdir, getcwd, listdir, mkdir, makedirs, walk    25 from os.path import exists, extsep, join, split, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 from MoinMoin import wikiutil    29 import codecs    30 import xmlread    31 import wikiparser, xmlparser    32 import sys    33     34 from common import get_page_title    35     36 class ConfluenceHandler:    37     38     "Handle content from a Confluence Wiki dump."    39     40     def __init__(self, space, no_translate=False):    41         self.content = {}    42         self.elements = []    43         self.space = space    44         self.no_translate = no_translate    45     46     def handle_object(self, name, elements, attributes, all_text, text):    47     48         """    49         Handle objects according to type. Objects appear as follows:    50     51         <object class="Page" package="...">    52         <id name="id">...</id>    53         ...    
54         </object>    55     56         Within objects, one finds things like properties and collections, which    57         are handled by their own methods but which are stored in the content    58         dictionary associated with the current object.    59     60         By the time this method is called, the contents of the object will have    61         been gathered and the properties and collections populated in the    62         content dictionary. Any identifier will have been assigned to the    63         textual content of the object element and will be available in the    64         'text' parameter.    65         """    66     67         objecttype = attributes[-1]["class"]    68     69         # Any identifier is stored as the object's textual content.    70     71         identifier = text.strip()    72     73         # The content is a dictionary mapping names to properties and    74         # collections.    75     76         content = self.content    77     78         pages_dir = join(self.space, "pages")    79         versions_dir = join(self.space, "versions")    80     81         # Handle particular types.    82     83         if objecttype in ("Page", "Comment", "BlogPost"):    84     85             # Handle pages and revisions, adding revisions to the page manifest.    86             # The original version is used as a unifying identifier for all the    87             # different revisions (each of which being defined by a Page    88             # element). Although "original" implies the first identifier used,    89             # it actually appears to be the latest and will have the highest    90             # version number.    
91     92             if content.has_key("originalVersion"):    93                 pageid = content["originalVersion"]    94             else:    95                 pageid = identifier    96     97             versionfile = join(versions_dir, identifier)    98     99             # Note page metadata, not necessarily in the correct order.   100             # For comments, the title will need to be rewritten, since they   101             # should be defined in terms of their owner page.   102    103             # NOTE: This only makes the current title available to comments.   104    105             mkdirs(join(pages_dir, pageid))   106    107             title = content["title"]   108    109             # Limit the title to a "safe" number of characters in order to avoid   110             # filesystem issues.   111    112             title = get_page_title(title)   113    114             if title:   115                 title = "%s/%s" % (self.space, title)   116                 write(join(pages_dir, pageid, "pagetitle"), title)   117    118             # Note the type of the page.   119    120             write(join(pages_dir, pageid, "pagetype"), objecttype)   121    122             # See sort_manifest for access to this data.   123    124             append(join(pages_dir, pageid, "manifest"),   125                 "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment   126                     content["version"],   127                     versionfile,   128                     title, # comment titles will incorporate the comment's position   129                     content["lastModifierName"],   130                     content["versionComment"]   131                 ))   132    133             # Add information to parent pages for child page lists.   
134    135             if content.has_key("parent"):   136                 parentid = content["parent"]   137                 mkdirs(join(pages_dir, parentid))   138                 append(join(pages_dir, parentid, "children"), title + "\n")   139    140             # Add creation details for comments to the owner page.   141             # Since comments can be versioned, the date of the original version   142             # is used, and only this "original" version has the owner property.   143    144             if objecttype == "Comment" and content.has_key("owner"):   145                 ownerid = content["owner"]   146                 mkdirs(join(pages_dir, ownerid))   147                 append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))   148    149             # Some metadata is not particularly relevant. For example,   150             # ancestors, children, parent are navigation-related.   151    152             # Other metadata could be added to the page content itself.   153             # For example, labelling could be converted to categories.   154    155         # Handle revisions.   156    157         elif objecttype == "BodyContent":   158             body = content["body"]   159             if not body:   160                 body = "## Empty page."   161    162             # NOTE: Very simple technique employed for guessing the format.   163    164             if no_translate:   165                 fn = write   166             elif body.startswith("<"):   167                 fn = xmltranslate   168             else:   169                 fn = translate   170    171             try:   172                 fn(join(versions_dir, content["content"]), body)   173             except:   174                 err = codecs.getwriter("utf-8")(sys.stderr)   175                 print >>err, "Error parsing", content["content"]   176                 raise   177    178         # Handle attachments.   
179    180         elif objecttype == "Attachment":   181             pageid = content["content"]   182             version = content["attachmentVersion"]   183    184             if content.has_key("originalVersion"):   185                 attachid = content["originalVersion"]   186             else:   187                 attachid = identifier   188    189             append(join(pages_dir, pageid, "attachments"),   190                 "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (   191                     version,   192                     # Have to "taint" archive filenames, although Moin will   193                     # probably handle package script filename tainting.   194                     wikiutil.taintfilename(join("attachments", pageid, attachid, version)),   195                     wikiutil.taintfilename(content["fileName"]),   196                     "", # pagename is substituted later   197                     content["lastModifierName"],   198                     content["comment"]   199                 ))   200    201         self.content = {}   202    203     def handle_property(self, name, elements, attributes, all_text, text):   204    205         "Record properties in the current content dictionary."   206    207         self.content[attributes[-1]["name"]] = text.strip()   208    209     def handle_id(self, name, elements, attributes, all_text, text):   210    211         "Promote identifiers to the parent element's text."   212    213         all_text[-2].append(text)   214    215     def handle_collection(self, name, elements, attributes, all_text, text):   216    217         "Record collections in the current content dictionary."   218    219         self.content[attributes[-1]["name"]] = self.elements   220         self.elements = []   221    222     def handle_element(self, name, elements, attributes, all_text, text):   223    224         "Add elements to the current collection."   
225    226         self.elements.append((attributes[-1]["class"], text.strip()))   227    228 def mkdirs(name):   229    230     "Make the directory with the given 'name' at any depth."   231    232     try:   233         makedirs(name)   234     except OSError:   235         pass   236    237 def append(filename, s):   238    239     "Append to the file with the given 'filename' the string 's'."   240    241     write(filename, s, True)   242    243 def write(filename, s, append=False):   244    245     """   246     Write to the file with the given 'filename' the string 's'. If the optional   247     'append' parameter is set to a true value, 's' will be appended to the file.   248     """   249    250     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   251     try:   252         f.write(s)   253     finally:   254         f.close()   255    256 def read(filename):   257    258     """   259     Read from the file with the given 'filename', returning a string containing   260     its contents.   261     """   262    263     f = codecs.open(filename, encoding="utf-8")   264     try:   265         return f.read()   266     finally:   267         f.close()   268    269 def translate(filename, body, fn=None):   270    271     """   272     Write to the file with the given 'filename' a translation of the given   273     'body'.   
274     """   275    276     fn = fn or wikiparser.parse   277    278     out = codecs.open(filename, "w", encoding="utf-8")   279     try:   280         print >>out, "#pragma page-filename", filename   281         fn(body, out)   282     finally:   283         out.close()   284    285 def xmltranslate(filename, body):   286     translate(filename, body, xmlparser.parse)   287    288 def sort_comments(pages_dir, pageid):   289    290     """   291     Where 'pageid' has comments associated with it, sort them chronologically   292     and label the comment pages with the owner page's title and comment's   293     position in the chronological sequence. Such labelling is done by writing   294     a "pagetitle" file in each comment page's directory.   295     """   296    297     comments = join(pages_dir, pageid, "comments")   298    299     if not exists(comments):   300         return   301    302     title = read(join(pages_dir, pageid, "pagetitle"))   303    304     details = [line.split("|") for line in read(comments).split("\n") if line]   305     details.sort()   306    307     # Write the sorted comments list for testing purposes.   308    309     write(comments, "\n".join(["|".join(x) for x in details]))   310    311     # Define comments as subpages by setting their titles using this   312     # page's name/title and their position in the comments collection.   313    314     for position, (_lastmodified, commentid) in enumerate(details):   315    316         # In the page directory for each comment, write the page title in a   317         # special file for later processing.   318    319         write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))   320    321 def _manifest_to_mapping(manifest, output_mapping):   322    323     """   324     Open the given 'manifest' and write a mapping from version identifiers to   325     page names/titles to the file with the given 'output_mapping' filename.   
326     """   327    328     f = codecs.open(manifest, "r", encoding="utf-8")   329     try:   330         mapping = []   331    332         lines = [x.split("|") for x in f.readlines()]   333         for line in lines:   334             version, _action, _archive_filename, filename, title, username, comment = line   335             if title:   336                 mapping.append((split(filename)[-1], title))   337    338         append(output_mapping, "".join([("\t".join(x) + "\n") for x in mapping]))   339    340     finally:   341         f.close()   342    343 def _sort_manifest(manifest, title):   344    345     """   346     Open the given 'manifest' and sort it according to revision so that it will   347     be added to MoinMoin in the correct order.   348    349     If a 'title' is provided, the title column in the manifest will be augmented   350     with that information. This is typically done for comments and is necessary   351     for attachments.   352    353     A list of manifest entries is returned.   354     """   355    356     f = codecs.open(manifest, "r", encoding="utf-8")   357     try:   358         lines = [x.split("|") for x in f.readlines()]   359         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   360    361         # Reconstruct the lines, optionally changing the titles.   362    363         result = []   364    365         for line in lines:   366             version, _action, _archive_filename, filename, old_title, username, comment = line   367    368             # Replace title information with the information already present.   369    370             if not old_title:   371                 new_title = title   372             else:   373                 new_title = old_title   374    375             # The version is omitted now that the manifest is ordered.   
376    377             line = _action, _archive_filename, filename, new_title, username, comment   378             result.append(line)   379    380         return result   381    382     finally:   383         f.close()   384    385 def serialise_manifest(manifest):   386    387     """   388     Process the 'manifest' consisting of entries, removing superfluous columns.   389     """   390    391     result = []   392    393     for columns in manifest:   394         action = columns[0]   395         if action == "AddRevision":   396             columns = list(columns)   397             del columns[1]   398         result.append("|".join(columns))   399    400     return "".join(result)   401                402 def sort_manifest(pages_dir, pageid, output=None, output_mapping=None, no_translate=False):   403    404     """   405     Using the given 'pageid', locate the manifest for the page and any page   406     title information written to a "pagetitle" file.   407    408     Then sort the manifest according to revision so that it will be added to   409     MoinMoin in the correct order.   410    411     If a "pagetitle" file exists, the title column in the manifest will be   412     augmented with the contents of that file. This is typically done for   413     comments.   414    415     If a "children" file exists, the pages in that file will be added as a list   416     to the end of each revision's content.   417    418     If 'output' is given, the manifest details will be appended to the file   419     having that filename instead of being rewritten to the original manifest   420     file.   421    422     If 'output_mapping' is given, a mapping from version identifiers to page   423     titles will be appended to the file having that filename.   
424     """   425    426     pagetype = join(pages_dir, pageid, "pagetype")   427     manifest = join(pages_dir, pageid, "manifest")   428     attachments = join(pages_dir, pageid, "attachments")   429     pagetitle = join(pages_dir, pageid, "pagetitle")   430     children = join(pages_dir, pageid, "children")   431     comments = join(pages_dir, pageid, "comments")   432    433     type = exists(pagetype) and read(pagetype) or None   434    435     if exists(pagetitle):   436         title = read(pagetitle)   437         space, _page_name = get_space_and_name(title)   438     else:   439         title = space = None   440    441     # Sort the revision manifest.   442    443     result = _sort_manifest(manifest, title)   444    445     # Output a mapping of identifiers to page names.   446    447     if output_mapping:   448         _manifest_to_mapping(manifest, output_mapping)   449    450     # Modify the content to include child pages and comments.   451    452     for _action, _archive_filename, filename, new_title, username, comment in result:   453         text = read(filename)   454    455         # Add an ACL to comment pages so that people cannot change other   456         # people's comments.   457    458         if type == "Comment":   459             text = "#acl %s:read,write,delete,revert All:read\n%s" % (username, text)   460    461         # Add child page information to the content.   462    463         if exists(children) and not no_translate:   464             child_pages = []   465             child_page_names = [x for x in read(children).split("\n") if x]   466             child_page_names.sort()   467    468             # Produce links which hide the space prefix.   
469    470             for child_page_name in child_page_names:   471                 child_space, page_name = get_space_and_name(child_page_name)   472                 if child_space == space:   473                     child_page_label = page_name   474                 else:   475                     child_page_label = child_page_name   476    477                 child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))   478    479             text += child_page_section % "\n".join(child_pages)   480    481         # Add comments to the content.   482    483         if exists(comments) and title and not no_translate:   484             text += comment_section % title   485    486         # Rewrite the file.   487    488         write(filename, text)   489    490     # Add the attachments to the manifest.   491    492     if exists(attachments):   493         result += _sort_manifest(attachments, title)   494    495     # Serialise the manifest.   496    497     s = serialise_manifest(result)   498    499     if output is None:   500         write(manifest, s)   501     else:   502         append(output, s)   503    504 def get_space_and_name(page_name):   505     try:   506         return page_name.split("/", 1)   507     except IndexError:   508         return None, page_name   509    510 # Template for child page information.   511    512 child_page_section = """   513 ----   514    515 %s   516 """   517    518 # Template for comments.   519    520 comment_section = """   521 ----   522    523 <<Include("^%s/")>>   524 """   525    526 # Main program.   
if __name__ == "__main__":

    # Usage: convert.py <XML file or zip archive> <space> [<attachments>]
    #        [--no-translate]

    # The option may appear anywhere on the command line, so it is separated
    # from the positional arguments before the latter are interpreted. An empty
    # attachments argument is still accepted and means "no attachments".

    no_translate = "--no-translate" in sys.argv
    args = [arg for arg in sys.argv[1:] if arg != "--no-translate"]

    try:
        filename = args[0]
        space = args[1]
    except IndexError:
        print >>sys.stderr, """
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

%(progname)s com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

%(progname)s com_entities.xml COM '' --no-translate

An archive can be used instead of the XML file, and since this may include
attachments, no additional attachments directory needs to be specified:

%(progname)s COM-123456-789012.zip COM
""" % {"progname" : split(sys.argv[0])[-1]}

        sys.exit(1)

    is_zipfile = splitext(filename)[-1] == extsep + "zip"

    if len(args) > 2 and args[2]:
        attachments = args[2]
    else:
        attachments = None

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    # Open the XML dump before creating anything, so that a mistyped filename
    # does not leave behind a directory which prevents the next attempt.
    # Binary mode is needed for archives and is harmless for XML files.

    f = open(filename, "rb")
    zf = None

    try:
        mkdir(space)
        mkdirs(join(space, "pages"))
        mkdirs(join(space, "versions"))

        p = xmlread.ConfigurableParser()
        handler = ConfluenceHandler(space, no_translate)

        # Register handlers in the parser for different elements.

        p["object"] = handler.handle_object
        p["property"] = handler.handle_property
        p["id"] = handler.handle_id
        p["collection"] = handler.handle_collection
        p["element"] = handler.handle_element

        # Archives provide the XML dump as a member called entities.xml.

        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_mapping = join(space, "MAPPING")

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        for pageid in listdir(pages_dir):
            sort_manifest(pages_dir, pageid, output_manifest, output_mapping, no_translate)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:

                # Separate the directory name from its location, ignoring any
                # trailing separator which would otherwise produce an empty
                # name and thus an empty search.

                attachments_parent, attachments_name = split(attachments)
                if not attachments_name:
                    attachments_parent, attachments_name = split(attachments_parent)

                # Visit the directory from its parent so that archive names
                # start with the directory name. A plain directory name has no
                # parent component, and changing to an empty path is an error,
                # so the current directory is used in that case.

                cwd = getcwd()
                chdir(attachments_parent or ".")
                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_filename in filenames:
                            attachment_path = join(path, attachment_filename)
                            # Have to "taint" archive filenames.
                            page_package.write(attachment_path, wikiutil.taintfilename(attachment_path))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member in zf.namelist():
                    if member.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member), zf.read(member))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        if zf is not None:
            zf.close()
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4