ConfluenceConverter (file convert.py at e03ec526eebc)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import chdir, getcwd, listdir, mkdir, makedirs, walk    25 from os.path import exists, extsep, join, split, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 from MoinMoin import wikiutil    29 import codecs    30 import xmlread    31 import wikiparser, xmlparser    32 import sys    33     34 from common import get_page_title    35     36 class ConfluenceHandler:    37     38     "Handle content from a Confluence Wiki dump."    39     40     def __init__(self, space, no_translate=False):    41         self.content = {}    42         self.elements = []    43         self.space = space    44         self.no_translate = no_translate    45     46     def handle_object(self, name, elements, attributes, all_text, text):    47     48         """    49         Handle objects according to type. Objects appear as follows:    50     51         <object class="Page" package="...">    52         <id name="id">...</id>    53         ...    54         </object>    55     56         Within objects, one finds things like properties and collections, which    57         are handled by their own methods but which are stored in the content    58         dictionary associated with the current object.    59     60         By the time this method is called, the contents of the object will have    61         been gathered and the properties and collections populated in the    62         content dictionary. Any identifier will have been assigned to the    63         textual content of the object element and will be available in the    64         'text' parameter.    65         """    66     67         objecttype = attributes[-1]["class"]    68     69         # Any identifier is stored as the object's textual content.    70     71         identifier = text.strip()    72     73         # The content is a dictionary mapping names to properties and    74         # collections.    75     76         content = self.content    77     78         pages_dir = join(self.space, "pages")    79         versions_dir = join(self.space, "versions")    80     81         # Handle particular types.    82     83         if objecttype in ("Page", "Comment", "BlogPost"):    84     85             # Handle pages and revisions, adding revisions to the page manifest.    86             # The original version is used as a unifying identifier for all the    87             # different revisions (each of which being defined by a Page    88             # element). Although "original" implies the first identifier used,    89             # it actually appears to be the latest and will have the highest    90             # version number.    91     92             if content.has_key("originalVersion"):    93                 pageid = content["originalVersion"]    94             else:    95                 pageid = identifier    96     97             versionfile = join(versions_dir, identifier)    98     99             # Note page metadata, not necessarily in the correct order.   100             # For comments, the title will need to be rewritten, since they   101             # should be defined in terms of their owner page.   102    103             # NOTE: This only makes the current title available to comments.   104    105             mkdirs(join(pages_dir, pageid))   106    107             title = content["title"]   108    109             # Limit the title to a "safe" number of characters in order to avoid   110             # filesystem issues.   111    112             title = get_page_title(title)   113    114             if title:   115                 title = "%s/%s" % (self.space, title)   116                 write(join(pages_dir, pageid, "pagetitle"), title)   117    118             # Note the type of the page.   119    120             write(join(pages_dir, pageid, "pagetype"), objecttype)   121    122             # See sort_manifest for access to this data.   123    124             append(join(pages_dir, pageid, "manifest"),   125                 "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment   126                     content["version"],   127                     versionfile,   128                     title, # comment titles will incorporate the comment's position   129                     content["lastModifierName"],   130                     content["versionComment"]   131                 ))   132    133             # Add information to parent pages for child page lists.   134    135             if content.has_key("parent"):   136                 parentid = content["parent"]   137                 mkdirs(join(pages_dir, parentid))   138                 append(join(pages_dir, parentid, "children"), title + "\n")   139    140             # Add creation details for comments to the owner page.   141             # Since comments can be versioned, the date of the original version   142             # is used, and only this "original" version has the owner property.   143    144             if objecttype == "Comment" and content.has_key("owner"):   145                 ownerid = content["owner"]   146                 mkdirs(join(pages_dir, ownerid))   147                 append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))   148    149             # Some metadata is not particularly relevant. For example,   150             # ancestors, children, parent are navigation-related.   151    152             # Other metadata could be added to the page content itself.   153             # For example, labelling could be converted to categories.   154    155         # Handle revisions.   156    157         elif objecttype == "BodyContent":   158             body = content["body"]   159             if not body:   160                 body = "## Empty page."   161    162             # NOTE: Very simple technique employed for guessing the format.   163    164             if no_translate:   165                 fn = write   166             elif body.startswith("<"):   167                 fn = xmltranslate   168             else:   169                 fn = translate   170    171             try:   172                 fn(join(versions_dir, content["content"]), body)   173             except:   174                 err = codecs.getwriter("utf-8")(sys.stderr)   175                 print >>err, "Error parsing", content["content"]   176                 raise   177    178         # Handle attachments.   179    180         elif objecttype == "Attachment":   181             pageid = content["content"]   182             version = content["attachmentVersion"]   183    184             if content.has_key("originalVersion"):   185                 attachid = content["originalVersion"]   186             else:   187                 attachid = identifier   188    189             append(join(pages_dir, pageid, "attachments"),   190                 "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (   191                     version,   192                     # Have to "taint" archive filenames, although Moin will   193                     # probably handle package script filename tainting.   194                     wikiutil.taintfilename(join("attachments", pageid, attachid, version)),   195                     wikiutil.taintfilename(content["fileName"]),   196                     "", # pagename is substituted later   197                     content["lastModifierName"],   198                     content["comment"]   199                 ))   200    201         self.content = {}   202    203     def handle_property(self, name, elements, attributes, all_text, text):   204    205         "Record properties in the current content dictionary."   206    207         self.content[attributes[-1]["name"]] = text.strip()   208    209     def handle_id(self, name, elements, attributes, all_text, text):   210    211         "Promote identifiers to the parent element's text."   212    213         all_text[-2].append(text)   214    215     def handle_collection(self, name, elements, attributes, all_text, text):   216    217         "Record collections in the current content dictionary."   218    219         self.content[attributes[-1]["name"]] = self.elements   220         self.elements = []   221    222     def handle_element(self, name, elements, attributes, all_text, text):   223    224         "Add elements to the current collection."   225    226         self.elements.append((attributes[-1]["class"], text.strip()))   227    228 def mkdirs(name):   229    230     "Make the directory with the given 'name' at any depth."   231    232     try:   233         makedirs(name)   234     except OSError:   235         pass   236    237 def append(filename, s):   238    239     "Append to the file with the given 'filename' the string 's'."   240    241     write(filename, s, True)   242    243 def write(filename, s, append=False):   244    245     """   246     Write to the file with the given 'filename' the string 's'. If the optional   247     'append' parameter is set to a true value, 's' will be appended to the file.   248     """   249    250     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   251     try:   252         f.write(s)   253     finally:   254         f.close()   255    256 def read(filename):   257    258     """   259     Read from the file with the given 'filename', returning a string containing   260     its contents.   261     """   262    263     f = codecs.open(filename, encoding="utf-8")   264     try:   265         return f.read()   266     finally:   267         f.close()   268    269 def translate(filename, body, fn=None):   270    271     """   272     Write to the file with the given 'filename' a translation of the given   273     'body'.   274     """   275    276     fn = fn or wikiparser.parse   277    278     out = codecs.open(filename, "w", encoding="utf-8")   279     try:   280         print >>out, "#pragma page-filename", filename   281         fn(body, out)   282     finally:   283         out.close()   284    285 def xmltranslate(filename, body):   286     translate(filename, body, xmlparser.parse)   287    288 def sort_comments(pages_dir, pageid):   289    290     """   291     Where 'pageid' has comments associated with it, sort them chronologically   292     and label the comment pages with the owner page's title and comment's   293     position in the chronological sequence. Such labelling is done by writing   294     a "pagetitle" file in each comment page's directory.   295     """   296    297     comments = join(pages_dir, pageid, "comments")   298    299     if not exists(comments):   300         return   301    302     title = read(join(pages_dir, pageid, "pagetitle"))   303    304     details = [line.split("|") for line in read(comments).split("\n") if line]   305     details.sort()   306    307     # Write the sorted comments list for testing purposes.   308    309     write(comments, "\n".join(["|".join(x) for x in details]))   310    311     # Define comments as subpages by setting their titles using this   312     # page's name/title and their position in the comments collection.   313    314     for position, (_lastmodified, commentid) in enumerate(details):   315    316         # In the page directory for each comment, write the page title in a   317         # special file for later processing.   318    319         write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))   320    321 def _manifest_to_mapping(manifest, output_mapping):   322    323     """   324     Open the given 'manifest' and write a mapping from version identifiers to   325     page names/titles to the file with the given 'output_mapping' filename.   326     """   327    328     f = codecs.open(manifest, "r", encoding="utf-8")   329     try:   330         mapping = []   331    332         lines = [x.split("|") for x in f.readlines()]   333         for line in lines:   334             version, _action, _archive_filename, filename, title, username, comment = line   335             if title:   336                 mapping.append((split(filename)[-1], title))   337    338         append(output_mapping, "".join([("\t".join(x) + "\n") for x in mapping]))   339    340     finally:   341         f.close()   342    343 def _sort_manifest(manifest, title):   344    345     """   346     Open the given 'manifest' and sort it according to revision so that it will   347     be added to MoinMoin in the correct order.   348    349     If a 'title' is provided, the title column in the manifest will be augmented   350     with that information. This is typically done for comments and is necessary   351     for attachments.   352    353     A list of manifest entries is returned.   354     """   355    356     f = codecs.open(manifest, "r", encoding="utf-8")   357     try:   358         lines = [x.rstrip("\n").split("|") for x in f.readlines()]   359         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   360    361         # Reconstruct the lines, optionally changing the titles.   362    363         result = []   364    365         for line in lines:   366             version, _action, _archive_filename, filename, old_title, username, comment = line   367    368             # Replace title information with the information already present.   369    370             if not old_title:   371                 new_title = title   372             else:   373                 new_title = old_title   374    375             # The version is omitted now that the manifest is ordered.   376    377             line = _action, _archive_filename, filename, new_title, username, comment   378             result.append(line)   379    380         return result   381    382     finally:   383         f.close()   384    385 def serialise_manifest(manifest):   386    387     """   388     Process the 'manifest' consisting of entries, removing superfluous columns.   389     """   390    391     result = []   392    393     for columns in manifest:   394         action = columns[0]   395         if action == "AddRevision":   396             columns = list(columns)   397             del columns[1]   398         result.append("|".join(columns) + "\n")   399    400     return "".join(result)   401                402 def sort_manifest(pages_dir, pageid, output=None, output_mapping=None, no_translate=False):   403    404     """   405     Using the given 'pageid', locate the manifest for the page and any page   406     title information written to a "pagetitle" file.   407    408     Then sort the manifest according to revision so that it will be added to   409     MoinMoin in the correct order.   410    411     If a "pagetitle" file exists, the title column in the manifest will be   412     augmented with the contents of that file. This is typically done for   413     comments.   414    415     If a "children" file exists, the pages in that file will be added as a list   416     to the end of each revision's content.   417    418     If 'output' is given, the manifest details will be appended to the file   419     having that filename instead of being rewritten to the original manifest   420     file.   421    422     If 'output_mapping' is given, a mapping from version identifiers to page   423     titles will be appended to the file having that filename.   424     """   425    426     pagetype = join(pages_dir, pageid, "pagetype")   427     manifest = join(pages_dir, pageid, "manifest")   428     attachments = join(pages_dir, pageid, "attachments")   429     pagetitle = join(pages_dir, pageid, "pagetitle")   430     children = join(pages_dir, pageid, "children")   431     comments = join(pages_dir, pageid, "comments")   432    433     type = exists(pagetype) and read(pagetype) or None   434    435     if exists(pagetitle):   436         title = read(pagetitle)   437         space, _page_name = get_space_and_name(title)   438     else:   439         title = space = None   440    441     # Sort the revision manifest.   442    443     result = _sort_manifest(manifest, title)   444    445     # Output a mapping of identifiers to page names.   446    447     if output_mapping:   448         _manifest_to_mapping(manifest, output_mapping)   449    450     # Modify the content to include child pages and comments.   451    452     last_title = None   453     final_result = []   454    455     for details in result:   456         _action, _archive_filename, filename, new_title, username, comment = details   457    458         # Detect renamed pages and add a redirect revision.   459    460         if last_title and last_title != new_title and _action == "AddRevision":   461             renaming_versionfile = filename + ".rename"   462             final_result.append((_action, "_", renaming_versionfile, last_title, username, "Page renamed to %s" % new_title))   463             write(renaming_versionfile, "#REDIRECT %s" % new_title)   464    465         last_title = new_title   466    467         # Add this revision to the manifest.   468    469         final_result.append(details)   470    471         # Obtain the text only if modifications are to be made.   472    473         text = None   474    475         # Add an ACL to comment pages so that people cannot change other   476         # people's comments.   477    478         if type == "Comment":   479             text = "#acl %s:read,write,delete,revert All:read\n%s" % (username, text or read(filename))   480    481         # Add child page information to the content.   482    483         if exists(children) and not no_translate:   484             child_pages = []   485             child_page_names = [x for x in read(children).split("\n") if x]   486             child_page_names.sort()   487    488             # Produce links which hide the space prefix.   489    490             for child_page_name in child_page_names:   491                 child_space, page_name = get_space_and_name(child_page_name)   492                 if child_space == space:   493                     child_page_label = page_name   494                 else:   495                     child_page_label = child_page_name   496    497                 child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))   498    499             text = (text or read(filename)) + child_page_section % "\n".join(child_pages)   500    501         # Add comments to the content.   502    503         if exists(comments) and title and not no_translate:   504             text = (text or read(filename)) + comment_section   505    506         # Rewrite the file if necessary.   507    508         if text:   509             write(filename, text)   510    511     # Add the attachments to the manifest.   512    513     if exists(attachments):   514         final_result += _sort_manifest(attachments, title)   515    516     # Serialise the manifest.   517    518     s = serialise_manifest(final_result)   519    520     if output is None:   521         write(manifest, s)   522     else:   523         append(output, s)   524    525 def get_space_and_name(page_name):   526     try:   527         return page_name.split("/", 1)   528     except IndexError:   529         return None, page_name   530    531 # Template for child page information.   532    533 child_page_section = """   534 ----   535    536 %s   537 """   538    539 # Template for comments.   540    541 comment_section = """   542 ----   543    544 <<IncludeComments>>   545 """   546    547 # Main program.   548    549 if __name__ == "__main__":   550     try:   551         filename = sys.argv[1]   552         is_zipfile = splitext(filename)[-1] == extsep + "zip"   553         space = sys.argv[2]   554         if len(sys.argv) > 3 and sys.argv[3]:   555             attachments = sys.argv[3]   556         else:   557             attachments = None   558     except IndexError:   559         print >>sys.stderr, """   560 Please specify an XML file containing Wiki data, a workspace name, and an   561 optional attachments directory location. For example:   562    563 %(progname)s com_entities.xml COM attachments   564    565 Adding --no-translate will unpack the Wiki but not translate the content.   566 When doing so without an attachments directory, add an empty argument as   567 follows:   568    569 %(progname)s com_entities.xml COM '' --no-translate   570    571 An archive can be used instead of the XML file, and since this may include   572 attachments, no additional attachments directory needs to be specified:   573    574 %(progname)s COM-123456-789012.zip COM   575 """ % {"progname" : split(sys.argv[0])[-1]}   576    577         sys.exit(1)   578    579     no_translate = "--no-translate" in sys.argv   580    581     if exists(space):   582         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   583         sys.exit(1)   584    585     package_zip = space + extsep + "zip"   586    587     if exists(package_zip):   588         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   589         sys.exit(1)   590    591     mkdir(space)   592     mkdirs(join(space, "pages"))   593     mkdirs(join(space, "versions"))   594    595     p = xmlread.ConfigurableParser()   596     handler = ConfluenceHandler(space, no_translate)   597    598     # Register handlers in the parser for different elements.   599    600     p["object"] = handler.handle_object   601     p["property"] = handler.handle_property   602     p["id"] = handler.handle_id   603     p["collection"] = handler.handle_collection   604     p["element"] = handler.handle_element   605    606     # Open the XML dump.   607    608     f = open(filename)   609    610     if is_zipfile:   611         zf = ZipFile(f)   612         ff = StringIO(zf.read("entities.xml"))   613     else:   614         ff = f   615    616     # Parse the data.   617    618     try:   619         p.parse(ff)   620    621         # Tidy up the import manifests, sorting each of them by revision and   622         # finalising them.   623    624         pages_dir = join(space, "pages")   625    626         for pageid in listdir(pages_dir):   627             sort_comments(pages_dir, pageid)   628    629         output_mapping = join(space, "MAPPING")   630    631         output_manifest = join(space, "MOIN_PACKAGE")   632         append(output_manifest, "MoinMoinPackage|1\n")   633    634         for pageid in listdir(pages_dir):   635             sort_manifest(pages_dir, pageid, output_manifest, output_mapping, no_translate)   636    637         # Write the page package.   638    639         page_package = ZipFile(package_zip, "w")   640    641         try:   642             # Include the page revisions.   643    644             versions_dir = join(space, "versions")   645    646             for versionid in listdir(versions_dir):   647                 page_package.write(join(versions_dir, versionid))   648    649             # Include the attachments.   650    651             if attachments:   652                 cwd = getcwd()   653                 chdir(split(attachments)[0])   654                 try:   655                     for path, dirnames, filenames in walk(split(attachments)[1]):   656                         for filename in filenames:   657                             # Have to "taint" archive filenames.   658                             page_package.write(join(path, filename), wikiutil.taintfilename(join(path, filename)))   659                 finally:   660                     chdir(cwd)   661             elif is_zipfile:   662                 for filename in zf.namelist():   663                     if filename.startswith("attachments"):   664                         # Have to "taint" archive filenames.   665                         page_package.writestr(wikiutil.taintfilename(filename), zf.read(filename))   666    667             # Include only the top-level manifest.   668    669             page_package.write(output_manifest, "MOIN_PACKAGE")   670    671         finally:   672             page_package.close()   673    674     finally:   675         f.close()   676    677 # vim: tabstop=4 expandtab shiftwidth=4