ConfluenceConverter (file convert.py at e565cc05d5d2)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import chdir, getcwd, listdir, mkdir, makedirs, walk    25 from os.path import exists, extsep, join, split, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 from MoinMoin import wikiutil    29 import codecs    30 import xmlread    31 import wikiparser, xmlparser    32 import sys    33 import time, calendar    34     35 from common import get_page_title    36     37 def date_to_seconds(s):    38     return calendar.timegm(time.strptime(s.split(".", 1)[0], "%Y-%m-%d %H:%M:%S"))    39     40 class ConfluenceHandler:    41     42     "Handle content from a Confluence Wiki dump."    
43     44     def __init__(self, space, no_translate=False):    45         self.content = {}    46         self.elements = []    47         self.space = space    48         self.no_translate = no_translate    49     50     def handle_object(self, name, elements, attributes, all_text, text):    51     52         """    53         Handle objects according to type. Objects appear as follows:    54     55         <object class="Page" package="...">    56         <id name="id">...</id>    57         ...    58         </object>    59     60         Within objects, one finds things like properties and collections, which    61         are handled by their own methods but which are stored in the content    62         dictionary associated with the current object.    63     64         By the time this method is called, the contents of the object will have    65         been gathered and the properties and collections populated in the    66         content dictionary. Any identifier will have been assigned to the    67         textual content of the object element and will be available in the    68         'text' parameter.    69         """    70     71         objecttype = attributes[-1]["class"]    72     73         # Any identifier is stored as the object's textual content.    74     75         identifier = text.strip()    76     77         # The content is a dictionary mapping names to properties and    78         # collections.    79     80         content = self.content    81     82         pages_dir = join(self.space, "pages")    83         versions_dir = join(self.space, "versions")    84     85         # Handle particular types.    86     87         if objecttype in ("Page", "Comment", "BlogPost"):    88     89             # Handle pages and revisions, adding revisions to the page manifest.    
90             # The original version is used as a unifying identifier for all the    91             # different revisions (each of which being defined by a Page    92             # element). Although "original" implies the first identifier used,    93             # it actually appears to be the latest and will have the highest    94             # version number.    95     96             if content.has_key("originalVersion"):    97                 pageid = content["originalVersion"]    98             else:    99                 pageid = identifier   100    101             versionfile = join(versions_dir, identifier)   102    103             # Note page metadata, not necessarily in the correct order.   104             # For comments, the title will need to be rewritten, since they   105             # should be defined in terms of their owner page.   106    107             # NOTE: This only makes the current title available to comments.   108    109             mkdirs(join(pages_dir, pageid))   110    111             title = content["title"]   112    113             # Limit the title to a "safe" number of characters in order to avoid   114             # filesystem issues.   115    116             title = get_page_title(title)   117    118             if title:   119                 title = "%s/%s" % (self.space, title)   120                 write(join(pages_dir, pageid, "pagetitle"), title)   121    122             # Note the type of the page.   123    124             write(join(pages_dir, pageid, "pagetype"), objecttype)   125    126             # See sort_manifest for access to this data.   
127    128             append(join(pages_dir, pageid, "manifest"),   129                 "%s|AddRevision|_|%s|%s|%s|%s|%d\n" % ( # blank added for consistency with AddAttachment   130                     content["version"],   131                     versionfile,   132                     title, # comment titles will incorporate the comment's position   133                     content["lastModifierName"],   134                     content["versionComment"],   135                     date_to_seconds(content["lastModificationDate"])   136                 ))   137    138             # Add information to parent pages for child page lists.   139    140             if content.has_key("parent"):   141                 parentid = content["parent"]   142                 mkdirs(join(pages_dir, parentid))   143                 append(join(pages_dir, parentid, "children"), title + "\n")   144    145             # Add creation details for comments to the owner page.   146             # Since comments can be versioned, the date of the original version   147             # is used, and only this "original" version has the owner property.   148    149             if objecttype == "Comment" and content.has_key("owner"):   150                 ownerid = content["owner"]   151                 mkdirs(join(pages_dir, ownerid))   152                 append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))   153    154             # Some metadata is not particularly relevant. For example,   155             # ancestors, children, parent are navigation-related.   156    157             # Other metadata could be added to the page content itself.   158             # For example, labelling could be converted to categories.   159    160         # Handle revisions.   161    162         elif objecttype == "BodyContent":   163             body = content["body"]   164             if not body:   165                 body = "## Empty page."   
166    167             is_comment_page = content.get("content:class") == "Comment"   168    169             # NOTE: Very simple technique employed for guessing the format.   170    171             if no_translate:   172                 fn = notranslate   173             elif body.startswith("<"):   174                 fn = xmltranslate   175             else:   176                 fn = translate   177    178             try:   179                 fn(join(versions_dir, content["content"]), body, is_comment_page)   180             except:   181                 err = codecs.getwriter("utf-8")(sys.stderr)   182                 print >>err, "Error parsing", content["content"]   183                 raise   184    185         # Handle attachments.   186    187         elif objecttype == "Attachment":   188             pageid = content["content"]   189             version = content["attachmentVersion"]   190    191             if content.has_key("originalVersion"):   192                 attachid = content["originalVersion"]   193             else:   194                 attachid = identifier   195    196             append(join(pages_dir, pageid, "attachments"),   197                 "%s|AddAttachment|%s|%s|%s|%s|%s|%d\n" % (   198                     version,   199                     # Have to "taint" archive filenames, although Moin will   200                     # probably handle package script filename tainting.   
201                     wikiutil.taintfilename(join("attachments", pageid, attachid, version)),   202                     wikiutil.taintfilename(content["fileName"]),   203                     "", # pagename is substituted later   204                     content["lastModifierName"],   205                     content["comment"],   206                     date_to_seconds(content["lastModificationDate"])   207                 ))   208    209         self.content = {}   210    211     def handle_property(self, name, elements, attributes, all_text, text):   212    213         "Record properties in the current content dictionary."   214    215         property_name = attributes[-1]["name"]   216         self.content[property_name] = text.strip()   217    218         property_class = attributes[-1].get("class")   219         if property_class:   220             self.content["%s:%s" % (property_name, "class")] = property_class.strip()   221    222     def handle_id(self, name, elements, attributes, all_text, text):   223    224         "Promote identifiers to the parent element's text."   225    226         all_text[-2].append(text)   227    228     def handle_collection(self, name, elements, attributes, all_text, text):   229    230         "Record collections in the current content dictionary."   231    232         self.content[attributes[-1]["name"]] = self.elements   233         self.elements = []   234    235     def handle_element(self, name, elements, attributes, all_text, text):   236    237         "Add elements to the current collection."   238    239         self.elements.append((attributes[-1]["class"], text.strip()))   240    241 def mkdirs(name):   242    243     "Make the directory with the given 'name' at any depth."   244    245     try:   246         makedirs(name)   247     except OSError:   248         pass   249    250 def append(filename, s):   251    252     "Append to the file with the given 'filename' the string 's'."   
253    254     write(filename, s, True)   255    256 def write(filename, s, append=False):   257    258     """   259     Write to the file with the given 'filename' the string 's'. If the optional   260     'append' parameter is set to a true value, 's' will be appended to the file.   261     """   262    263     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   264     try:   265         f.write(s)   266     finally:   267         f.close()   268    269 def read(filename):   270    271     """   272     Read from the file with the given 'filename', returning a string containing   273     its contents.   274     """   275    276     f = codecs.open(filename, encoding="utf-8")   277     try:   278         return f.read()   279     finally:   280         f.close()   281    282 def translate(filename, body, is_comment_page, fn=None):   283    284     """   285     Write to the file with the given 'filename' a translation of the given   286     'body'.   287     """   288    289     fn = fn or wikiparser.parse   290    291     out = codecs.open(filename, "w", encoding="utf-8")   292     try:   293         print >>out, "#pragma page-filename", filename   294         fn(body, out, is_comment_page)   295     finally:   296         out.close()   297    298 def xmltranslate(filename, body, is_comment_page):   299     translate(filename, body, is_comment_page, xmlparser.parse)   300    301 def notranslate(filename, body, is_comment_page):   302     write(filename, body)   303    304 def sort_comments(pages_dir, pageid):   305    306     """   307     Where 'pageid' has comments associated with it, sort them chronologically   308     and label the comment pages with the owner page's title and comment's   309     position in the chronological sequence. Such labelling is done by writing   310     a "pagetitle" file in each comment page's directory.   
311     """   312    313     comments = join(pages_dir, pageid, "comments")   314    315     if not exists(comments):   316         return   317    318     title = read(join(pages_dir, pageid, "pagetitle"))   319    320     details = [line.split("|") for line in read(comments).split("\n") if line]   321     details.sort()   322    323     # Write the sorted comments list for testing purposes.   324    325     write(comments, "\n".join(["|".join(x) for x in details]))   326    327     # Define comments as subpages by setting their titles using this   328     # page's name/title and their position in the comments collection.   329    330     for position, (_lastmodified, commentid) in enumerate(details):   331    332         # In the page directory for each comment, write the page title in a   333         # special file for later processing.   334    335         write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))   336    337 def _manifest_to_mapping(manifest, output_mapping):   338    339     """   340     Open the given 'manifest' and write a mapping from version identifiers to   341     page names/titles to the file with the given 'output_mapping' filename.   342     """   343    344     f = codecs.open(manifest, "r", encoding="utf-8")   345     try:   346         mapping = []   347    348         lines = [x.split("|") for x in f.readlines()]   349         for line in lines:   350             version, _action, _archive_filename, filename, title, username, comment, mtime = line   351             if title:   352                 mapping.append((split(filename)[-1], title))   353    354         append(output_mapping, "".join([("\t".join(x) + "\n") for x in mapping]))   355    356     finally:   357         f.close()   358    359 def _sort_manifest(manifest, title):   360    361     """   362     Open the given 'manifest' and sort it according to revision so that it will   363     be added to MoinMoin in the correct order.   
364    365     If a 'title' is provided, the title column in the manifest will be augmented   366     with that information. This is typically done for comments and is necessary   367     for attachments.   368    369     A list of manifest entries is returned.   370     """   371    372     f = codecs.open(manifest, "r", encoding="utf-8")   373     try:   374         lines = [x.rstrip("\n").split("|") for x in f.readlines()]   375         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   376    377         # Reconstruct the lines, optionally changing the titles.   378    379         result = []   380    381         for line in lines:   382             version, _action, _archive_filename, filename, old_title, username, comment, mtime = line   383    384             # Replace title information with the information already present.   385    386             if not old_title:   387                 new_title = title   388             else:   389                 new_title = old_title   390    391             # The version is omitted now that the manifest is ordered.   392    393             line = _action, _archive_filename, filename, new_title, username, comment, mtime   394             result.append(line)   395    396         return result   397    398     finally:   399         f.close()   400    401 def serialise_manifest(manifest):   402    403     """   404     Process the 'manifest' consisting of entries, removing superfluous columns.   
405     """   406    407     result = []   408    409     for columns in manifest:   410         action = columns[0]   411         if action == "AddRevision":   412             columns = list(columns)   413             del columns[1]   414         result.append("|".join(columns) + "\n")   415    416     return "".join(result)   417                418 def sort_manifest(pages_dir, pageid, output_mapping=None, no_translate=False):   419    420     """   421     Using the given 'pageid', locate the manifest for the page and any page   422     title information written to a "pagetitle" file.   423    424     Then sort the manifest according to revision so that historical operations   425     such as page renaming can be detected.   426    427     If a "pagetitle" file exists, the title column in the manifest will be   428     augmented with the contents of that file. This is typically done for   429     comments.   430    431     If a "children" file exists, the pages in that file will be added as a list   432     to the end of each revision's content.   433    434     If 'output_mapping' is given, a mapping from version identifiers to page   435     titles will be appended to the file having that filename.   436     """   437    438     pagetype = join(pages_dir, pageid, "pagetype")   439     manifest = join(pages_dir, pageid, "manifest")   440     attachments = join(pages_dir, pageid, "attachments")   441     pagetitle = join(pages_dir, pageid, "pagetitle")   442     children = join(pages_dir, pageid, "children")   443     comments = join(pages_dir, pageid, "comments")   444    445     type = exists(pagetype) and read(pagetype) or None   446    447     if exists(pagetitle):   448         title = read(pagetitle)   449         space, _page_name = get_space_and_name(title)   450     else:   451         title = space = None   452    453     # Sort the revision manifest.   
454    455     result = _sort_manifest(manifest, title)   456    457     # Output a mapping of identifiers to page names.   458    459     if output_mapping:   460         _manifest_to_mapping(manifest, output_mapping)   461    462     # Modify the content to include child pages and comments.   463    464     last_title = None   465     final_result = []   466    467     for details in result:   468         _action, _archive_filename, filename, new_title, username, comment, mtime = details   469    470         # Detect renamed pages and add a redirect revision.   471    472         if last_title and last_title != new_title and _action == "AddRevision":   473             renaming_versionfile = filename + ".rename"   474             final_result.append((_action, "_", renaming_versionfile, last_title, username, "Page renamed to %s" % new_title, mtime))   475             write(renaming_versionfile, "#REDIRECT %s" % new_title)   476    477         last_title = new_title   478    479         # Add this revision to the manifest.   480    481         final_result.append(details)   482    483         # Obtain the text only if modifications are to be made.   484    485         text = None   486    487         # Add an ACL to comment pages so that people cannot change other   488         # people's comments.   489         # NOTE: This should match the PostComment action.   490    491         if type == "Comment":   492             text = """\   493 #acl %s:read,write,delete,revert All:read   494 #pragma comment-owner %s   495 %s""" % (username, username, text or read(filename))   496    497         # Add child page information to the content.   498    499         if exists(children) and not no_translate:   500             child_pages = []   501             child_page_names = [x for x in read(children).split("\n") if x]   502             child_page_names.sort()   503    504             # Produce links which hide the space prefix.   
505    506             for child_page_name in child_page_names:   507                 child_space, page_name = get_space_and_name(child_page_name)   508                 if child_space == space:   509                     child_page_label = page_name   510                 else:   511                     child_page_label = child_page_name   512    513                 child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))   514    515             text = (text or read(filename)) + child_page_section % "\n".join(child_pages)   516    517         # Add comments to the content.   518    519         if exists(comments) and title and not no_translate:   520             text = (text or read(filename)) + comment_section   521    522         # Rewrite the file if necessary.   523    524         if text:   525             write(filename, text)   526    527     # Add the attachments to the manifest.   528    529     if exists(attachments):   530         final_result += _sort_manifest(attachments, title)   531    532     return final_result   533    534 def sort_final_manifest(entries, output):   535    536     """   537     Sort the manifest 'entries' by last modified time and serialise it.   538     The manifest details will be appended to the file named by 'output'.   539     """   540    541     # The final entry in each element is the mtime.   542    543     entries.sort(cmp=lambda x, y: cmp(int(x[-1]), int(y[-1])))   544    545     # Serialise the manifest.   546    547     s = serialise_manifest(entries)   548     append(output, s)   549    550 def get_space_and_name(page_name):   551     try:   552         return page_name.split("/", 1)   553     except IndexError:   554         return None, page_name   555    556 # Template for child page information.   557    558 child_page_section = """   559 ----   560    561 %s   562 """   563    564 # Template for comments.   
comment_section = """
----

<<IncludeComments>>
"""

# Main program.

if __name__ == "__main__":

    # Obtain the dump filename, the space name and an optional attachments
    # directory from the command line.

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]

        # An empty third argument, or the option itself appearing in that
        # position, means that no attachments directory was given.

        if len(sys.argv) > 3 and sys.argv[3] and sys.argv[3] != "--no-translate":
            attachments = sys.argv[3]
        else:
            attachments = None
    except IndexError:
        print >>sys.stderr, """
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

%(progname)s com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

%(progname)s com_entities.xml COM '' --no-translate

An archive can be used instead of the XML file, and since this may include
attachments, no additional attachments directory needs to be specified:

%(progname)s COM-123456-789012.zip COM
""" % {"progname" : split(sys.argv[0])[-1]}

        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    # Refuse to overwrite the results of a previous conversion.

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is used because the file may be a zip
    # archive.

    f = open(filename, "rb")

    try:
        # Any failure when reading the archive occurs inside this block so
        # that the file is always closed.

        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_mapping = join(space, "MAPPING")

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        entries = []

        for pageid in listdir(pages_dir):
            entries += sort_manifest(pages_dir, pageid, output_mapping, no_translate)

        sort_final_manifest(entries, output_manifest)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:
                cwd = getcwd()

                # Work from the parent of the attachments directory so that
                # archive names start with the directory's own name. A bare
                # directory name has an empty parent, which must be mapped to
                # the current directory since chdir rejects an empty path.

                parent, dirname = split(attachments)
                chdir(parent or ".")
                try:
                    for path, dirnames, filenames in walk(dirname):
                        for attachment_name in filenames:
                            attachment_path = join(path, attachment_name)

                            # Have to "taint" archive filenames.
                            page_package.write(attachment_path, wikiutil.taintfilename(attachment_path))
                finally:
                    chdir(cwd)
            elif is_zipfile:
                for member in zf.namelist():
                    if member.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member), zf.read(member))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4