ConfluenceConverter (file convert.py at a95675d52731)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import chdir, getcwd, listdir, mkdir, makedirs, walk    25 from os.path import exists, extsep, join, split, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 from MoinMoin import wikiutil    29 import codecs    30 import xmlread    31 import wikiparser, xmlparser    32 import sys    33     34 from common import get_page_title    35     36 class ConfluenceHandler:    37     38     "Handle content from a Confluence Wiki dump."    39     40     def __init__(self, space, no_translate=False):    41         self.content = {}    42         self.elements = []    43         self.space = space    44         self.no_translate = no_translate    45     46     def handle_object(self, name, elements, attributes, all_text, text):    47     48         """    49         Handle objects according to type. Objects appear as follows:    50     51         <object class="Page" package="...">    52         <id name="id">...</id>    53         ...    54         </object>    55     56         Within objects, one finds things like properties and collections, which    57         are handled by their own methods but which are stored in the content    58         dictionary associated with the current object.    59     60         By the time this method is called, the contents of the object will have    61         been gathered and the properties and collections populated in the    62         content dictionary. Any identifier will have been assigned to the    63         textual content of the object element and will be available in the    64         'text' parameter.    65         """    66     67         objecttype = attributes[-1]["class"]    68     69         # Any identifier is stored as the object's textual content.    70     71         identifier = text.strip()    72     73         # The content is a dictionary mapping names to properties and    74         # collections.    75     76         content = self.content    77     78         pages_dir = join(self.space, "pages")    79         versions_dir = join(self.space, "versions")    80     81         # Handle particular types.    82     83         if objecttype in ("Page", "Comment", "BlogPost"):    84     85             # Handle pages and revisions, adding revisions to the page manifest.    86             # The original version is used as a unifying identifier for all the    87             # different revisions (each of which being defined by a Page    88             # element). Although "original" implies the first identifier used,    89             # it actually appears to be the latest and will have the highest    90             # version number.    91     92             if content.has_key("originalVersion"):    93                 pageid = content["originalVersion"]    94             else:    95                 pageid = identifier    96     97             versionfile = join(versions_dir, identifier)    98     99             # Note page metadata, not necessarily in the correct order.   100             # For comments, the title will need to be rewritten, since they   101             # should be defined in terms of their owner page.   102    103             # NOTE: This only makes the current title available to comments.   104    105             mkdirs(join(pages_dir, pageid))   106    107             title = content["title"]   108    109             # Limit the title to a "safe" number of characters in order to avoid   110             # filesystem issues.   111    112             title = get_page_title(title)   113    114             if title:   115                 title = "%s/%s" % (self.space, title)   116                 write(join(pages_dir, pageid, "pagetitle"), title)   117    118             # See sort_manifest for access to this data.   119    120             append(join(pages_dir, pageid, "manifest"),   121                 "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment   122                     content["version"],   123                     versionfile,   124                     title, # comment titles will incorporate the comment's position   125                     content["lastModifierName"],   126                     content["versionComment"]   127                 ))   128    129             # Add information to parent pages for child page lists.   130    131             if content.has_key("parent"):   132                 parentid = content["parent"]   133                 mkdirs(join(pages_dir, parentid))   134                 append(join(pages_dir, parentid, "children"), title + "\n")   135    136             # Add creation details for comments to the owner page.   137             # Since comments can be versioned, the date of the original version   138             # is used, and only this "original" version has the owner property.   139    140             if objecttype == "Comment" and content.has_key("owner"):   141                 ownerid = content["owner"]   142                 mkdirs(join(pages_dir, ownerid))   143                 append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))   144    145             # Some metadata is not particularly relevant. For example,   146             # ancestors, children, parent are navigation-related.   147    148             # Other metadata could be added to the page content itself.   149             # For example, labelling could be converted to categories.   150    151         # Handle revisions.   152    153         elif objecttype == "BodyContent":   154             body = content["body"]   155             if not body:   156                 body = "## Empty page."   157    158             # NOTE: Very simple technique employed for guessing the format.   159    160             if no_translate:   161                 fn = write   162             elif body.startswith("<"):   163                 fn = xmltranslate   164             else:   165                 fn = translate   166    167             try:   168                 fn(join(versions_dir, content["content"]), body)   169             except:   170                 err = codecs.getwriter("utf-8")(sys.stderr)   171                 print >>err, "Error parsing", content["content"]   172                 raise   173    174         # Handle attachments.   175    176         elif objecttype == "Attachment":   177             pageid = content["content"]   178             version = content["attachmentVersion"]   179    180             if content.has_key("originalVersion"):   181                 attachid = content["originalVersion"]   182             else:   183                 attachid = identifier   184    185             append(join(pages_dir, pageid, "attachments"),   186                 "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (   187                     version,   188                     # Have to "taint" archive filenames, although Moin will   189                     # probably handle package script filename tainting.   190                     wikiutil.taintfilename(join("attachments", pageid, attachid, version)),   191                     wikiutil.taintfilename(content["fileName"]),   192                     "", # pagename is substituted later   193                     content["lastModifierName"],   194                     content["comment"]   195                 ))   196    197         self.content = {}   198    199     def handle_property(self, name, elements, attributes, all_text, text):   200    201         "Record properties in the current content dictionary."   202    203         self.content[attributes[-1]["name"]] = text.strip()   204    205     def handle_id(self, name, elements, attributes, all_text, text):   206    207         "Promote identifiers to the parent element's text."   208    209         all_text[-2].append(text)   210    211     def handle_collection(self, name, elements, attributes, all_text, text):   212    213         "Record collections in the current content dictionary."   214    215         self.content[attributes[-1]["name"]] = self.elements   216         self.elements = []   217    218     def handle_element(self, name, elements, attributes, all_text, text):   219    220         "Add elements to the current collection."   221    222         self.elements.append((attributes[-1]["class"], text.strip()))   223    224 def mkdirs(name):   225    226     "Make the directory with the given 'name' at any depth."   227    228     try:   229         makedirs(name)   230     except OSError:   231         pass   232    233 def append(filename, s):   234    235     "Append to the file with the given 'filename' the string 's'."   236    237     write(filename, s, True)   238    239 def write(filename, s, append=False):   240    241     """   242     Write to the file with the given 'filename' the string 's'. If the optional   243     'append' parameter is set to a true value, 's' will be appended to the file.   244     """   245    246     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   247     try:   248         f.write(s)   249     finally:   250         f.close()   251    252 def read(filename):   253    254     """   255     Read from the file with the given 'filename', returning a string containing   256     its contents.   257     """   258    259     f = codecs.open(filename, encoding="utf-8")   260     try:   261         return f.read()   262     finally:   263         f.close()   264    265 def translate(filename, body, fn=None):   266    267     """   268     Write to the file with the given 'filename' a translation of the given   269     'body'.   270     """   271    272     fn = fn or wikiparser.parse   273    274     out = codecs.open(filename, "w", encoding="utf-8")   275     try:   276         print >>out, "#pragma page-filename", filename   277         fn(body, out)   278     finally:   279         out.close()   280    281 def xmltranslate(filename, body):   282     translate(filename, body, xmlparser.parse)   283    284 def sort_comments(pages_dir, pageid):   285    286     """   287     Where 'pageid' has comments associated with it, sort them chronologically   288     and label the comment pages with the owner page's title and comment's   289     position in the chronological sequence. Such labelling is done by writing   290     a "pagetitle" file in each comment page's directory.   291     """   292    293     comments = join(pages_dir, pageid, "comments")   294    295     if not exists(comments):   296         return   297    298     title = read(join(pages_dir, pageid, "pagetitle"))   299    300     details = [line.split("|") for line in read(comments).split("\n") if line]   301     details.sort()   302    303     # Write the sorted comments list for testing purposes.   304    305     write(comments, "\n".join(["|".join(x) for x in details]))   306    307     # Define comments as subpages by setting their titles using this   308     # page's name/title and their position in the comments collection.   309    310     for position, (_lastmodified, commentid) in enumerate(details):   311    312         # In the page directory for each comment, write the page title in a   313         # special file for later processing.   314    315         write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))   316    317 def _manifest_to_mapping(manifest, output_mapping):   318    319     """   320     Open the given 'manifest' and write a mapping from version identifiers to   321     page names/titles to the file with the given 'output_mapping' filename.   322     """   323    324     f = codecs.open(manifest, "r", encoding="utf-8")   325     try:   326         mapping = []   327    328         lines = [x.split("|") for x in f.readlines()]   329         for line in lines:   330             version, _action, _archive_filename, filename, title, username, comment = line   331             if title:   332                 mapping.append((split(filename)[-1], title))   333    334         append(output_mapping, "".join([("\t".join(x) + "\n") for x in mapping]))   335    336     finally:   337         f.close()   338    339 def _sort_manifest(manifest, title):   340    341     """   342     Open the given 'manifest' and sort it according to revision so that it will   343     be added to MoinMoin in the correct order.   344    345     If a 'title' is provided, the title column in the manifest will be augmented   346     with that information. This is typically done for comments and is necessary   347     for attachments.   348    349     A list of manifest entries is returned.   350     """   351    352     f = codecs.open(manifest, "r", encoding="utf-8")   353     try:   354         lines = [x.split("|") for x in f.readlines()]   355         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   356    357         # Reconstruct the lines, optionally changing the titles.   358    359         result = []   360    361         for line in lines:   362             version, _action, _archive_filename, filename, old_title, username, comment = line   363    364             # Replace title information with the information already present.   365    366             if not old_title:   367                 new_title = title   368             else:   369                 new_title = old_title   370    371             # The version is omitted now that the manifest is ordered.   372    373             line = _action, _archive_filename, filename, new_title, username, comment   374             result.append(line)   375    376         return result   377    378     finally:   379         f.close()   380    381 def serialise_manifest(manifest):   382    383     """   384     Process the 'manifest' consisting of entries, removing superfluous columns.   385     """   386    387     result = []   388    389     for columns in manifest:   390         action = columns[0]   391         if action == "AddRevision":   392             columns = list(columns)   393             del columns[1]   394         result.append("|".join(columns))   395    396     return "".join(result)   397                398 def sort_manifest(pages_dir, pageid, output=None, output_mapping=None, no_translate=False):   399    400     """   401     Using the given 'pageid', locate the manifest for the page and any page   402     title information written to a "pagetitle" file.   403    404     Then sort the manifest according to revision so that it will be added to   405     MoinMoin in the correct order.   406    407     If a "pagetitle" file exists, the title column in the manifest will be   408     augmented with the contents of that file. This is typically done for   409     comments.   410    411     If a "children" file exists, the pages in that file will be added as a list   412     to the end of each revision's content.   413    414     If 'output' is given, the manifest details will be appended to the file   415     having that filename instead of being rewritten to the original manifest   416     file.   417    418     If 'output_mapping' is given, a mapping from version identifiers to page   419     titles will be appended to the file having that filename.   420     """   421    422     manifest = join(pages_dir, pageid, "manifest")   423     attachments = join(pages_dir, pageid, "attachments")   424     pagetitle = join(pages_dir, pageid, "pagetitle")   425     children = join(pages_dir, pageid, "children")   426     comments = join(pages_dir, pageid, "comments")   427    428     if exists(pagetitle):   429         title = read(pagetitle)   430         space, _page_name = get_space_and_name(title)   431     else:   432         title = space = None   433    434     # Sort the revision manifest.   435    436     result = _sort_manifest(manifest, title)   437    438     # Output a mapping of identifiers to page names.   439    440     if output_mapping:   441         _manifest_to_mapping(manifest, output_mapping)   442    443     # Modify the content to include child pages and comments.   444    445     for _action, _archive_filename, filename, new_title, username, comment in result:   446    447         # Add child page information to the content.   448    449         if exists(children) and not no_translate:   450             child_pages = []   451             child_page_names = [x for x in read(children).split("\n") if x]   452             child_page_names.sort()   453    454             # Produce links which hide the space prefix.   455    456             for child_page_name in child_page_names:   457                 child_space, page_name = get_space_and_name(child_page_name)   458                 if child_space == space:   459                     child_page_label = page_name   460                 else:   461                     child_page_label = child_page_name   462    463                 child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))   464    465             append(filename, child_page_section % "\n".join(child_pages))   466    467         # Add comments to the content.   468    469         if exists(comments) and title and not no_translate:   470             append(filename, comment_section % title)   471    472     # Add the attachments to the manifest.   473    474     if exists(attachments):   475         result += _sort_manifest(attachments, title)   476    477     # Serialise the manifest.   478    479     s = serialise_manifest(result)   480    481     if output is None:   482         write(manifest, s)   483     else:   484         append(output, s)   485    486 def get_space_and_name(page_name):   487     try:   488         return page_name.split("/", 1)   489     except IndexError:   490         return None, page_name   491    492 # Template for child page information.   493    494 child_page_section = """   495 ----   496    497 %s   498 """   499    500 # Template for comments.   501    502 comment_section = """   503 ----   504    505 <<Include("^%s/")>>   506 """   507    508 # Main program.   509    510 if __name__ == "__main__":   511     try:   512         filename = sys.argv[1]   513         is_zipfile = splitext(filename)[-1] == extsep + "zip"   514         space = sys.argv[2]   515         if len(sys.argv) > 3 and sys.argv[3]:   516             attachments = sys.argv[3]   517         else:   518             attachments = None   519     except IndexError:   520         print >>sys.stderr, """   521 Please specify an XML file containing Wiki data, a workspace name, and an   522 optional attachments directory location. For example:   523    524 com_entities.xml COM attachments   525    526 Adding --no-translate will unpack the Wiki but not translate the content.   527 When doing so without an attachments directory, add an empty argument as   528 follows:   529    530 com_entities.xml COM '' --no-translate   531 """   532         sys.exit(1)   533    534     no_translate = "--no-translate" in sys.argv   535    536     if exists(space):   537         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   538         sys.exit(1)   539    540     package_zip = space + extsep + "zip"   541    542     if exists(package_zip):   543         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   544         sys.exit(1)   545    546     mkdir(space)   547     mkdirs(join(space, "pages"))   548     mkdirs(join(space, "versions"))   549    550     p = xmlread.ConfigurableParser()   551     handler = ConfluenceHandler(space, no_translate)   552    553     # Register handlers in the parser for different elements.   554    555     p["object"] = handler.handle_object   556     p["property"] = handler.handle_property   557     p["id"] = handler.handle_id   558     p["collection"] = handler.handle_collection   559     p["element"] = handler.handle_element   560    561     # Open the XML dump.   562    563     f = open(filename)   564    565     if is_zipfile:   566         zf = ZipFile(f)   567         ff = StringIO(zf.read("entities.xml"))   568     else:   569         ff = f   570    571     # Parse the data.   572    573     try:   574         p.parse(ff)   575    576         # Tidy up the import manifests, sorting each of them by revision and   577         # finalising them.   578    579         pages_dir = join(space, "pages")   580    581         for pageid in listdir(pages_dir):   582             sort_comments(pages_dir, pageid)   583    584         output_mapping = join(space, "MAPPING")   585    586         output_manifest = join(space, "MOIN_PACKAGE")   587         append(output_manifest, "MoinMoinPackage|1\n")   588    589         for pageid in listdir(pages_dir):   590             sort_manifest(pages_dir, pageid, output_manifest, output_mapping, no_translate)   591    592         # Write the page package.   593    594         page_package = ZipFile(package_zip, "w")   595    596         try:   597             # Include the page revisions.   598    599             versions_dir = join(space, "versions")   600    601             for versionid in listdir(versions_dir):   602                 page_package.write(join(versions_dir, versionid))   603    604             # Include the attachments.   605    606             if attachments:   607                 cwd = getcwd()   608                 chdir(split(attachments)[0])   609                 try:   610                     for path, dirnames, filenames in walk(split(attachments)[1]):   611                         for filename in filenames:   612                             # Have to "taint" archive filenames.   613                             page_package.write(join(path, filename), wikiutil.taintfilename(join(path, filename)))   614                 finally:   615                     chdir(cwd)   616             elif is_zipfile:   617                 for filename in zf.namelist():   618                     if filename.startswith("attachments"):   619                         # Have to "taint" archive filenames.   620                         page_package.writestr(wikiutil.taintfilename(filename), zf.read(filename))   621    622             # Include only the top-level manifest.   623    624             page_package.write(output_manifest, "MOIN_PACKAGE")   625    626         finally:   627             page_package.close()   628    629     finally:   630         f.close()   631    632 # vim: tabstop=4 expandtab shiftwidth=4