ConfluenceConverter (file convert.py at e03ad5886778)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import listdir, mkdir, makedirs    25 from os.path import exists, extsep, join, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 import codecs    29 import xmlread    30 import parser    31     32 class ConfluenceHandler:    33     34     "Handle content from a Confluence Wiki dump."    35     36     def __init__(self, space, no_translate=False):    37         self.content = {}    38         self.elements = []    39         self.space = space    40         self.no_translate = no_translate    41     42     def handle_object(self, name, elements, attributes, all_text, text):    43     44         "Handle objects according to type."    45     46         objecttype = attributes[-1]["class"]    47         identifier = text.strip()    48         content = self.content    49     50         pages_dir = join(self.space, "pages")    51         versions_dir = join(self.space, "versions")    52     53         # Handle particular types.    54     55         if objecttype in ("Page", "Comment", "BlogPost"):    56     57             # Handle pages and revisions, adding revisions to the page manifest.    58             # The original version is used as a unifying identifier for all the    59             # different revisions (each of which being defined by a Page    60             # element). Although "original" implies the first identifier used,    61             # it actually appears to be the latest and will have the highest    62             # version number.    63     64             if content.has_key("originalVersion"):    65                 pageid = content["originalVersion"]    66             else:    67                 pageid = identifier    68     69             versionfile = join(versions_dir, identifier)    70     71             # Note page metadata, not necessarily in the correct order.    72             # For comments, the title will need to be rewritten, since they    73             # should be defined in terms of their owner page.    74     75             mkdirs(join(pages_dir, pageid))    76     77             title = content["title"]    78             if title:    79                 title = "%s/%s" % (self.space, title)    80     81             append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (    82                 content["version"],    83                 versionfile,    84                 title or content["version"], # comment titles will incorporate the version    85                 content["lastModifierName"],    86                 content["versionComment"]))    87     88             # Write comments as subpages.    89     90             if content.has_key("comments"):    91     92                 # Define a page directory for each comment, and write the page    93                 # title in a special file for later processing.    94     95                 for _comment, commentid in content["comments"]:    96                     mkdirs(join(pages_dir, commentid))    97                     append(join(pages_dir, commentid, "pagetitle"), title)    98     99             # Some metadata is not particularly relevant. For example,   100             # ancestors, children, parent are navigation-related.   101    102             # Other metadata could be added to the page content itself.   103             # For example, labelling could be converted to categories.   104    105         # Handle revisions.   106    107         elif objecttype == "BodyContent":   108             body = content["body"]   109             if not body:   110                 body = "## Empty page."   111    112             if no_translate:   113                 fn = write   114             else:   115                 fn = translate   116    117             fn(join(versions_dir, content["content"]), body)   118    119         self.content = {}   120    121     def handle_property(self, name, elements, attributes, all_text, text):   122    123         "Record properties in the current content dictionary."   124    125         self.content[attributes[-1]["name"]] = text.strip()   126    127     def handle_id(self, name, elements, attributes, all_text, text):   128    129         "Promote identifiers to the parent element's text."   130    131         all_text[-2].append(text)   132    133     def handle_collection(self, name, elements, attributes, all_text, text):   134    135         "Record collections in the current content dictionary."   136    137         self.content[attributes[-1]["name"]] = self.elements   138         self.elements = []   139    140     def handle_element(self, name, elements, attributes, all_text, text):   141    142         "Add elements to the current collection."   143    144         self.elements.append((attributes[-1]["class"], text.strip()))   145    146 def mkdirs(name):   147    148     "Make the directory with the given 'name' at any depth."   149    150     try:   151         makedirs(name)   152     except OSError:   153         pass   154    155 def append(filename, s):   156    157     "Append to the file with the given 'filename' the string 's'."   158    159     write(filename, s, True)   160    161 def write(filename, s, append=False):   162    163     """   164     Write to the file with the given 'filename' the string 's'. If the optional   165     'append' parameter is set to a true value, 's' will be appended to the file.   166     """   167    168     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   169     try:   170         f.write(s)   171     finally:   172         f.close()   173    174 def read(filename):   175    176     """   177     Read from the file with the given 'filename', returning a string containing   178     its contents.   179     """   180    181     f = codecs.open(filename, encoding="utf-8")   182     try:   183         return f.read()   184     finally:   185         f.close()   186    187 def translate(filename, body):   188    189     """   190     Write to the file with the given 'filename' a translation of the given   191     'body'.   192     """   193    194     out = codecs.open(filename, "w", encoding="utf-8")   195     try:   196         parser.parse(body, out)   197     finally:   198         out.close()   199    200 def sort_manifest(filename, pagetitle, output=None):   201    202     """   203     Sort the manifest given in 'filename' according to revision. If a   204     'pagetitle' file exists, the title column in the manifest will be augmented   205     with the contents of that file. If 'output' is given, the manifest details   206     will be appended to the file having that filename instead of being rewritten   207     to the original manifest file.   208     """   209    210     if exists(pagetitle):   211         title = read(pagetitle)   212     else:   213         title = None   214    215     f = codecs.open(filename, "r", encoding="utf-8")   216     try:   217         lines = [x.split("|") for x in f.readlines()]   218         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   219    220         # Reconstruct the lines, optionally changing the titles.   221    222         result = []   223         for x in lines:   224             if title is not None:   225                 x[3] = "%s/%s" % (title, x[3])   226             result.append("|".join(x[1:]))   227     finally:   228         f.close()   229    230     s = "".join(result)   231    232     if output is None:   233         write(filename, s)   234     else:   235         append(output, s)   236    237 if __name__ == "__main__":   238     import sys   239    240     filename = sys.argv[1]   241     is_zipfile = splitext(filename)[-1] == extsep + "zip"   242     space = sys.argv[2]   243    244     no_translate = "--no-translate" in sys.argv   245    246     if exists(space):   247         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   248         sys.exit(1)   249    250     package_zip = space + extsep + "zip"   251    252     if exists(package_zip):   253         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   254         sys.exit(1)   255    256     mkdir(space)   257     mkdirs(join(space, "pages"))   258     mkdirs(join(space, "versions"))   259    260     p = xmlread.ConfigurableParser()   261     handler = ConfluenceHandler(space, no_translate)   262    263     p["object"] = handler.handle_object   264     p["property"] = handler.handle_property   265     p["id"] = handler.handle_id   266     p["collection"] = handler.handle_collection   267     p["element"] = handler.handle_element   268    269     # Open the XML dump.   270    271     f = open(filename)   272    273     if is_zipfile:   274         zf = ZipFile(f)   275         ff = StringIO(zf.read("entities.xml"))   276     else:   277         ff = f   278    279     # Parse the data.   280    281     try:   282         p.parse(ff)   283     finally:   284         f.close()   285    286     # Tidy up the import manifests, sorting each of them by revision and   287     # finalising them.   288    289     pages_dir = join(space, "pages")   290    291     output_manifest = join(space, "MOIN_PACKAGE")   292     append(output_manifest, "MoinMoinPackage|1\n")   293    294     for pageid in listdir(pages_dir):   295         manifest = join(pages_dir, pageid, "manifest")   296         pagetitle = join(pages_dir, pageid, "pagetitle")   297         sort_manifest(manifest, pagetitle, output_manifest)   298    299     # Write the page package.   300    301     page_package = ZipFile(package_zip, "w")   302    303     try:   304         # Include the page revisions.   305    306         versions_dir = join(space, "versions")   307    308         for versionid in listdir(versions_dir):   309             page_package.write(join(versions_dir, versionid))   310    311         # Include only the top-level manifest.   312    313         page_package.write(output_manifest, "MOIN_PACKAGE")   314    315     finally:   316         page_package.close()   317    318 # vim: tabstop=4 expandtab shiftwidth=4