ConfluenceConverter (file convert.py at 1d1cc4d70e12)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import listdir, mkdir, makedirs    25 from os.path import exists, extsep, join, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 import codecs    29 import xmlread    30 import parser    31 import sys    32     33 MAX_TITLE_LENGTH = 120    34     35 class ConfluenceHandler:    36     37     "Handle content from a Confluence Wiki dump."    38     39     def __init__(self, space, no_translate=False):    40         self.content = {}    41         self.elements = []    42         self.space = space    43         self.no_translate = no_translate    44     45     def handle_object(self, name, elements, attributes, all_text, text):    46     47         "Handle objects according to type."    48     49         objecttype = attributes[-1]["class"]    50     51         # Any identifier is stored as the object's textual content.    52     53         identifier = text.strip()    54     55         # The content is a dictionary mapping names to properties and    56         # collections.    57     58         content = self.content    59     60         pages_dir = join(self.space, "pages")    61         versions_dir = join(self.space, "versions")    62     63         # Handle particular types.    64     65         if objecttype in ("Page", "Comment", "BlogPost"):    66     67             # Handle pages and revisions, adding revisions to the page manifest.    68             # The original version is used as a unifying identifier for all the    69             # different revisions (each of which being defined by a Page    70             # element). Although "original" implies the first identifier used,    71             # it actually appears to be the latest and will have the highest    72             # version number.    73     74             if content.has_key("originalVersion"):    75                 pageid = content["originalVersion"]    76             else:    77                 pageid = identifier    78     79             versionfile = join(versions_dir, identifier)    80     81             # Note page metadata, not necessarily in the correct order.    82             # For comments, the title will need to be rewritten, since they    83             # should be defined in terms of their owner page.    84     85             mkdirs(join(pages_dir, pageid))    86     87             title = content["title"]    88     89             # Limit the title to a "safe" number of characters in order to avoid    90             # filesystem issues.    91     92             title = title[:MAX_TITLE_LENGTH]    93     94             if title:    95                 title = "%s/%s" % (self.space, title)    96     97             # See sort_manifest for access to this data.    98     99             append(join(pages_dir, pageid, "manifest"),   100                 "%s|AddRevision|%s|%s|%s|%s\n" % (   101                     content["version"],   102                     versionfile,   103                     title or content["version"], # comment titles will incorporate the version   104                     content["lastModifierName"],   105                     content["versionComment"]   106                 ))   107    108             # Write comments as subpages.   109    110             if content.has_key("comments"):   111    112                 # Define a page directory for each comment, and write the page   113                 # title in a special file for later processing.   114    115                 for _comment, commentid in content["comments"]:   116                     mkdirs(join(pages_dir, commentid))   117                     append(join(pages_dir, commentid, "pagetitle"), title)   118    119             # Add information to parent pages for child page lists.   120    121             if content.has_key("parent"):   122                 parentid = content["parent"]   123                 mkdirs(join(pages_dir, parentid))   124                 append(join(pages_dir, parentid, "children"), title + "\n")   125    126             # Some metadata is not particularly relevant. For example,   127             # ancestors, children, parent are navigation-related.   128    129             # Other metadata could be added to the page content itself.   130             # For example, labelling could be converted to categories.   131    132         # Handle revisions.   133    134         elif objecttype == "BodyContent":   135             body = content["body"]   136             if not body:   137                 body = "## Empty page."   138    139             # NOTE: Very simple technique employed for guessing the format.   140    141             if no_translate:   142                 fn = write   143             elif body.startswith("<"):   144                 fn = xmltranslate   145             else:   146                 fn = translate   147    148             try:   149                 fn(join(versions_dir, content["content"]), body)   150             except:   151                 print >>sys.stderr, "Error parsing..."   152                 print >>sys.stderr, body   153                 raise   154    155         self.content = {}   156    157     def handle_property(self, name, elements, attributes, all_text, text):   158    159         "Record properties in the current content dictionary."   160    161         self.content[attributes[-1]["name"]] = text.strip()   162    163     def handle_id(self, name, elements, attributes, all_text, text):   164    165         "Promote identifiers to the parent element's text."   166    167         all_text[-2].append(text)   168    169     def handle_collection(self, name, elements, attributes, all_text, text):   170    171         "Record collections in the current content dictionary."   172    173         self.content[attributes[-1]["name"]] = self.elements   174         self.elements = []   175    176     def handle_element(self, name, elements, attributes, all_text, text):   177    178         "Add elements to the current collection."   179    180         self.elements.append((attributes[-1]["class"], text.strip()))   181    182 def mkdirs(name):   183    184     "Make the directory with the given 'name' at any depth."   185    186     try:   187         makedirs(name)   188     except OSError:   189         pass   190    191 def append(filename, s):   192    193     "Append to the file with the given 'filename' the string 's'."   194    195     write(filename, s, True)   196    197 def write(filename, s, append=False):   198    199     """   200     Write to the file with the given 'filename' the string 's'. If the optional   201     'append' parameter is set to a true value, 's' will be appended to the file.   202     """   203    204     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   205     try:   206         f.write(s)   207     finally:   208         f.close()   209    210 def read(filename):   211    212     """   213     Read from the file with the given 'filename', returning a string containing   214     its contents.   215     """   216    217     f = codecs.open(filename, encoding="utf-8")   218     try:   219         return f.read()   220     finally:   221         f.close()   222    223 def translate(filename, body, fn=None):   224    225     """   226     Write to the file with the given 'filename' a translation of the given   227     'body'.   228     """   229    230     fn = fn or parser.parse   231    232     out = codecs.open(filename, "w", encoding="utf-8")   233     try:   234         fn(body, out)   235     finally:   236         out.close()   237    238 def xmltranslate(filename, body):   239     translate(filename, body, parser.xmlparse)   240    241 def sort_manifest(pageid, output=None):   242    243     """   244     Using the given 'pageid', locate the manifest for the page and any page   245     title information written to a "pagetitle" file.   246    247     Then sort the manifest according to revision so that it will be added to   248     MoinMoin in the correct order.   249    250     If a "pagetitle" file exists, the title column in the manifest will be   251     augmented with the contents of that file. This is typically done for   252     comments.   253    254     If a "children" file exists, the pages in that file will be added as a list   255     to the end of each revision's content.   256    257     If 'output' is given, the manifest details will be appended to the file   258     having that filename instead of being rewritten to the original manifest   259     file.   260     """   261    262     manifest = join(pages_dir, pageid, "manifest")   263     pagetitle = join(pages_dir, pageid, "pagetitle")   264     children = join(pages_dir, pageid, "children")   265    266     if exists(pagetitle):   267         title = read(pagetitle)   268     else:   269         title = None   270    271     f = codecs.open(manifest, "r", encoding="utf-8")   272     try:   273         lines = [x.split("|") for x in f.readlines()]   274         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   275    276         # Reconstruct the lines, optionally changing the titles.   277    278         result = []   279    280         for line in lines:   281             version, _addrevision, filename, old_title, username, comment = line   282    283             # Add title information to the information already present.   284    285             if title is not None:   286                 new_title = "%s/%s" % (title, old_title)   287             else:   288                 new_title = old_title   289    290             # The version is omitted now that the manifest is ordered.   291    292             line = _addrevision, filename, new_title, username, comment   293             result.append("|".join(line))   294    295             # Add child page information to the content.   296    297             if exists(children):   298                 child_pages = []   299                 child_page_names = [x for x in read(children).split("\n") if x]   300                 child_page_names.sort()   301    302                 for child_page_name in child_page_names:   303                     child_pages.append(" * [[%s]]" % child_page_name)   304    305                 append(filename, child_page_section % "\n".join(child_pages))   306    307     finally:   308         f.close()   309    310     s = "".join(result)   311    312     if output is None:   313         write(manifest, s)   314     else:   315         append(output, s)   316    317 # Template for child page information.   318    319 child_page_section = """   320 ----   321    322 %s   323 """   324    325 # Main program.   326    327 if __name__ == "__main__":   328     try:   329         filename = sys.argv[1]   330         is_zipfile = splitext(filename)[-1] == extsep + "zip"   331         space = sys.argv[2]   332     except IndexError:   333         print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."   334         print >>sys.stderr, "For example: com_entities.xml COM"   335         sys.exit(1)   336    337     no_translate = "--no-translate" in sys.argv   338    339     if exists(space):   340         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   341         sys.exit(1)   342    343     package_zip = space + extsep + "zip"   344    345     if exists(package_zip):   346         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   347         sys.exit(1)   348    349     mkdir(space)   350     mkdirs(join(space, "pages"))   351     mkdirs(join(space, "versions"))   352    353     p = xmlread.ConfigurableParser()   354     handler = ConfluenceHandler(space, no_translate)   355    356     # Register handlers in the parser for different elements.   357    358     p["object"] = handler.handle_object   359     p["property"] = handler.handle_property   360     p["id"] = handler.handle_id   361     p["collection"] = handler.handle_collection   362     p["element"] = handler.handle_element   363    364     # Open the XML dump.   365    366     f = open(filename)   367    368     if is_zipfile:   369         zf = ZipFile(f)   370         ff = StringIO(zf.read("entities.xml"))   371     else:   372         ff = f   373    374     # Parse the data.   375    376     try:   377         p.parse(ff)   378     finally:   379         f.close()   380    381     # Tidy up the import manifests, sorting each of them by revision and   382     # finalising them.   383    384     pages_dir = join(space, "pages")   385    386     output_manifest = join(space, "MOIN_PACKAGE")   387     append(output_manifest, "MoinMoinPackage|1\n")   388    389     for pageid in listdir(pages_dir):   390         sort_manifest(pageid, output_manifest)   391    392     # Write the page package.   393    394     page_package = ZipFile(package_zip, "w")   395    396     try:   397         # Include the page revisions.   398    399         versions_dir = join(space, "versions")   400    401         for versionid in listdir(versions_dir):   402             page_package.write(join(versions_dir, versionid))   403    404         # Include only the top-level manifest.   405    406         page_package.write(output_manifest, "MOIN_PACKAGE")   407    408     finally:   409         page_package.close()   410    411 # vim: tabstop=4 expandtab shiftwidth=4