Added initial support for parsing and converting Confluence 4 XHTML content.

     1.1 --- a/convert.py	Fri Dec 14 01:51:01 2012 +0100
     1.2 +++ b/convert.py	Mon Dec 17 02:06:24 2012 +0100
     1.3 @@ -28,6 +28,7 @@
     1.4  import codecs
     1.5  import xmlread
     1.6  import parser
     1.7 +import sys
     1.8  
     1.9  MAX_TITLE_LENGTH = 120
    1.10  
    1.11 @@ -46,7 +47,14 @@
    1.12          "Handle objects according to type."
    1.13  
    1.14          objecttype = attributes[-1]["class"]
    1.15 +
    1.16 +        # Any identifier is stored as the object's textual content.
    1.17 +
    1.18          identifier = text.strip()
    1.19 +
    1.20 +        # The content is a dictionary mapping names to properties and
    1.21 +        # collections.
    1.22 +
    1.23          content = self.content
    1.24  
    1.25          pages_dir = join(self.space, "pages")
    1.26 @@ -126,12 +134,21 @@
    1.27              if not body:
    1.28                  body = "## Empty page."
    1.29  
    1.30 +            # NOTE: Very simple technique employed for guessing the format.
    1.31 +
    1.32              if no_translate:
    1.33                  fn = write
    1.34 +            elif body.startswith("<"):
    1.35 +                fn = xmltranslate
    1.36              else:
    1.37                  fn = translate
    1.38  
    1.39 -            fn(join(versions_dir, content["content"]), body)
    1.40 +            try:
    1.41 +                fn(join(versions_dir, content["content"]), body)
    1.42 +            except:
    1.43 +                print >>sys.stderr, "Error parsing..."
    1.44 +                print >>sys.stderr, body
    1.45 +                raise
    1.46  
    1.47          self.content = {}
    1.48  
    1.49 @@ -201,19 +218,24 @@
    1.50      finally:
    1.51          f.close()
    1.52  
    1.53 -def translate(filename, body):
    1.54 +def translate(filename, body, fn=None):
    1.55  
    1.56      """
    1.57      Write to the file with the given 'filename' a translation of the given
    1.58      'body'.
    1.59      """
    1.60  
    1.61 +    fn = fn or parser.parse
    1.62 +
    1.63      out = codecs.open(filename, "w", encoding="utf-8")
    1.64      try:
    1.65 -        parser.parse(body, out)
    1.66 +        fn(body, out)
    1.67      finally:
    1.68          out.close()
    1.69  
    1.70 +def xmltranslate(filename, body):
    1.71 +    translate(filename, body, parser.xmlparse)
    1.72 +
    1.73  def sort_manifest(filename, pagetitle, output=None):
    1.74  
    1.75      """
    1.76 @@ -256,8 +278,6 @@
    1.77          append(output, s)
    1.78  
    1.79  if __name__ == "__main__":
    1.80 -    import sys
    1.81 -
    1.82      try:
    1.83          filename = sys.argv[1]
    1.84          is_zipfile = splitext(filename)[-1] == extsep + "zip"

     2.1 --- a/parser.py	Fri Dec 14 01:51:01 2012 +0100
     2.2 +++ b/parser.py	Mon Dec 17 02:06:24 2012 +0100
     2.3 @@ -31,7 +31,14 @@
     2.4   3. Each block is then parsed.
     2.5  """
     2.6  
     2.7 +try:
     2.8 +    from cStringIO import StringIO
     2.9 +except ImportError:
    2.10 +    from StringIO import StringIO
    2.11 +
    2.12 +from xmlread import Parser
    2.13  import re
    2.14 +import sys
    2.15  
    2.16  URL_SCHEMES = ("http", "https", "ftp", "mailto")
    2.17  
    2.18 @@ -373,6 +380,123 @@
    2.19      "warning" : "wiki warning",
    2.20      }
    2.21  
    2.22 +# XML dialect syntax parsing.
    2.23 +
    2.24 +tags = {
    2.25 +    "strong"                : "'''%s'''",
    2.26 +    "em"                    : "''%s''",
    2.27 +    "u"                     : "__%s__",
    2.28 +    "del"                   : "--(%s)--",
    2.29 +    "sup"                   : "^%s^",
    2.30 +    "sub"                   : ",,%s,,",
    2.31 +    "code"                  : "`%s`",
    2.32 +    "pre"                   : "{{{%s}}}",
    2.33 +    "blockquote"            : " %s",
    2.34 +    "small"                 : "~-%s-~",
    2.35 +    "big"                   : "~+%s+~",
    2.36 +    "p"                     : "%s\n\n",
    2.37 +    "ac:plain-text-body"    : "{{{%s}}}",
    2.38 +    "ac:link"               : "[[%s%s|%s]]",
    2.39 +    }
    2.40 +    
    2.41 +tags.update(blocktypes)
    2.42 +
    2.43 +list_tags = {
    2.44 +    "ol"                    : " 1. %s\n",
    2.45 +    "ul"                    : " * %s\n",
    2.46 +    }
    2.47 +
    2.48 +link_target_tags = {
    2.49 +    "ri:page"               : "ri:content-title",
    2.50 +    "ri:attachment"         : "ri:filename",
    2.51 +    }
    2.52 +
    2.53 +normalise_regexp_str = r"\n\n+"
    2.54 +normalise_regexp = re.compile(normalise_regexp_str, re.DOTALL)
    2.55 +
    2.56 +class ConfluenceXMLParser(Parser):
    2.57 +
    2.58 +    "Handle content from Confluence 4 page revisions."
    2.59 +
    2.60 +    def __init__(self, out):
    2.61 +        Parser.__init__(self)
    2.62 +        self.out = out
    2.63 +
    2.64 +        # Link target information.
    2.65 +
    2.66 +        self.target = None
    2.67 +        self.target_type = None
    2.68 +
    2.69 +    def handleElement(self, name):
    2.70 +        text = "".join(self.text[-1])
    2.71 +
    2.72 +        # Handle list elements.
    2.73 +
    2.74 +        if name == "li" and len(self.elements) > 1:
    2.75 +            list_tag = self.elements[-2]
    2.76 +            conversion = list_tags.get(list_tag)
    2.77 +
    2.78 +        # Remember link target information.
    2.79 +
    2.80 +        elif link_target_tags.has_key(name):
    2.81 +            self.target = self.attributes[-1].get(link_target_tags[name])
    2.82 +            self.target_type = name
    2.83 +            text = ""
    2.84 +
    2.85 +        # Handle the common case.
    2.86 +
    2.87 +        else:
    2.88 +            conversion = tags.get(name)
    2.89 +
    2.90 +        # Attempt to convert the text.
    2.91 +
    2.92 +        if name == "ac:link":
    2.93 +            if self.target_type == "ri:attachment":
    2.94 +                prefix = "attachment:"
    2.95 +            else:
    2.96 +                prefix = "../"
    2.97 +
    2.98 +            text = conversion % (prefix, self.target, text or self.target)
    2.99 +
   2.100 +        # Handle the common case.
   2.101 +
   2.102 +        elif text and conversion:
   2.103 +            text = conversion % text
   2.104 +
   2.105 +        # Add the converted text to the end of the parent element's text nodes.
   2.106 +
   2.107 +        if len(self.text) > 1:
   2.108 +            self.text[-2].append(text)
   2.109 +
   2.110 +        # Otherwise, emit the text with normalised newlines.
   2.111 +
   2.112 +        else:
   2.113 +            self.out.write(normalise_regexp.sub("\n\n", text))
   2.114 +
   2.115 +def xmlparse(s, out):
   2.116 +
   2.117 +    "Parse the content in the string 's', writing a translation to 'out'."
   2.118 +
   2.119 +    # NOTE: CDATA sections appear to have erroneous endings.
   2.120 +
   2.121 +    s = u"""\
   2.122 +<?xml version="1.0"?>
   2.123 +<!DOCTYPE html 
   2.124 +     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
   2.125 +     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
   2.126 +<html xmlns="http://www.w3.org/1999/xhtml">
   2.127 +<body>
   2.128 +%s
   2.129 +</body>
   2.130 +</html>""" % s.replace("]] >", "]]>")
   2.131 +
   2.132 +    f = StringIO(s.encode("utf-8"))
   2.133 +    try:
   2.134 +        parser = ConfluenceXMLParser(out)
   2.135 +        parser.parse(f)
   2.136 +    finally:
   2.137 +        f.close()
   2.138 +
   2.139  # General parsing.
   2.140  
   2.141  def parse(s, out):
   2.142 @@ -430,8 +554,6 @@
   2.143              print >>out
   2.144  
   2.145  if __name__ == "__main__":
   2.146 -    import sys
   2.147 -
   2.148      s = sys.stdin.read()
   2.149      parse(s, sys.stdout)
   2.150
2012-12-17	Paul Boddie	Added initial support for parsing and converting Confluence 4 XHTML content.
			Files changed: convert.py, parser.py