1.1 --- a/parser.py	Fri Dec 14 01:51:01 2012 +0100
     1.2 +++ b/parser.py	Mon Dec 17 02:06:24 2012 +0100
     1.3 @@ -31,7 +31,14 @@
     1.4   3. Each block is then parsed.
     1.5  """
     1.6  
     1.7 +try:
     1.8 +    from cStringIO import StringIO
     1.9 +except ImportError:
    1.10 +    from StringIO import StringIO
    1.11 +
    1.12 +from xmlread import Parser
    1.13  import re
    1.14 +import sys
    1.15  
    1.16  URL_SCHEMES = ("http", "https", "ftp", "mailto")
    1.17  
    1.18 @@ -373,6 +380,123 @@
    1.19      "warning" : "wiki warning",
    1.20      }
    1.21  
    1.22 +# XML dialect syntax parsing.
    1.23 +
    1.24 +tags = {
    1.25 +    "strong"                : "'''%s'''",
    1.26 +    "em"                    : "''%s''",
    1.27 +    "u"                     : "__%s__",
    1.28 +    "del"                   : "--(%s)--",
    1.29 +    "sup"                   : "^%s^",
    1.30 +    "sub"                   : ",,%s,,",
    1.31 +    "code"                  : "`%s`",
    1.32 +    "pre"                   : "{{{%s}}}",
    1.33 +    "blockquote"            : " %s",
    1.34 +    "small"                 : "~-%s-~",
    1.35 +    "big"                   : "~+%s+~",
    1.36 +    "p"                     : "%s\n\n",
    1.37 +    "ac:plain-text-body"    : "{{{%s}}}",
    1.38 +    "ac:link"               : "[[%s%s|%s]]",
    1.39 +    }
    1.40 +    
    1.41 +tags.update(blocktypes)
    1.42 +
    1.43 +list_tags = {
    1.44 +    "ol"                    : " 1. %s\n",
    1.45 +    "ul"                    : " * %s\n",
    1.46 +    }
    1.47 +
    1.48 +link_target_tags = {
    1.49 +    "ri:page"               : "ri:content-title",
    1.50 +    "ri:attachment"         : "ri:filename",
    1.51 +    }
    1.52 +
    1.53 +normalise_regexp_str = r"\n\n+"
    1.54 +normalise_regexp = re.compile(normalise_regexp_str, re.DOTALL)
    1.55 +
    1.56 +class ConfluenceXMLParser(Parser):
    1.57 +
    1.58 +    "Handle content from Confluence 4 page revisions."
    1.59 +
    1.60 +    def __init__(self, out):
    1.61 +        Parser.__init__(self)
    1.62 +        self.out = out
    1.63 +
    1.64 +        # Link target information.
    1.65 +
    1.66 +        self.target = None
    1.67 +        self.target_type = None
    1.68 +
    1.69 +    def handleElement(self, name):
    1.70 +        text = "".join(self.text[-1])
    1.71 +
    1.72 +        # Handle list elements.
    1.73 +
    1.74 +        if name == "li" and len(self.elements) > 1:
    1.75 +            list_tag = self.elements[-2]
    1.76 +            conversion = list_tags.get(list_tag)
    1.77 +
    1.78 +        # Remember link target information.
    1.79 +
    1.80 +        elif link_target_tags.has_key(name):
    1.81 +            self.target = self.attributes[-1].get(link_target_tags[name])
    1.82 +            self.target_type = name
    1.83 +            text = ""
    1.84 +
    1.85 +        # Handle the common case.
    1.86 +
    1.87 +        else:
    1.88 +            conversion = tags.get(name)
    1.89 +
    1.90 +        # Convert links using the remembered link target information.
    1.91 +
    1.92 +        if name == "ac:link":
    1.93 +            if self.target_type == "ri:attachment":
    1.94 +                prefix = "attachment:"
    1.95 +            else:
    1.96 +                prefix = "../"
    1.97 +
    1.98 +            text = conversion % (prefix, self.target, text or self.target)
    1.99 +
   1.100 +        # Otherwise, apply any conversion found to non-empty text.
   1.101 +
   1.102 +        elif text and conversion:
   1.103 +            text = conversion % text
   1.104 +
   1.105 +        # Add the converted text to the end of the parent element's text nodes.
   1.106 +
   1.107 +        if len(self.text) > 1:
   1.108 +            self.text[-2].append(text)
   1.109 +
   1.110 +        # Otherwise, emit the text with normalised newlines.
   1.111 +
   1.112 +        else:
   1.113 +            self.out.write(normalise_regexp.sub("\n\n", text))
   1.114 +
   1.115 +def xmlparse(s, out):
   1.116 +
   1.117 +    "Parse the content in the string 's', writing a translation to 'out'."
   1.118 +
   1.119 +    # NOTE: CDATA sections appear to end erroneously with "]] >"; these endings are corrected to "]]>" below.
   1.120 +
   1.121 +    s = u"""\
   1.122 +<?xml version="1.0"?>
   1.123 +<!DOCTYPE html 
   1.124 +     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
   1.125 +     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
   1.126 +<html xmlns="http://www.w3.org/1999/xhtml">
   1.127 +<body>
   1.128 +%s
   1.129 +</body>
   1.130 +</html>""" % s.replace("]] >", "]]>")
   1.131 +
   1.132 +    f = StringIO(s.encode("utf-8"))
   1.133 +    try:
   1.134 +        parser = ConfluenceXMLParser(out)
   1.135 +        parser.parse(f)
   1.136 +    finally:
   1.137 +        f.close()
   1.138 +
   1.139  # General parsing.
   1.140  
   1.141  def parse(s, out):
   1.142 @@ -430,8 +554,6 @@
   1.143              print >>out
   1.144  
   1.145  if __name__ == "__main__":
   1.146 -    import sys
   1.147 -
   1.148      s = sys.stdin.read()
   1.149      parse(s, sys.stdout)
   1.150