Attempted to normalise whitespace and to ensure that the different block elements retain their separation and relationships in the generated markup. Added support for admonitions through the recognition of certain Confluence macro invocations. Added support for "skipped entities" such as &mdash; (—) and &ndash; (–) found in the Confluence XHTML markup.

     1.1 --- a/parser.py	Mon Dec 17 02:06:24 2012 +0100
     1.2 +++ b/parser.py	Tue Dec 18 23:06:19 2012 +0100
     1.3 @@ -39,6 +39,7 @@
     1.4  from xmlread import Parser
     1.5  import re
     1.6  import sys
     1.7 +import operator
     1.8  
     1.9  URL_SCHEMES = ("http", "https", "ftp", "mailto")
    1.10  
    1.11 @@ -383,6 +384,7 @@
    1.12  # XML dialect syntax parsing.
    1.13  
    1.14  tags = {
    1.15 +    # XHTML tag               MoinMoin syntax
    1.16      "strong"                : "'''%s'''",
    1.17      "em"                    : "''%s''",
    1.18      "u"                     : "__%s__",
    1.19 @@ -394,25 +396,48 @@
    1.20      "blockquote"            : " %s",
    1.21      "small"                 : "~-%s-~",
    1.22      "big"                   : "~+%s+~",
    1.23 -    "p"                     : "%s\n\n",
    1.24 +    "p"                     : "\n%s\n",
    1.25 +    "ol"                    : "\n%s",
    1.26 +    "ul"                    : "\n%s",
    1.27      "ac:plain-text-body"    : "{{{%s}}}",
    1.28      "ac:link"               : "[[%s%s|%s]]",
    1.29      }
    1.30 -    
    1.31 -tags.update(blocktypes)
    1.32 +
    1.33 +for tag, translation in blocktypes.items():
    1.34 +    tags[tag] = "\n%s\n" % translation
    1.35 +
    1.36 +simple_tags = {
    1.37 +    # XHTML tag               MoinMoin syntax
    1.38 +    "br"                    : "<<BR>>",
    1.39 +    }
    1.40  
    1.41  list_tags = {
    1.42 -    "ol"                    : " 1. %s\n",
    1.43 -    "ul"                    : " * %s\n",
    1.44 +    # XHTML list tag          MoinMoin list item syntax
    1.45 +    "ol"                    : "1. %s\n",
    1.46 +    "ul"                    : "* %s\n",
    1.47      }
    1.48  
    1.49 +indented_tags = ["li", "p"]
    1.50 +
    1.51  link_target_tags = {
    1.52 +    # Confluence element      Attribute providing the target
    1.53      "ri:page"               : "ri:content-title",
    1.54      "ri:attachment"         : "ri:filename",
    1.55      }
    1.56  
    1.57 -normalise_regexp_str = r"\n\n+"
    1.58 -normalise_regexp = re.compile(normalise_regexp_str, re.DOTALL)
    1.59 +macro_rich_text_styles = {
    1.60 +    # Confluence style        MoinMoin admonition style
    1.61 +    "note"                  : "caution",
    1.62 +    "warning"               : "warning",
    1.63 +    "info"                  : "important",
    1.64 +    "tip"                   : "tip",
    1.65 +    }
    1.66 +
    1.67 +normalise_regexp_str = r"\s+"
    1.68 +normalise_regexp = re.compile(normalise_regexp_str)
    1.69 +
    1.70 +normalise_end_regexp_str = r"\s\s+$"
    1.71 +normalise_end_regexp = re.compile(normalise_end_regexp_str)
    1.72  
    1.73  class ConfluenceXMLParser(Parser):
    1.74  
    1.75 @@ -427,8 +452,50 @@
    1.76          self.target = None
    1.77          self.target_type = None
    1.78  
    1.79 +        # Macro information.
    1.80 +
    1.81 +        self.macro = None
    1.82 +        self.macro_parameters = {}
    1.83 +
    1.84 +        # Indentation and preformatted states.
    1.85 +
    1.86 +        self.indent = 0
    1.87 +        self.states = {}
    1.88 +        for name in ("pre", "ac:plain-text-body"):
    1.89 +            self.states[name] = 0
    1.90 +
    1.91 +    # ContentHandler-related methods.
    1.92 +
    1.93 +    def startElement(self, name, attrs):
    1.94 +        if list_tags.has_key(name):
    1.95 +            self.indent += 1
    1.96 +        elif self.states.has_key(name):
    1.97 +            self.states[name] += 1
    1.98 +        Parser.startElement(self, name, attrs)
    1.99 +
   1.100 +    def endElement(self, name):
   1.101 +        Parser.endElement(self, name)
   1.102 +        if list_tags.has_key(name):
   1.103 +            self.indent -= 1
   1.104 +        elif self.states.has_key(name):
   1.105 +            self.states[name] -= 1
   1.106 +
   1.107 +    def characters(self, content):
   1.108 +        if not self.is_preformatted():
   1.109 +            content = self.normalise(content, self.elements[-1])
   1.110 +        Parser.characters(self, content)
   1.111 +
   1.112 +    def skippedEntity(self, name):
   1.113 +        if name == "mdash":
   1.114 +            self.text[-1].append(u"\u2014")
   1.115 +        elif name == "ndash":
   1.116 +            self.text[-1].append(u"\u2013")
   1.117 +
   1.118 +    # Parser-related methods.
   1.119 +
   1.120      def handleElement(self, name):
   1.121          text = "".join(self.text[-1])
   1.122 +        conversion = None
   1.123  
   1.124          # Handle list elements.
   1.125  
   1.126 @@ -443,6 +510,15 @@
   1.127              self.target_type = name
   1.128              text = ""
   1.129  
   1.130 +        # Remember macro information.
   1.131 +
   1.132 +        elif name == "ac:parameter":
   1.133 +            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
   1.134 +            text = ""
   1.135 +
   1.136 +        elif name == "ac:macro":
   1.137 +            self.macro = self.attributes[-1].get("ac:name")
   1.138 +
   1.139          # Handle the common case.
   1.140  
   1.141          else:
   1.142 @@ -450,6 +526,8 @@
   1.143  
   1.144          # Attempt to convert the text.
   1.145  
   1.146 +        # Links require target information.
   1.147 +
   1.148          if name == "ac:link":
   1.149              if self.target_type == "ri:attachment":
   1.150                  prefix = "attachment:"
   1.151 @@ -457,21 +535,66 @@
   1.152                  prefix = "../"
   1.153  
   1.154              text = conversion % (prefix, self.target, text or self.target)
   1.155 +            self.target = self.target_type = None
   1.156 +
   1.157 +        # Macro name information is used to style rich text body regions.
   1.158 +
   1.159 +        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
   1.160 +            details = macro_rich_text_styles[self.macro]
   1.161 +            title = self.macro_parameters.get("title")
   1.162 +            if title:
   1.163 +                details = "%s\n\n%s" % (details, title)
   1.164 +            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
   1.165 +            self.macro = None
   1.166 +            self.macro_parameters = {}
   1.167  
   1.168          # Handle the common case.
   1.169  
   1.170          elif text and conversion:
   1.171              text = conversion % text
   1.172 +        elif simple_tags.has_key(name):
   1.173 +            text = simple_tags[name]
   1.174 +
   1.175 +        # Normalise leading whitespace and indent the text if appropriate.
   1.176 +
   1.177 +        if name in indented_tags:
   1.178 +            text = " " * self.indent + text.lstrip()
   1.179  
   1.180          # Add the converted text to the end of the parent element's text nodes.
   1.181  
   1.182          if len(self.text) > 1:
   1.183 +            preceding = "".join(self.text[-2])
   1.184 +
   1.185 +            if not self.is_preformatted():
   1.186 +                preceding = self.normalise_end(preceding, self.elements[-2])
   1.187 +
   1.188 +            self.text[-2] = [preceding]
   1.189              self.text[-2].append(text)
   1.190  
   1.191 -        # Otherwise, emit the text with normalised newlines.
   1.192 +        # Otherwise, emit the text.
   1.193  
   1.194          else:
   1.195 -            self.out.write(normalise_regexp.sub("\n\n", text))
   1.196 +            self.out.write(text)
   1.197 +
   1.198 +    def is_preformatted(self):
   1.199 +        return reduce(operator.or_, self.states.values(), False)
   1.200 +
   1.201 +    def get_replacement(self, name, end=False):
   1.202 +        if list_tags.has_key(name):
   1.203 +            if end:
   1.204 +                return "\n"
   1.205 +            else:
   1.206 +                return ""
   1.207 +        elif name == "body":
   1.208 +            return "\n\n"
   1.209 +        else:
   1.210 +            return " "
   1.211 +
   1.212 +    def normalise(self, text, name):
   1.213 +        return normalise_regexp.sub(self.get_replacement(name), text)
   1.214 +
   1.215 +    def normalise_end(self, text, name):
   1.216 +        return normalise_end_regexp.sub(self.get_replacement(name, True), text)
   1.217  
   1.218  def xmlparse(s, out):
   1.219
2012-12-18	Paul Boddie	raw files shortlog changelog graph	Attempted to normalise whitespace and to ensure that the different block elements retain their separation and relationships in the generated markup. Added support for admonitions through the recognition of certain Confluence macro invocations. Added support for "skipped entities" such as &mdash; (—) and &ndash; (–) found in the Confluence XHTML markup.
			parser.py (file)