Simplified whitespace normalisation, explicitly adding newlines before certain elements at certain levels. Added a test mode for XML parsing and a test of parsing XHTML lists.

     1.1 --- a/parser.py	Sun Feb 17 20:33:21 2013 +0100
     1.2 +++ b/parser.py	Sun Feb 17 20:36:11 2013 +0100
     1.3 @@ -3,7 +3,7 @@
     1.4  """
     1.5  Confluence Wiki syntax parsing.
     1.6  
     1.7 -Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>
     1.8 +Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
     1.9  
    1.10  This software is free software; you can redistribute it and/or
    1.11  modify it under the terms of the GNU General Public License as
    1.12 @@ -397,15 +397,15 @@
    1.13      "blockquote"            : " %s",
    1.14      "small"                 : "~-%s-~",
    1.15      "big"                   : "~+%s+~",
    1.16 -    "p"                     : "\n%s\n",
    1.17 -    "ol"                    : "\n%s",
    1.18 -    "ul"                    : "\n%s",
    1.19 +    "p"                     : "%s",
    1.20 +    "ol"                    : "%s",
    1.21 +    "ul"                    : "%s",
    1.22      "ac:plain-text-body"    : "{{{%s}}}",
    1.23      "ac:link"               : "[[%s%s|%s]]",
    1.24      }
    1.25  
    1.26  for tag, translation in blocktypes.items():
    1.27 -    tags[tag] = "\n%s\n" % translation
    1.28 +    tags[tag] = translation
    1.29  
    1.30  simple_tags = {
    1.31      # XHTML tag               MoinMoin syntax
    1.32 @@ -414,8 +414,8 @@
    1.33  
    1.34  list_tags = {
    1.35      # XHTML list tag          MoinMoin list item syntax
    1.36 -    "ol"                    : "1. %s\n",
    1.37 -    "ul"                    : "* %s\n",
    1.38 +    "ol"                    : "1. %s",
    1.39 +    "ul"                    : "* %s",
    1.40      }
    1.41  
    1.42  indented_tags = ["li", "p"]
    1.43 @@ -438,9 +438,6 @@
    1.44  normalise_regexp_str = r"\s+"
    1.45  normalise_regexp = re.compile(normalise_regexp_str)
    1.46  
    1.47 -normalise_end_regexp_str = r"\s\s+$"
    1.48 -normalise_end_regexp = re.compile(normalise_end_regexp_str)
    1.49 -
    1.50  class ConfluenceXMLParser(Parser):
    1.51  
    1.52      "Handle content from Confluence 4 page revisions."
    1.53 @@ -567,13 +564,16 @@
    1.54          # Add the converted text to the end of the parent element's text nodes.
    1.55  
    1.56          if len(self.text) > 1:
    1.57 -            preceding = "".join(self.text[-2])
    1.58 -
    1.59 -            if not self.is_preformatted():
    1.60 -                preceding = self.normalise_end(preceding, self.elements[-2])
    1.61 -
    1.62 -            self.text[-2] = [preceding]
    1.63 -            self.text[-2].append(text)
    1.64 +            nodes = self.text[-2]
    1.65 +            if "".join(self.text[-2]):
    1.66 +                parent = self.elements[-2]
    1.67 +                if parent == "body":
    1.68 +                    nodes.append("\n\n")
    1.69 +                elif list_tags.has_key(parent):
    1.70 +                    nodes.append("\n")
    1.71 +                elif list_tags.has_key(name) and parent == "li":
    1.72 +                    nodes.append("\n")
    1.73 +            nodes.append(text)
    1.74  
    1.75          # Otherwise, emit the text.
    1.76  
    1.77 @@ -583,23 +583,17 @@
    1.78      def is_preformatted(self):
    1.79          return reduce(operator.or_, self.states.values(), False)
    1.80  
    1.81 -    def get_replacement(self, name, end=False):
    1.82 -        if list_tags.has_key(name):
    1.83 -            if end:
    1.84 -                return "\n"
    1.85 -            else:
    1.86 -                return ""
    1.87 -        elif name == "body":
    1.88 -            return "\n\n"
    1.89 +    # Whitespace normalisation.
    1.90 +
    1.91 +    def get_replacement(self, name):
    1.92 +        if name in ("html", "body") or list_tags.has_key(name):
    1.93 +            return ""
    1.94          else:
    1.95              return " "
    1.96  
    1.97      def normalise(self, text, name):
    1.98          return normalise_regexp.sub(self.get_replacement(name), text)
    1.99  
   1.100 -    def normalise_end(self, text, name):
   1.101 -        return normalise_end_regexp.sub(self.get_replacement(name, True), text)
   1.102 -
   1.103  def xmlparse(s, out):
   1.104  
   1.105      "Parse the content in the string 's', writing a translation to 'out'."
   1.106 @@ -682,6 +676,9 @@
   1.107  
   1.108  if __name__ == "__main__":
   1.109      s = sys.stdin.read()
   1.110 -    parse(s, sys.stdout)
   1.111 +    if "--xml" in sys.argv:
   1.112 +        xmlparse(s, sys.stdout)
   1.113 +    else:
   1.114 +        parse(s, sys.stdout)
   1.115  
   1.116  # vim: tabstop=4 expandtab shiftwidth=4

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/tests/test_xml_lists.txt	Sun Feb 17 20:36:11 2013 +0100
     2.3 @@ -0,0 +1,7 @@
     2.4 +<ul>
     2.5 +<li>A<ul>
     2.6 +<li>A1</li>
     2.7 +<li>A2</li>
     2.8 +</ul></li>
     2.9 +<li>B</li>
    2.10 +</ul>
2013-02-17	Paul Boddie	Simplified whitespace normalisation, explicitly adding newlines before certain elements at certain levels. Added a test mode for XML parsing and a test of parsing XHTML lists.
			Files changed: parser.py, tests/test_xml_lists.txt