Added XHTML table support; fixed Wiki markup list recognition, avoiding bold formatting conflicts; added tests of XHTML tables and Wiki markup lists. Added UTF-8 output support to the test programs.

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/tests/test_lists.txt	Sat Mar 02 19:56:23 2013 +0100
     1.3 @@ -0,0 +1,5 @@
     1.4 +*Lists* are like this:
     1.5 + * First item
     1.6 + * Second item
     1.7 + ** Sublist item
     1.8 + * Final item

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/tests/test_xml_tables.txt	Sat Mar 02 19:56:23 2013 +0100
     2.3 @@ -0,0 +1,17 @@
     2.4 +<table><tbody>
     2.5 +<tr>
     2.6 +<th><p>Heading 1</p></th>
     2.7 +<th><p>Heading 2</p></th>
     2.8 +<th><p>Heading 3</p></th>
     2.9 +</tr>
    2.10 +<tr>
    2.11 +<td><p>Cell 1</p></td>
    2.12 +<td><p>Cell 2</p></td>
    2.13 +<td><p>Cell 3</p></td>
    2.14 +</tr>
    2.15 +<tr>
    2.16 +<td><p>Cell 4</p></td>
    2.17 +<td><p>Cell 5</p></td>
    2.18 +<td><p>Cell 6</p></td>
    2.19 +</tr>
    2.20 +</tbody></table>

     3.1 --- a/wikiparser.py	Tue Feb 26 01:07:26 2013 +0100
     3.2 +++ b/wikiparser.py	Sat Mar 02 19:56:23 2013 +0100
     3.3 @@ -34,6 +34,7 @@
     3.4  from common import *
     3.5  import re
     3.6  import sys
     3.7 +import codecs
     3.8  
     3.9  # Section extraction.
    3.10  
    3.11 @@ -74,7 +75,7 @@
    3.12  
    3.13  # Heading, table and list extraction.
    3.14  
    3.15 -list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
    3.16 +list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
    3.17  table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
    3.18  blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
    3.19  
    3.20 @@ -146,7 +147,7 @@
    3.21  
    3.22  # List item inspection.
    3.23  
    3.24 -listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
    3.25 +listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
    3.26  listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
    3.27  
    3.28  def get_list_items(text):
    3.29 @@ -503,6 +504,7 @@
    3.30  
    3.31  if __name__ == "__main__":
    3.32      s = sys.stdin.read()
    3.33 -    parse(s, sys.stdout)
    3.34 +    out = codecs.getwriter("utf-8")(sys.stdout)
    3.35 +    parse(s, out)
    3.36  
    3.37  # vim: tabstop=4 expandtab shiftwidth=4

     4.1 --- a/xmlparser.py	Tue Feb 26 01:07:26 2013 +0100
     4.2 +++ b/xmlparser.py	Sat Mar 02 19:56:23 2013 +0100
     4.3 @@ -32,6 +32,7 @@
     4.4  import sys
     4.5  import operator
     4.6  import htmlentitydefs
     4.7 +import codecs
     4.8  
     4.9  # XML dialect syntax parsing.
    4.10  
    4.11 @@ -45,6 +46,11 @@
    4.12      "sub"                   : ",,%s,,",
    4.13      "code"                  : "`%s`",
    4.14      "pre"                   : "{{{%s}}}",
    4.15 +    "table"                 : "{{{#!table\n%s\n}}}",
    4.16 +    "tbody"                 : "%s",
    4.17 +    "tr"                    : "%s",
    4.18 +    "th"                    : "'''%s'''",
    4.19 +    "td"                    : "%s",
    4.20      "blockquote"            : " %s",
    4.21      "small"                 : "~-%s-~",
    4.22      "big"                   : "~+%s+~",
    4.23 @@ -114,6 +120,11 @@
    4.24          for name in ("pre", "ac:plain-text-body"):
    4.25              self.states[name] = 0
    4.26  
    4.27 +        # Table states.
    4.28 +
    4.29 +        self.table_rows = 0
    4.30 +        self.table_columns = 0
    4.31 +
    4.32      # ContentHandler-related methods.
    4.33  
    4.34      def startElement(self, name, attrs):
    4.35 @@ -143,7 +154,17 @@
    4.36      # Parser-related methods.
    4.37  
    4.38      def handleElement(self, name):
    4.39 -        text = "".join(self.text[-1])
    4.40 +        text = "".join(self.text[-1]).strip()
    4.41 +
    4.42 +        # Handle state.
    4.43 +
    4.44 +        if name == "table":
    4.45 +            self.table_rows = 0
    4.46 +        elif name == "tr":
    4.47 +            self.table_columns = 0
    4.48 +
    4.49 +        # Find conversions.
    4.50 +
    4.51          conversion = None
    4.52  
    4.53          # Handle list elements.
    4.54 @@ -207,6 +228,17 @@
    4.55          elif simple_tags.has_key(name):
    4.56              text = simple_tags[name]
    4.57  
    4.58 +        # Postprocess table columns and rows.
    4.59 +
    4.60 +        if name in ("th", "td"):
    4.61 +            if self.table_columns:
    4.62 +                text = "\n|| %s" % text
    4.63 +            self.table_columns += 1
    4.64 +        elif name == "tr":
    4.65 +            if self.table_rows:
    4.66 +                text = "\n==\n%s" % text
    4.67 +            self.table_rows += 1
    4.68 +
    4.69          # Normalise leading whitespace and indent the text if appropriate.
    4.70  
    4.71          if name in indented_tags:
    4.72 @@ -271,6 +303,7 @@
    4.73  
    4.74  if __name__ == "__main__":
    4.75      s = sys.stdin.read()
    4.76 -    parse(s, sys.stdout)
    4.77 +    out = codecs.getwriter("utf-8")(sys.stdout)
    4.78 +    parse(s, out)
    4.79  
    4.80  # vim: tabstop=4 expandtab shiftwidth=4
2013-03-02	Paul Boddie	Added XHTML table support; fixed Wiki markup list recognition, avoiding bold formatting conflicts; added tests of XHTML tables and Wiki markup lists. Added UTF-8 output support to the test programs.
			Files changed: tests/test_lists.txt, tests/test_xml_tables.txt, wikiparser.py, xmlparser.py