1.1 --- a/ImprovedTableParser.py Sun Jan 15 23:37:14 2012 +0100
1.2 +++ b/ImprovedTableParser.py Tue Jan 17 01:16:37 2012 +0100
1.3 @@ -14,12 +14,19 @@
1.4 # Regular expressions.
1.5
1.6 syntax = {
1.7 + # For section markers.
1.8 + "markers" : (r"^\s*(?P<n>\\+)(?P<b>{|})(?P=n)(?P=b)(?P=n)(?P=b)", re.MULTILINE),
1.9 + "marker" : (r"(\\+)", 0),
1.10 +
1.11 # At start of line:
1.12 - "rows" : (r"^==", re.MULTILINE), # ==
1.13 + "sections" : (r"(^\s*{{{.*?^\s*}}})", re.MULTILINE | re.DOTALL), # {{{ ... }}}
1.14 + "rows" : (r"^==", re.MULTILINE), # ==
1.15 +
1.16 # Within text:
1.17 - "columns" : (r"\|\|[ \t]*", 0), # || whitespace
1.18 + "columns" : (r"\|\|[ \t]*", 0), # || ws-excl-nl
1.19 +
1.20 # At start of column text:
1.21 - "column" : (r"^\s*<(.*?)>\s*(.*)", re.DOTALL), # whitespace < attributes > whitespace
1.22 + "column" : (r"^\s*<(.*?)>\s*(.*)", re.DOTALL), # ws < attributes > ws
1.23 }
1.24
1.25 patterns = {}
1.26 @@ -32,46 +39,139 @@
1.27
1.28 "Parse 's', returning a table definition."
1.29
1.30 - rows = []
1.31 + s = replaceMarkers(s)
1.32 +
1.33 table_attrs = {}
1.34 + rows = []
1.35
1.36 - # Extract each row from the definition.
1.37 + # The following will be redefined upon the construction of the first row.
1.38 +
1.39 + row_attrs = {}
1.40 + columns = []
1.41 +
1.42 + # Process exposed text and sections.
1.43 +
1.44 + exposed = True
1.45 +
1.46 + # Initially, start a new row.
1.47 +
1.48 + row_continued = False
1.49 +
1.50 + for region in patterns["sections"].split(s):
1.51
1.52 - for row_text in patterns["rows"].split(s):
1.53 - columns = []
1.54 + # Only look for table features in exposed text.
1.55 +
1.56 + if exposed:
1.57 +
1.58 + # Extract each row from the definition.
1.59 +
1.60 + for row_text in patterns["rows"].split(region):
1.61 +
1.62 + # Only create a new row when a boundary has been found.
1.63
1.64 - # Extract each column from the row.
1.65 + if not row_continued:
1.66 + if columns:
1.67 + extractAttributes(columns[0][0], row_attrs, table_attrs)
1.68
1.69 - for text in patterns["columns"].split(row_text):
1.70 + row_attrs = {}
1.71 + columns = []
1.72 + rows.append((row_attrs, columns))
1.73 + column_continued = False
1.74
1.75 - # Extract the attribute and text sections.
1.76 + # Extract each column from the row.
1.77
1.78 - match = patterns["column"].search(text)
1.79 - if match:
1.80 - attribute_text, text = match.groups()
1.81 - columns.append((parseAttributes(attribute_text, True), text))
1.82 - else:
1.83 - columns.append(({}, text))
1.84 + for text in patterns["columns"].split(row_text):
1.85 +
1.86 + # Only create a new column when a boundary has been found.
1.87 +
1.88 + if not column_continued:
1.89 +
1.90 + # Extract the attribute and text sections.
1.91
1.92 - # Extract row- and table-level attributes.
1.93 + match = patterns["column"].search(text)
1.94 + if match:
1.95 + attribute_text, text = match.groups()
1.96 + columns.append([parseAttributes(attribute_text, True), text])
1.97 + else:
1.98 + columns.append([{}, text])
1.99
1.100 - row_attrs = {}
1.101 + else:
1.102 + columns[-1][1] += text
1.103 +
1.104 + # Permit columns immediately following this one.
1.105 +
1.106 + column_continued = False
1.107
1.108 - if columns:
1.109 - attrs, column = columns[0]
1.110 + # Permit a continuation of the current column.
1.111 +
1.112 + column_continued = True
1.113 +
1.114 + # Permit rows immediately following this one.
1.115 +
1.116 + row_continued = False
1.117 +
1.118 + # Permit a continuation of the current row.
1.119
1.120 - for name, value in attrs.items():
1.121 - if name.startswith("row"):
1.122 - row_attrs[name] = value
1.123 - del attrs[name]
1.124 - elif name.startswith("table"):
1.125 - table_attrs[name] = value
1.126 - del attrs[name]
1.127 + row_continued = True
1.128 +
1.129 + # Write any section into the current column.
1.130
1.131 - rows.append((row_attrs, columns))
1.132 + else:
1.133 + columns[-1][1] += region
1.134 +
1.135 + exposed = not exposed
1.136 +
1.137 + if columns:
1.138 + extractAttributes(columns[0][0], row_attrs, table_attrs)
1.139
1.140 return table_attrs, rows
1.141
1.142 +def extractAttributes(attrs, row_attrs, table_attrs):
1.143 +
1.144 + """
1.145 + Extract row- and table-level attributes from 'attrs', storing them in
1.146 + 'row_attrs' and 'table_attrs' respectively.
1.147 + """
1.148 +
1.149 + for name, value in attrs.items():
1.150 + if name.startswith("row"):
1.151 + row_attrs[name] = value
1.152 + del attrs[name]
1.153 + elif name.startswith("table"):
1.154 + table_attrs[name] = value
1.155 + del attrs[name]
1.156 +
1.157 +def replaceMarkers(s):
1.158 +
1.159 + "Convert the section notation in 's'."
1.160 +
1.161 + l = []
1.162 + last = 0
1.163 +
1.164 + # Get each marker and convert it.
1.165 +
1.166 + for match in patterns["markers"].finditer(s):
1.167 + start, stop = match.span()
1.168 + l.append(s[last:start])
1.169 +
1.170 + # Convert the marker.
1.171 +
1.172 + marker = []
1.173 + brace = True
1.174 + for text in patterns["marker"].split(match.group()):
1.175 + if brace:
1.176 + marker.append(text)
1.177 + else:
1.178 + marker.append(text[:-1])
1.179 + brace = not brace
1.180 +
1.181 + l.append("".join(marker))
1.182 + last = stop
1.183 + else:
1.184 + l.append(s[last:])
1.185 +
1.186 + return "".join(l)
1.187 +
1.188 def parseAttributes(s, escape=True):
1.189
1.190 """
3.1 --- a/tests/test_table.py Sun Jan 15 23:37:14 2012 +0100
3.2 +++ b/tests/test_table.py Tue Jan 17 01:16:37 2012 +0100
3.3 @@ -20,6 +20,11 @@
3.4 ||
3.5 * Item #A
3.6 || Not a list
3.7 +==
3.8 +\\{\\{\\{
3.9 +Some preformatted text.
3.10 +\\}\\}\\}
3.11 +||<colspan="2"> Preformatted text in a separate section
3.12 """
3.13
3.14 attrs, rows = parse(table)
3.15 @@ -27,9 +32,9 @@
3.16 print table
3.17 print attrs
3.18 print rows
3.19 -print len(rows) == 5, ": length is", len(rows), "==", 5
3.20 +print len(rows) == 6, ": length is", len(rows), "==", 6
3.21 print
3.22 -for (row_attrs, columns), expected in zip(rows, [3, 2, 3, 3, 3]):
3.23 +for (row_attrs, columns), expected in zip(rows, [3, 2, 3, 3, 3, 2]):
3.24 print row_attrs
3.25 print columns
3.26 print len(columns) == expected, ": length is", len(columns), "==", expected