1.1 --- a/wikiparser.py Sat Feb 23 01:03:23 2013 +0100
1.2 +++ b/wikiparser.py Sun Feb 24 23:42:06 2013 +0100
1.3 @@ -75,7 +75,7 @@
1.4 # Heading, table and list extraction.
1.5
1.6 list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
1.7 -table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
1.8 +table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
1.9 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
1.10
1.11 blockelement_regexp = re.compile(
1.12 @@ -207,6 +207,51 @@
1.13 content_regexp = re.compile(content_regexp_str)
1.14 table_content_regexp = re.compile(table_content_regexp_str)
1.15
1.16 +# Notation conversion.
1.17 +
1.18 +notation_mapping = [
1.19 + (r"\!", "!"),
1.20 + (r"\-", "-"),
1.21 + (r"\\""\n", "<<BR>> "),
1.22 + (r"\\ ", " "),
1.23 + ]
1.24 +
1.25 +# Translation helpers.
1.26 +
1.27 +markers = {
1.28 + "*" : "*",
1.29 + "#" : "1.",
1.30 + "-" : "*",
1.31 + }
1.32 +
1.33 +def translate_marker(marker):
1.34 +
1.35 + "Translate the given 'marker' to a suitable Moin representation."
1.36 +
1.37 + return " " * len(marker) + markers[marker[-1]]
1.38 +
1.39 +cellseps = {
1.40 + "|" : "||",
1.41 + "||" : "||",
1.42 + }
1.43 +
1.44 +cellextra = {
1.45 + "|" : "",
1.46 + "||" : "'''",
1.47 + }
1.48 +
1.49 +def translate_cellsep(cellsep):
1.50 +
1.51 + "Translate the given 'cellsep' to a suitable Moin representation."
1.52 +
1.53 + return cellseps[cellsep]
1.54 +
1.55 +def translate_cell(cellsep, text):
1.56 +
1.57 + "Using 'cellsep', translate the cell 'text'."
1.58 +
1.59 + return cellextra[cellsep] + parse_text(text) + cellextra[cellsep]
1.60 +
1.61 def translate_content_match(match):
1.62
1.63 "Translate the content described by the given 'match', returning a string."
1.64 @@ -296,7 +341,15 @@
1.65 return ",,%s,," % translate_content(match.group("subtext"))
1.66
1.67 else:
1.68 - return match.group()
1.69 + return translate_text(match.group())
1.70 +
1.71 +def translate_text(s):
1.72 +
1.73 + "Translate the plain text string 's', converting notation."
1.74 +
1.75 + for before, after in notation_mapping:
1.76 + s = s.replace(before, after)
1.77 + return s
1.78
1.79 def translate_content(text, sectiontype=None):
1.80
1.81 @@ -311,7 +364,7 @@
1.82 last = 0
1.83 for match in content_regexp.finditer(text):
1.84 start, end = match.span()
1.85 - parts.append(text[last:start])
1.86 + parts.append(translate_text(text[last:start]))
1.87
1.88 # Handle unformatted sections.
1.89
1.90 @@ -322,22 +375,64 @@
1.91
1.92 last = end
1.93
1.94 - parts.append(text[last:])
1.95 + parts.append(translate_text(text[last:]))
1.96 return "".join(parts)
1.97
1.98 +def translate_block(blocktype, blocktext):
1.99 +
1.100 + "Translate the block with the given 'blocktype' and 'blocktext'."
1.101 +
1.102 + parts = []
1.103 +
1.104 + # Translate headings and blockquotes.
1.105 +
1.106 + if blocktypes.has_key(blocktype):
1.107 + parts.append(blocktypes[blocktype] % blocktext)
1.108 +
1.109 + # Translate list items.
1.110 +
1.111 + elif blocktype == "list":
1.112 + for listmarker, listitem in get_list_items(blocktext):
1.113 + parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem)))
1.114 +
1.115 + # Translate table items.
1.116 +
1.117 + elif blocktype == "table":
1.118 + parts.append("{{{#!table")
1.119 + first = True
1.120 + for cellsep, columns in get_table_rows(blocktext):
1.121 + if not first:
1.122 + parts.append("==")
1.123 + else:
1.124 + first = False
1.125 + moinsep = translate_cellsep(cellsep)
1.126 + parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns]))
1.127 + parts.append("}}}")
1.128 +
1.129 + # Handle anonymous blocks.
1.130 +
1.131 + else:
1.132 + parts.append(translate_content(blocktext).rstrip())
1.133 +
1.134 + return "\n".join(parts) + "\n"
1.135 +
1.136 def get_table_rows(text):
1.137
1.138 "Return a list of (cellsep, columns) tuples for the given table 'text'."
1.139
1.140 rows = []
1.141
1.142 - for line in text.split("\n"):
1.143 + for row in text.split("|\n"):
1.144 + if not row:
1.145 + break
1.146 +
1.147 + row += "|"
1.148 cellsep = None
1.149 columns = [""]
1.150 last = 0
1.151 - for match in table_content_regexp.finditer(line):
1.152 + for match in table_content_regexp.finditer(row):
1.153 start, end = match.span()
1.154 - columns[-1] += line[last:start]
1.155 + columns[-1] += row[last:start]
1.156
1.157 if match.group("celltype"):
1.158 if cellsep is None:
1.159 @@ -348,49 +443,13 @@
1.160
1.161 last = end
1.162
1.163 - columns[-1] += line[last:]
1.164 + columns[-1] += row[last:]
1.165
1.166 if cellsep:
1.167 rows.append((cellsep, columns[1:-1]))
1.168
1.169 return rows
1.170
1.171 -# Translation helpers.
1.172 -
1.173 -markers = {
1.174 - "*" : "*",
1.175 - "#" : "1.",
1.176 - "-" : "*",
1.177 - }
1.178 -
1.179 -def translate_marker(marker):
1.180 -
1.181 - "Translate the given 'marker' to a suitable Moin representation."
1.182 -
1.183 - return " " * len(marker) + markers[marker[-1]]
1.184 -
1.185 -cellseps = {
1.186 - "|" : "||",
1.187 - "||" : "||",
1.188 - }
1.189 -
1.190 -cellextra = {
1.191 - "|" : "",
1.192 - "||" : "'''",
1.193 - }
1.194 -
1.195 -def translate_cellsep(cellsep):
1.196 -
1.197 - "Translate the given 'cellsep' to a suitable Moin representation."
1.198 -
1.199 - return cellseps[cellsep]
1.200 -
1.201 -def translate_cell(cellsep, text):
1.202 -
1.203 - "Using 'cellsep', translate the cell 'text'."
1.204 -
1.205 - return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]
1.206 -
1.207 sectiontypes = {
1.208 "code" : "",
1.209 "noformat" : "",
1.210 @@ -403,9 +462,11 @@
1.211
1.212 # General parsing.
1.213
1.214 -def parse(s, out):
1.215 +def parse_text(s):
1.216
1.217 - "Parse the content in the string 's', writing a translation to 'out'."
1.218 + "Parse the content in the string 's', returning the translation."
1.219 +
1.220 + parts = []
1.221
1.222 for type, text in get_regions(s):
1.223
1.224 @@ -413,31 +474,7 @@
1.225
1.226 if type is None:
1.227 for blocktype, blocktext in get_blocks(text):
1.228 -
1.229 - # Translate headings and blockquotes.
1.230 -
1.231 - if blocktypes.has_key(blocktype):
1.232 - print >>out, blocktypes[blocktype] % blocktext
1.233 -
1.234 - # Translate list items.
1.235 -
1.236 - elif blocktype == "list":
1.237 - for listmarker, listitem in get_list_items(blocktext):
1.238 - print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))
1.239 -
1.240 - # Translate table items.
1.241 -
1.242 - elif blocktype == "table":
1.243 - for cellsep, columns in get_table_rows(blocktext):
1.244 - moinsep = translate_cellsep(cellsep)
1.245 - print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep
1.246 -
1.247 - # Handle anonymous blocks.
1.248 -
1.249 - else:
1.250 - print >>out, translate_content(blocktext.rstrip())
1.251 -
1.252 - print >>out
1.253 + parts.append("%s\n" % translate_block(blocktype, blocktext))
1.254
1.255 # Handle sections.
1.256
1.257 @@ -448,14 +485,21 @@
1.258
1.259 mointype = sectiontypes.get(sectiontype)
1.260 if mointype:
1.261 - print >>out, "{{{#!%s" % mointype
1.262 + parts.append("{{{#!%s\n" % mointype)
1.263 if options:
1.264 - print >>out, "##", options
1.265 + parts.append("## %s\n" % options)
1.266 else:
1.267 - print >>out, "{{{",
1.268 - print >>out, translate_content(text, sectiontype),
1.269 - print >>out, "}}}"
1.270 - print >>out
1.271 + parts.append("{{{")
1.272 + parts.append(translate_content(text, sectiontype))
1.273 + parts.append("}}}\n")
1.274 +
1.275 + return "".join(parts)
1.276 +
1.277 +def parse(s, out):
1.278 +
1.279 + "Parse the content in the string 's', writing a translation to 'out'."
1.280 +
1.281 + out.write(parse_text(s))
1.282
1.283 if __name__ == "__main__":
1.284 s = sys.stdin.read()