Improved table region and table row detection. Introduced recursive processing of table cells, so that lists and other blocks can be recognised and translated inside cells. Introduced usage of ImprovedTableParser syntax in order to handle complicated table layout. Introduced notation conversion for plain text fragments.

     1.1 --- a/wikiparser.py	Sat Feb 23 01:03:23 2013 +0100
     1.2 +++ b/wikiparser.py	Sun Feb 24 23:42:06 2013 +0100
     1.3 @@ -75,7 +75,7 @@
     1.4  # Heading, table and list extraction.
     1.5  
     1.6  list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
     1.7 -table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
     1.8 +table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
     1.9  blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
    1.10  
    1.11  blockelement_regexp = re.compile(
    1.12 @@ -207,6 +207,51 @@
    1.13  content_regexp = re.compile(content_regexp_str)
    1.14  table_content_regexp = re.compile(table_content_regexp_str)
    1.15  
    1.16 +# Notation conversion.
    1.17 +
    1.18 +notation_mapping = [
    1.19 +    (r"\!", "!"),
    1.20 +    (r"\-", "-"),
    1.21 +    (r"\\""\n", "<<BR>> "),
    1.22 +    (r"\\ ", " "),
    1.23 +    ]
    1.24 +
    1.25 +# Translation helpers.
    1.26 +
    1.27 +markers = {
    1.28 +    "*" : "*",
    1.29 +    "#" : "1.",
    1.30 +    "-" : "*",
    1.31 +    }
    1.32 +
    1.33 +def translate_marker(marker):
    1.34 +
    1.35 +    "Translate the given 'marker' to a suitable Moin representation."
    1.36 +
    1.37 +    return " " * len(marker) + markers[marker[-1]]
    1.38 +
    1.39 +cellseps = {
    1.40 +    "|" : "||",
    1.41 +    "||" : "||",
    1.42 +    }
    1.43 +
    1.44 +cellextra = {
    1.45 +    "|" : "",
    1.46 +    "||" : "'''",
    1.47 +    }
    1.48 +
    1.49 +def translate_cellsep(cellsep):
    1.50 +
    1.51 +    "Translate the given 'cellsep' to a suitable Moin representation."
    1.52 +
    1.53 +    return cellseps[cellsep]
    1.54 +
    1.55 +def translate_cell(cellsep, text):
    1.56 +
    1.57 +    "Using 'cellsep', translate the cell 'text'."
    1.58 +
    1.59 +    return cellextra[cellsep] + parse_text(text) + cellextra[cellsep]
    1.60 +
    1.61  def translate_content_match(match):
    1.62  
    1.63      "Translate the content described by the given 'match', returning a string."
    1.64 @@ -296,7 +341,15 @@
    1.65          return ",,%s,," % translate_content(match.group("subtext"))
    1.66  
    1.67      else:
    1.68 -        return match.group()
    1.69 +        return translate_text(match.group())
    1.70 +
    1.71 +def translate_text(s):
    1.72 +
    1.73 +    "Translate the plain text string 's', converting notation."
    1.74 +
    1.75 +    for before, after in notation_mapping:
    1.76 +        s = s.replace(before, after)
    1.77 +    return s
    1.78  
    1.79  def translate_content(text, sectiontype=None):
    1.80  
    1.81 @@ -311,7 +364,7 @@
    1.82      last = 0
    1.83      for match in content_regexp.finditer(text):
    1.84          start, end = match.span()
    1.85 -        parts.append(text[last:start])
    1.86 +        parts.append(translate_text(text[last:start]))
    1.87  
    1.88          # Handle unformatted sections.
    1.89  
    1.90 @@ -322,22 +375,64 @@
    1.91  
    1.92          last = end
    1.93  
    1.94 -    parts.append(text[last:])
    1.95 +    parts.append(translate_text(text[last:]))
    1.96      return "".join(parts)
    1.97  
    1.98 +def translate_block(blocktype, blocktext):
    1.99 +
   1.100 +    "Translate the block with the given 'blocktype' and 'blocktext'."
   1.101 +
   1.102 +    parts = []
   1.103 +
   1.104 +    # Translate headings and blockquotes.
   1.105 +
   1.106 +    if blocktypes.has_key(blocktype):
   1.107 +        parts.append(blocktypes[blocktype] % blocktext)
   1.108 +
   1.109 +    # Translate list items.
   1.110 +
   1.111 +    elif blocktype == "list":
   1.112 +        for listmarker, listitem in get_list_items(blocktext):
   1.113 +            parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem)))
   1.114 +
   1.115 +    # Translate table items.
   1.116 +
   1.117 +    elif blocktype == "table":
   1.118 +        parts.append("{{{#!table")
   1.119 +        first = True
   1.120 +        for cellsep, columns in get_table_rows(blocktext):
   1.121 +            if not first:
   1.122 +                parts.append("==")
   1.123 +            else:
   1.124 +                first = False
   1.125 +            moinsep = translate_cellsep(cellsep)
   1.126 +            parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns]))
   1.127 +        parts.append("}}}")
   1.128 +
   1.129 +    # Handle anonymous blocks.
   1.130 +
   1.131 +    else:
   1.132 +        parts.append(translate_content(blocktext).rstrip())
   1.133 +
   1.134 +    return "\n".join(parts) + "\n"
   1.135 +
   1.136  def get_table_rows(text):
   1.137  
   1.138      "Return a list of (cellsep, columns) tuples for the given table 'text'."
   1.139  
   1.140      rows = []
   1.141  
   1.142 -    for line in text.split("\n"):
   1.143 +    for row in text.split("|\n"):
   1.144 +        if not row:
   1.145 +            break
   1.146 +
   1.147 +        row += "|"
   1.148          cellsep = None
   1.149          columns = [""]
   1.150          last = 0
   1.151 -        for match in table_content_regexp.finditer(line):
   1.152 +        for match in table_content_regexp.finditer(row):
   1.153              start, end = match.span()
   1.154 -            columns[-1] += line[last:start]
   1.155 +            columns[-1] += row[last:start]
   1.156  
   1.157              if match.group("celltype"):
   1.158                  if cellsep is None:
   1.159 @@ -348,49 +443,13 @@
   1.160  
   1.161              last = end
   1.162  
   1.163 -        columns[-1] += line[last:]
   1.164 +        columns[-1] += row[last:]
   1.165  
   1.166          if cellsep:
   1.167              rows.append((cellsep, columns[1:-1]))
   1.168  
   1.169      return rows
   1.170  
   1.171 -# Translation helpers.
   1.172 -
   1.173 -markers = {
   1.174 -    "*" : "*",
   1.175 -    "#" : "1.",
   1.176 -    "-" : "*",
   1.177 -    }
   1.178 -
   1.179 -def translate_marker(marker):
   1.180 -
   1.181 -    "Translate the given 'marker' to a suitable Moin representation."
   1.182 -
   1.183 -    return " " * len(marker) + markers[marker[-1]]
   1.184 -
   1.185 -cellseps = {
   1.186 -    "|" : "||",
   1.187 -    "||" : "||",
   1.188 -    }
   1.189 -
   1.190 -cellextra = {
   1.191 -    "|" : "",
   1.192 -    "||" : "'''",
   1.193 -    }
   1.194 -
   1.195 -def translate_cellsep(cellsep):
   1.196 -
   1.197 -    "Translate the given 'cellsep' to a suitable Moin representation."
   1.198 -
   1.199 -    return cellseps[cellsep]
   1.200 -
   1.201 -def translate_cell(cellsep, text):
   1.202 -
   1.203 -    "Using 'cellsep', translate the cell 'text'."
   1.204 -
   1.205 -    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]
   1.206 -
   1.207  sectiontypes = {
   1.208      "code" : "",
   1.209      "noformat" : "",
   1.210 @@ -403,9 +462,11 @@
   1.211  
   1.212  # General parsing.
   1.213  
   1.214 -def parse(s, out):
   1.215 +def parse_text(s):
   1.216  
   1.217 -    "Parse the content in the string 's', writing a translation to 'out'."
   1.218 +    "Parse the content in the string 's', returning the translation."
   1.219 +
   1.220 +    parts = []
   1.221  
   1.222      for type, text in get_regions(s):
   1.223  
   1.224 @@ -413,31 +474,7 @@
   1.225  
   1.226          if type is None:
   1.227              for blocktype, blocktext in get_blocks(text):
   1.228 -
   1.229 -                # Translate headings and blockquotes.
   1.230 -
   1.231 -                if blocktypes.has_key(blocktype):
   1.232 -                    print >>out, blocktypes[blocktype] % blocktext
   1.233 -
   1.234 -                # Translate list items.
   1.235 -
   1.236 -                elif blocktype == "list":
   1.237 -                    for listmarker, listitem in get_list_items(blocktext):
   1.238 -                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))
   1.239 -
   1.240 -                # Translate table items.
   1.241 -
   1.242 -                elif blocktype == "table":
   1.243 -                    for cellsep, columns in get_table_rows(blocktext):
   1.244 -                        moinsep = translate_cellsep(cellsep)
   1.245 -                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep
   1.246 -
   1.247 -                # Handle anonymous blocks.
   1.248 -
   1.249 -                else:
   1.250 -                    print >>out, translate_content(blocktext.rstrip())
   1.251 -
   1.252 -                print >>out
   1.253 +                parts.append("%s\n" % translate_block(blocktype, blocktext))
   1.254  
   1.255          # Handle sections.
   1.256  
   1.257 @@ -448,14 +485,21 @@
   1.258  
   1.259              mointype = sectiontypes.get(sectiontype)
   1.260              if mointype:
   1.261 -                print >>out, "{{{#!%s" % mointype
   1.262 +                parts.append("{{{#!%s\n" % mointype)
   1.263                  if options:
   1.264 -                    print >>out, "##", options
   1.265 +                    parts.append("## %s\n" % options)
   1.266              else:
   1.267 -                print >>out, "{{{",
   1.268 -            print >>out, translate_content(text, sectiontype),
   1.269 -            print >>out, "}}}"
   1.270 -            print >>out
   1.271 +                parts.append("{{{")
   1.272 +            parts.append(translate_content(text, sectiontype))
   1.273 +            parts.append("}}}\n")
   1.274 +
   1.275 +    return "".join(parts)
   1.276 +
   1.277 +def parse(s, out):
   1.278 +
   1.279 +    "Parse the content in the string 's', writing a translation to 'out'."
   1.280 +
   1.281 +    out.write(parse_text(s))
   1.282  
   1.283  if __name__ == "__main__":
   1.284      s = sys.stdin.read()
2013-02-24	Paul Boddie	Improved table region and table row detection. Introduced recursive processing of table cells, so that lists and other blocks can be recognised and translated inside cells. Introduced usage of ImprovedTableParser syntax in order to handle complicated table layout. Introduced notation conversion for plain text fragments.
			wikiparser.py (file)