Merged list and "block element" (heading and blockquote) extraction, and also included table extraction, in order to simplify the processing hierarchy. Added elementary translation of list items and table rows, attempting to prevent link and image syntax from being interpreted as table cell separators.

     1.1 --- a/parser.py	Sun Apr 22 20:48:55 2012 +0200
     1.2 +++ b/parser.py	Sun Apr 22 21:06:35 2012 +0200
     1.3 @@ -70,23 +70,34 @@
     1.4      else:
     1.5          return None, s
     1.6  
     1.7 -# List extraction.
     1.8 +# Heading, table and list extraction.
     1.9  
    1.10  list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*\n((?P=listtype).*(?:\n|$))*"
    1.11 -list_regexp = re.compile(list_regexp_str, re.MULTILINE)
    1.12 +table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
    1.13 +blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
    1.14  
    1.15 -def get_lists(s):
    1.16 +blockelement_regexp = re.compile(
    1.17 +    "(" + list_regexp_str + ")"
    1.18 +    "|"
    1.19 +    "(" + table_regexp_str + ")"
    1.20 +    "|"
    1.21 +    "(" + blocktext_regexp_str + ")",
    1.22 +    re.MULTILINE
    1.23 +    )
    1.24 +
    1.25 +def get_block_elements(s):
    1.26  
    1.27      """
    1.28 -    Extract lists from the given string 's'.
    1.29 +    Extract headings, tables and lists from the given string 's'.
    1.30      """
    1.31  
    1.32      last = 0
    1.33      blocks = []
    1.34 -    for match in list_regexp.finditer(s):
    1.35 +    for match in blockelement_regexp.finditer(s):
    1.36          start, end = match.span()
    1.37 +        matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")
    1.38          blocks.append((None, s[last:start]))
    1.39 -        blocks.append(("list", s[start:end]))
    1.40 +        blocks.append((matchtype, match.group("text") or s[start:end]))
    1.41          last = end
    1.42      blocks.append((None, s[last:]))
    1.43      return blocks
    1.44 @@ -107,9 +118,6 @@
    1.45  
    1.46  # Block inspection.
    1.47  
    1.48 -blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
    1.49 -blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)
    1.50 -
    1.51  def get_blocks(s):
    1.52  
    1.53      """
    1.54 @@ -119,9 +127,9 @@
    1.55  
    1.56      blocks = []
    1.57  
    1.58 -    for blocktype, blocktext in get_lists(s):
    1.59 +    for blocktype, blocktext in get_block_elements(s):
    1.60  
    1.61 -        # Collect list blocks.
    1.62 +        # Collect heading, list and table blocks.
    1.63  
    1.64          if blocktype is not None:
    1.65              blocks.append((blocktype, blocktext))
    1.66 @@ -130,32 +138,71 @@
    1.67  
    1.68          else:
    1.69              for block in get_basic_blocks(blocktext):
    1.70 -                last = 0
    1.71 -                for match in blocktext_regexp.finditer(block):
    1.72 -                    start, end = match.span()
    1.73 -
    1.74 -                    # Add preceding non-block text.
    1.75 -
    1.76 -                    preceding = block[last:start]
    1.77 -                    if preceding.strip():
    1.78 -                        blocks.append((None, preceding))
    1.79 -
    1.80 -                    # Add the subblock.
    1.81 -
    1.82 -                    blocks.append((match.group("type"), match.group("text")))
    1.83 -                    last = end
    1.84 -
    1.85 -                # Add trailing non-block text.
    1.86 -
    1.87 -                trailing = block[last:]
    1.88 -                if trailing.strip():
    1.89 -                    blocks.append((None, trailing))
    1.90 +                blocks.append((None, block))
    1.91  
    1.92      return blocks
    1.93  
    1.94 -listitem_regexp_str = r"^([*#-])+\s*(.*)$"
    1.95 +# List item inspection.
    1.96 +
    1.97 +listitem_regexp_str = r"^(?P<marker>[*#-])+\s*(?P<text>.*)$"
    1.98  listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
    1.99  
   1.100 +def get_list_items(text):
   1.101 +
   1.102 +    "Return a list of (marker, text) tuples for the given list 'text'."
   1.103 +
   1.104 +    items = []
   1.105 +
   1.106 +    for match in listitem_regexp.finditer(text):
   1.107 +        items.append((match.group("marker"), match.group("text")))
   1.108 +
   1.109 +    return items
   1.110 +
   1.111 +# Table row inspection.
   1.112 +
   1.113 +link_regexp_str = r"[[](?P<linktext>.*?)]"
   1.114 +image_regexp_str = r"!(?P<imagetext>.*?)!"
   1.115 +cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
   1.116 +content_regexp = re.compile(
   1.117 +    "(" + link_regexp_str + ")"
   1.118 +    "|"
   1.119 +    "(" + image_regexp_str + ")"
   1.120 +    "|"
   1.121 +    "(" + cellsep_regexp_str + ")"
   1.122 +    )
   1.123 +
   1.124 +def get_table_rows(text):
   1.125 +
   1.126 +    "Return a list of (cellsep, columns) tuples for the given table 'text'."
   1.127 +
   1.128 +    rows = []
   1.129 +
   1.130 +    for line in text.split("\n"):
   1.131 +        cellsep = None
   1.132 +        columns = [""]
   1.133 +        last = 0
   1.134 +        for match in content_regexp.finditer(line):
   1.135 +            start, end = match.span()
   1.136 +            columns[-1] += line[last:start]
   1.137 +
   1.138 +            if match.group("celltype"):
   1.139 +                if cellsep is None:
   1.140 +                    cellsep = match.group("celltype")
   1.141 +                columns.append("")
   1.142 +            else:
   1.143 +                columns[-1] += line[start:end]
   1.144 +
   1.145 +            last = end
   1.146 +
   1.147 +        columns[-1] += line[last:]
   1.148 +
   1.149 +        if cellsep:
   1.150 +            rows.append((cellsep, columns[1:-1]))
   1.151 +
   1.152 +    return rows
   1.153 +
   1.154 +# General parsing and translation.
   1.155 +
   1.156  blocktypes = {
   1.157      "h1" : "= %s =",
   1.158      "h2" : "== %s ==",
   1.159 @@ -166,6 +213,40 @@
   1.160      "bq" : "{{{%s}}}",
   1.161      }
   1.162  
   1.163 +markers = {
   1.164 +    "*" : "*",
   1.165 +    "#" : "1.",
   1.166 +    "-" : "*",
   1.167 +    }
   1.168 +
   1.169 +def translate_marker(marker):
   1.170 +
   1.171 +    "Translate the given 'marker' to a suitable Moin representation."
   1.172 +
   1.173 +    return " " * len(marker) + markers[marker[-1]]
   1.174 +
   1.175 +cellseps = {
   1.176 +    "|" : "||",
   1.177 +    "||" : "||",
   1.178 +    }
   1.179 +
   1.180 +cellextra = {
   1.181 +    "|" : "",
   1.182 +    "||" : "'''",
   1.183 +    }
   1.184 +
   1.185 +def translate_cellsep(cellsep):
   1.186 +
   1.187 +    "Translate the given 'cellsep' to a suitable Moin representation."
   1.188 +
   1.189 +    return cellseps[cellsep]
   1.190 +
   1.191 +def translate_cell(cellsep, text):
   1.192 +
   1.193 +    "Using 'cellsep', translate the cell 'text'."
   1.194 +
   1.195 +    return cellextra[cellsep] + text + cellextra[cellsep]
   1.196 +
   1.197  def parse(s, out):
   1.198  
   1.199      "Parse the content in the string 's', writing a translation to 'out'."
   1.200 @@ -176,16 +257,39 @@
   1.201  
   1.202          if type is None:
   1.203              for blocktype, blocktext in get_blocks(text):
   1.204 +
   1.205 +                # Translate headings and blockquotes.
   1.206 +
   1.207                  if blocktypes.has_key(blocktype):
   1.208                      print >>out, blocktypes[blocktype] % blocktext
   1.209 +
   1.210 +                # Translate list items.
   1.211 +
   1.212 +                elif blocktype == "list":
   1.213 +                    for listmarker, listitem in get_list_items(blocktext):
   1.214 +                        print >>out, "%s %s" % (translate_marker(listmarker), listitem)
   1.215 +
   1.216 +                # Translate table items.
   1.217 +
   1.218 +                elif blocktype == "table":
   1.219 +                    for cellsep, columns in get_table_rows(blocktext):
   1.220 +                        moinsep = translate_cellsep(cellsep)
   1.221 +                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep
   1.222 +
   1.223 +                # Handle anonymous blocks.
   1.224 +
   1.225                  else:
   1.226 -                    print >>out, blocktext
   1.227 +                    print >>out, blocktext.rstrip()
   1.228 +
   1.229 +                print >>out
   1.230  
   1.231          # Handle sections.
   1.232  
   1.233          else:
   1.234 -            print >>out, "Region type:", type
   1.235 -            print >>out, text
   1.236 +            print >>out, "{{{",
   1.237 +            print >>out, text,
   1.238 +            print >>out, "}}}"
   1.239 +            print >>out
   1.240  
   1.241  if __name__ == "__main__":
   1.242      import sys
2012-04-22	Paul Boddie	raw files shortlog changelog graph	Merged list and "block element" (heading and blockquote) extraction, and also included table extraction, in order to simplify the processing hierarchy. Added elementary translation of list items and table rows, attempting to prevent link and image syntax from being interpreted as table cell separators.
			parser.py (file)