1.1 --- a/simplex/indexers.py Sun Oct 02 20:43:03 2011 +0200
1.2 +++ b/simplex/indexers.py Mon Oct 03 00:17:53 2011 +0200
1.3 @@ -66,16 +66,19 @@
1.4 self.count += 1
1.5 self.pos += len(record)
1.6
1.7 -def make_index(reader, get_key, interval):
1.8 +def make_index(reader, get_key, interval, output=None):
1.9
1.10 """
1.11 Index a resource whose 'reader' provides records, using a 'get_key'
1.12 operation to yield the key for such records, creating an index entry for a
1.13 record after a given number of records, defined by 'interval', have been
1.14 read since the last entry was produced.
1.15 +
1.16 + Either append index entries to the given 'output' sequence, or populate a
1.17 + new list.
1.18 """
1.19
1.20 - l = []
1.21 + l = output or []
1.22 indexer = Indexer(l, get_key, interval)
1.23
1.24 for record in reader:
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
2.2 +++ b/test_read.py Mon Oct 03 00:17:53 2011 +0200
2.3 @@ -0,0 +1,43 @@
2.4 +#!/usr/bin/env python
2.5 +
2.6 +from simplex import *
2.7 +import sys, time
2.8 +
def from_index_record(convert, record):

    """
    Decode a single tab-delimited index 'record', returning a (key, position)
    tuple.

    Every field except the last is passed, as a list of strings, to 'convert'
    in order to produce the key. The last field is parsed as the integer
    position; 'int' ignores surrounding whitespace, so a trailing newline on
    the record is harmless.
    """

    parts = record.split("\t")
    return convert(parts[:-1]), int(parts[-1])
2.14 +
2.15 +try:
2.16 + separator = sys.argv.index("--")
2.17 + filename, numeric, index_filename = sys.argv[1:4]
2.18 + fields = map(int, sys.argv[4:separator])
2.19 + terms = groups(sys.argv[separator+1:], len(fields))
2.20 + numeric = numeric == "true"
2.21 +except (IndexError, ValueError):
2.22 + print >>sys.stderr, "Usage: %s <filename> <numeric> <index> <field>... -- <term value>..." % sys.argv[0]
2.23 + sys.exit(1)
2.24 +
2.25 +f = open(filename)
2.26 +fi = open(index_filename)
2.27 +accessor = DelimitedRecord(fields, numeric=numeric)
2.28 +
2.29 +try:
2.30 + t = time.time()
2.31 + l = [from_index_record(accessor.convert, record) for record in fi]
2.32 + print "Read index (at %s seconds, with %d entries)." % (time.time() - t, len(l))
2.33 +
2.34 + # Now use the index.
2.35 +
2.36 + for term in terms:
2.37 + t = time.time()
2.38 + line = find_with_index(f, accessor.get_key, l, accessor.convert(term))
2.39 + if line:
2.40 + print "Found (at %s seconds)...\n%s" % (time.time() - t, line)
2.41 +
2.42 +finally:
2.43 + f.close()
2.44 + fi.close()
2.45 +
2.46 +# vim: tabstop=4 expandtab shiftwidth=4
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2 +++ b/test_write.py Mon Oct 03 00:17:53 2011 +0200
3.3 @@ -0,0 +1,40 @@
3.4 +#!/usr/bin/env python
3.5 +
3.6 +from simplex import *
3.7 +import sys, time
3.8 +
class IndexWriter:

    """
    A tab-delimited file writer.

    Instances provide the 'append' operation so that they can be given to
    'make_index' as the 'output' receiving index entries. No '__len__' method
    is defined: 'make_index' selects its output using 'output or []', so an
    instance must always be considered true.
    """

    def __init__(self, f):

        "Initialise the writer with the file-like object 'f' opened for writing."

        self.f = f

    def append(self, entry):

        """
        Write the given 'entry', a (key, pos) tuple, as a single line consisting
        of the values in the key followed by the position, separated by tab
        characters.

        The key may be any sequence of values (a list or a tuple, for example).
        Values are converted using 'str' and are not escaped, so they must not
        contain tab or newline characters.
        """

        key, pos = entry

        # Copy the key into a list so that tuple keys work as well as list keys.

        values = list(key) + [pos]
        self.f.write("\t".join(map(str, values)) + "\n")
3.20 +
3.21 +try:
3.22 + filename, numeric, interval, index_filename = sys.argv[1:5]
3.23 + fields = map(int, sys.argv[5:])
3.24 + numeric = numeric == "true"
3.25 +except (IndexError, ValueError):
3.26 + print >>sys.stderr, "Usage: %s <filename> <numeric> <interval> <index> <field>..." % sys.argv[0]
3.27 + sys.exit(1)
3.28 +
3.29 +f = open(filename)
3.30 +fi = open(index_filename, "w")
3.31 +accessor = DelimitedRecord(fields, numeric=numeric)
3.32 +writer = IndexWriter(fi)
3.33 +
3.34 +try:
3.35 + t = time.time()
3.36 + make_index(f, accessor.get_key, int(interval), writer)
3.37 + print "Indexed in %s seconds." % (time.time() - t)
3.38 +
3.39 +finally:
3.40 + f.close()
3.41 + fi.close()
3.42 +
3.43 +# vim: tabstop=4 expandtab shiftwidth=4