# HG changeset patch
# User Paul Boddie
# Date 1317593873 -7200
# Node ID 0897d076edbb457d2e8f15a3bfa5ce7287eca097
# Parent 94f93801356cb6e8b451e362b3b231b6e944b3fe
Permitted the make_index function to use existing sequence-like objects.
Added tests of index writing and reading.

diff -r 94f93801356c -r 0897d076edbb simplex/indexers.py
--- a/simplex/indexers.py	Sun Oct 02 20:43:03 2011 +0200
+++ b/simplex/indexers.py	Mon Oct 03 00:17:53 2011 +0200
@@ -66,16 +66,21 @@
         self.count += 1
         self.pos += len(record)
 
-def make_index(reader, get_key, interval):
+def make_index(reader, get_key, interval, output=None):
 
     """
     Index a resource whose 'reader' provides records, using a 'get_key'
     operation to yield the key for such records, creating an index entry for a
     record after a given number of records, defined by 'interval', have been
     read since the last entry was produced.
+
+    Either append index entries to the given 'output' sequence, or populate a
+    new list.
     """
 
-    l = []
+    # Compare with None explicitly, since an empty 'output' sequence is false
+    # but must still be the object receiving the index entries.
+    l = output if output is not None else []
     indexer = Indexer(l, get_key, interval)
 
     for record in reader:
diff -r 94f93801356c -r 0897d076edbb test_read.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test_read.py	Mon Oct 03 00:17:53 2011 +0200
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+from simplex import *
+import sys, time
+
+def from_index_record(convert, record):
+
+    """
+    Parse the tab-delimited index 'record', returning a (key, position) tuple
+    where the key is the result of applying 'convert' to all fields but the
+    last, and the position is the last field converted to an integer.
+    """
+
+    values = record.split("\t")
+    key = convert(values[:-1])
+    pos = int(values[-1])
+    return key, pos
+
+try:
+    separator = sys.argv.index("--")
+    filename, numeric, index_filename = sys.argv[1:4]
+    fields = map(int, sys.argv[4:separator])
+    terms = groups(sys.argv[separator+1:], len(fields))
+    numeric = numeric == "true"
+except (IndexError, ValueError):
+    print >>sys.stderr, "Usage: %s FILENAME NUMERIC INDEX_FILENAME FIELD... -- TERM..." % sys.argv[0]
+    sys.exit(1)
+
+f = open(filename)
+fi = open(index_filename)
+accessor = DelimitedRecord(fields, numeric=numeric)
+
+try:
+    t = time.time()
+    l = [from_index_record(accessor.convert, record) for record in fi]
+    print "Read index (at %s seconds, with %d entries)." % (time.time() - t, len(l))
+
+    # Now use the index.
+
+    for term in terms:
+        t = time.time()
+        line = find_with_index(f, accessor.get_key, l, accessor.convert(term))
+        if line:
+            print "Found (at %s seconds)...\n%s" % (time.time() - t, line)
+
+finally:
+    f.close()
+    fi.close()
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r 94f93801356c -r 0897d076edbb test_write.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test_write.py	Mon Oct 03 00:17:53 2011 +0200
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+
+from simplex import *
+import sys, time
+
+class IndexWriter:
+
+    "A tab-delimited file writer."
+
+    def __init__(self, f):
+
+        "Initialise the writer with 'f', an object providing a 'write' method."
+
+        self.f = f
+
+    def append(self, entry):
+
+        """
+        Write the given 'entry', a (key, position) tuple whose key is a list,
+        as a single line of tab-separated values ending with the position.
+        """
+
+        key, pos = entry
+        entry = key + [pos]
+        self.f.write("\t".join(map(str, entry)) + "\n")
+
+try:
+    filename, numeric, interval, index_filename = sys.argv[1:5]
+    fields = map(int, sys.argv[5:])
+    numeric = numeric == "true"
+except (IndexError, ValueError):
+    print >>sys.stderr, "Usage: %s FILENAME NUMERIC INTERVAL INDEX_FILENAME FIELD..." % sys.argv[0]
+    sys.exit(1)
+
+f = open(filename)
+fi = open(index_filename, "w")
+accessor = DelimitedRecord(fields, numeric=numeric)
+writer = IndexWriter(fi)
+
+try:
+    t = time.time()
+    make_index(f, accessor.get_key, int(interval), writer)
+    print "Indexed in %s seconds." % (time.time() - t)
+
+finally:
+    f.close()
+    fi.close()
+
+# vim: tabstop=4 expandtab shiftwidth=4