# HG changeset patch # User Paul Boddie # Date 1317422438 -7200 # Node ID b18d0bb03a08ff2a789b87bdfbffdde5a2f8dc58 # Parent 6397ceae2ddaa3aa1ea4c2d28a03338d580ff979 Added support for compound keys. Fixed out of range index access. Added a groups convenience function for slicing up sequences into fixed length groups. diff -r 6397ceae2dda -r b18d0bb03a08 simplex.py --- a/simplex.py Sat Oct 01 00:01:04 2011 +0200 +++ b/simplex.py Sat Oct 01 00:40:38 2011 +0200 @@ -41,9 +41,9 @@ "A wrapper around text files." - def __init__(self, f, key=0, delimiter=None): + def __init__(self, f, keys=None, delimiter=None): self.f = f - self.key = key + self.keys = keys or [0] self.delimiter = delimiter def seek(self, pos): @@ -53,7 +53,8 @@ return self.f.xreadlines() def get_key(self, record): - return record.split(self.delimiter)[self.key] + values = record.split(self.delimiter) + return [values[key] for key in self.keys] def index_file(f, interval): @@ -95,7 +96,11 @@ """ i = bisect.bisect_left(l, (term, None)) - found, pos = l[i] + + try: + found, pos = l[i] + except IndexError: + return None # Since the index is more coarse than the underlying file, the bisect left # operation will most likely point to an index entry for later records than @@ -132,4 +137,20 @@ def find(self, term): return find_with_index(self.f, self.entries, term) +def groups(l, length): + + "Split 'l' into groups of the given 'length'." + + if length <= 0: + raise ValueError, "Groups must be greater than zero." + + i = 0 + g = [] + + while i < len(l): + g.append(l[i:i+length]) + i += length + + return g + # vim: tabstop=4 expandtab shiftwidth=4 diff -r 6397ceae2dda -r b18d0bb03a08 test_indexed.py --- a/test_indexed.py Sat Oct 01 00:01:04 2011 +0200 +++ b/test_indexed.py Sat Oct 01 00:40:38 2011 +0200 @@ -3,14 +3,20 @@ from simplex import * import sys, time -filename, step = sys.argv[1:3] -terms = sys.argv[3:] +try: + separator = sys.argv.index("--") + filename, interval = sys.argv[1:3] + keys = map(int, sys.argv[3:separator]) + terms = groups(sys.argv[separator+1:], len(keys)) +except (IndexError, ValueError): + print >>sys.stderr, "Usage: %s ... -- ..." % sys.argv[0] + sys.exit(1) f = open(filename) -tf = TextFile(f) +tf = TextFile(f, keys) try: t = time.time() - l = index_file(tf, int(step)) + l = index_file(tf, int(interval)) print "Indexed in %s seconds." % (time.time() - t) # Now use the index. diff -r 6397ceae2dda -r b18d0bb03a08 test_scan.py --- a/test_scan.py Sat Oct 01 00:01:04 2011 +0200 +++ b/test_scan.py Sat Oct 01 00:40:38 2011 +0200 @@ -3,11 +3,17 @@ from simplex import * import sys, time -filename = sys.argv[1] -terms = sys.argv[2:] +try: + separator = sys.argv.index("--") + filename = sys.argv[1] + keys = map(int, sys.argv[2:separator]) + terms = groups(sys.argv[separator+1:], len(keys)) +except (IndexError, ValueError): + print >>sys.stderr, "Usage: %s ... -- ..." % sys.argv[0] + sys.exit(1) f = open(filename) -tf = TextFile(f) +tf = TextFile(f, keys) try: for term in terms: tf.seek(0)