# HG changeset patch # User Paul Boddie # Date 1317423376 -7200 # Node ID dc3d2ee38bae3135c0f26db8e00ec299f94f6c82 # Parent b18d0bb03a08ff2a789b87bdfbffdde5a2f8dc58 Separated the reader and accessor concerns so that different objects can provide records from resources and access to the details within records. diff -r b18d0bb03a08 -r dc3d2ee38bae simplex.py --- a/simplex.py Sat Oct 01 00:40:38 2011 +0200 +++ b/simplex.py Sat Oct 01 00:56:16 2011 +0200 @@ -26,13 +26,6 @@ encouraging multiple seeks and reads are likely to waste time compared to just performing a single read operation, even if that operation involves a larger quantity of data, at least for storage with hard disk access characteristics. - -Potential Improvements ----------------------- - -Ideally, the acquisition of records should be done more generally than just -reading lines, and the selection of matches should involve more than just -selecting the first column. """ import bisect @@ -56,11 +49,13 @@ values = record.split(self.delimiter) return [values[key] for key in self.keys] -def index_file(f, interval): +def make_index(reader, accessor, interval): """ - Index a file 'f', creating an index entry for a record after a given number, - defined by 'interval', have been read since the last entry. + Index a resource whose 'reader' provides records and whose 'accessor' can + yield the key for such records, creating an index entry for a record after a + given number of records, defined by 'interval', have been read since the + last entry was produced. """ l = [] @@ -69,8 +64,8 @@ current_key = None start_pos = 0 - for i, record in enumerate(f.get_records()): - key = f.get_key(record) + for i, record in enumerate(reader.get_records()): + key = accessor.get_key(record) # Where duplicate keys are permitted, the first record employing the key # must be available as an index entry. Otherwise, records preceding the @@ -88,11 +83,13 @@ return l -def find_with_index(f, l, term): +def find_with_index(reader, accessor, l, term): """ - Find in file 'f', using the given index list 'l', the given 'term', - returning a record employing the term or None if no such record was found. + Find in the resource whose 'reader' provides records and whose 'accessor' + can yield the key for such records, using the given index list 'l', the + given 'term', returning a record employing the term or None if no such + record was found. """ i = bisect.bisect_left(l, (term, None)) @@ -110,33 +107,23 @@ i = max(0, i - 1) found, pos = l[i] - f.seek(pos) - return find_in_file(f, term) + reader.seek(pos) + return find_in_file(reader, accessor, term) -def find_in_file(f, term): +def find_in_file(reader, accessor, term): """ - Find in file 'f' the given 'term', returning a record employing the term or - None if no such record was found. + Find in the resource whose 'reader' provides records and whose 'accessor' + can yield the key for such records, the given 'term', returning a record + employing the term or None if no such record was found. """ - for record in f.get_records(): - if term == f.get_key(record): + for record in reader.get_records(): + if term == accessor.get_key(record): return record return None -class Index: - - "An index abstraction." - - def __init__(self, entries, f): - self.entries = entries - self.f = f - - def find(self, term): - return find_with_index(self.f, self.entries, term) - def groups(l, length): "Split 'l' into groups of the given 'length'." diff -r b18d0bb03a08 -r dc3d2ee38bae test_indexed.py --- a/test_indexed.py Sat Oct 01 00:40:38 2011 +0200 +++ b/test_indexed.py Sat Oct 01 00:56:16 2011 +0200 @@ -13,17 +13,17 @@ sys.exit(1) f = open(filename) -tf = TextFile(f, keys) +reader = TextFile(f, keys) try: t = time.time() - l = index_file(tf, int(interval)) + l = make_index(reader, reader, int(interval)) print "Indexed in %s seconds." % (time.time() - t) # Now use the index. for term in terms: t = time.time() - line = find_with_index(tf, l, term) + line = find_with_index(reader, reader, l, term) if line: print "Found (at %s seconds)...\n%s" % (time.time() - t, line) diff -r b18d0bb03a08 -r dc3d2ee38bae test_scan.py --- a/test_scan.py Sat Oct 01 00:40:38 2011 +0200 +++ b/test_scan.py Sat Oct 01 00:56:16 2011 +0200 @@ -13,13 +13,13 @@ sys.exit(1) f = open(filename) -tf = TextFile(f, keys) +reader = TextFile(f, keys) try: for term in terms: - tf.seek(0) + reader.seek(0) t = time.time() - line = find_in_file(tf, term) + line = find_in_file(reader, reader, term) if line: print "Found (at %s seconds)...\n%s" % (time.time() - t, line) finally: