# HG changeset patch # User Paul Boddie # Date 1317419187 -7200 # Node ID baee3f51e8f51bee6308dec3182d74f6a0e33b8a # Parent 729f5f4a3fd2adf3ae4e4243e5ceca02f17ef686 Added a wrapper for text files and changed the basic functions to use the API provided by the wrapper. Fixed the copyright information. diff -r 729f5f4a3fd2 -r baee3f51e8f5 simplex.py --- a/simplex.py Fri Sep 30 00:44:45 2011 +0200 +++ b/simplex.py Fri Sep 30 23:46:27 2011 +0200 @@ -3,7 +3,7 @@ """ Simple indexing of sorted files. -Copyright (C) 2009, 2010 Paul Boddie +Copyright (C) 2011 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -37,29 +37,46 @@ import bisect -def index_by_lines(f, interval): +class TextFile: + + "A wrapper around text files." + + def __init__(self, f, key=0, delimiter=None): + self.f = f + self.key = key + self.delimiter = delimiter + + def seek(self, pos): + self.f.seek(pos) + + def get_records(self): + return self.f.xreadlines() + + def get_key(self, record): + return record.split(self.delimiter)[self.key] + +def index_file(f, interval): """ - Index a file 'f', creating an index entry for a line after a given number, - defined by 'interval', has been read since the last entry. + Index a file 'f', creating an index entry for a record after a given number, + defined by 'interval', have been read since the last entry. """ l = [] pos = 0 - for i, line in enumerate(f.xreadlines()): - columns = line.split("\t") + for i, record in enumerate(f.get_records()): if i % interval == 0: - l.append((columns[0], pos)) - pos += len(line) + l.append((f.get_key(record), pos)) + pos += len(record) return l -def find_line_with_index(f, l, term): +def find_with_index(f, l, term): """ Find in file 'f', using the given index list 'l', the given 'term', - returning a line employing the term or None if no such line was found. + returning a record employing the term or None if no such record was found. """ i = bisect.bisect_left(l, (term, None)) @@ -74,19 +91,18 @@ found, pos = l[i] f.seek(pos) - return find_line_in_file(f, term) + return find_in_file(f, term) -def find_line_in_file(f, term): +def find_in_file(f, term): """ - Find in file 'f' the given 'term', returning a line employing the term or - None if no such line was found. + Find in file 'f' the given 'term', returning a record employing the term or + None if no such record was found. """ - for line in f.xreadlines(): - columns = line.split("\t") - if term == columns[0]: - return line + for record in f.get_records(): + if term == f.get_key(record): + return record return None @@ -99,6 +115,6 @@ self.f = f def find(self, term): - return find_line_with_index(self.f, self.entries, term) + return find_with_index(self.f, self.entries, term) # vim: tabstop=4 expandtab shiftwidth=4 diff -r 729f5f4a3fd2 -r baee3f51e8f5 test_indexed.py --- a/test_indexed.py Fri Sep 30 00:44:45 2011 +0200 +++ b/test_indexed.py Fri Sep 30 23:46:27 2011 +0200 @@ -7,16 +7,17 @@ terms = sys.argv[3:] f = open(filename) +tf = TextFile(f) try: t = time.time() - l = index_by_lines(f, int(step)) + l = index_file(tf, int(step)) print "Indexed in %s seconds." % (time.time() - t) # Now use the index. for term in terms: t = time.time() - line = find_line_with_index(f, l, term) + line = find_with_index(tf, l, term) if line: print "Found (at %s seconds)...\n%s" % (time.time() - t, line) diff -r 729f5f4a3fd2 -r baee3f51e8f5 test_scan.py --- a/test_scan.py Fri Sep 30 00:44:45 2011 +0200 +++ b/test_scan.py Fri Sep 30 23:46:27 2011 +0200 @@ -7,12 +7,13 @@ terms = sys.argv[2:] f = open(filename) +tf = TextFile(f) try: for term in terms: - f.seek(0) + tf.seek(0) t = time.time() - line = find_line_in_file(f, term) + line = find_in_file(tf, term) if line: print "Found (at %s seconds)...\n%s" % (time.time() - t, line) finally: