1.1 --- a/simplex.py Fri Sep 30 00:44:45 2011 +0200
1.2 +++ b/simplex.py Fri Sep 30 23:46:27 2011 +0200
1.3 @@ -3,7 +3,7 @@
1.4 """
1.5 Simple indexing of sorted files.
1.6
1.7 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
1.8 +Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
1.9
1.10 This program is free software; you can redistribute it and/or modify it under
1.11 the terms of the GNU General Public License as published by the Free Software
1.12 @@ -37,29 +37,46 @@
1.13
1.14 import bisect
1.15
1.16 -def index_by_lines(f, interval):
1.17 +class TextFile:
1.18 +
1.19 + "A wrapper around text files."
1.20 +
1.21 + def __init__(self, f, key=0, delimiter=None):
1.22 + self.f = f
1.23 + self.key = key
1.24 + self.delimiter = delimiter
1.25 +
1.26 + def seek(self, pos):
1.27 + self.f.seek(pos)
1.28 +
1.29 + def get_records(self):
1.30 + return self.f.xreadlines()
1.31 +
1.32 + def get_key(self, record):
1.33 + return record.split(self.delimiter)[self.key]
1.34 +
1.35 +def index_file(f, interval):
1.36
1.37 """
1.38 - Index a file 'f', creating an index entry for a line after a given number,
1.39 - defined by 'interval', has been read since the last entry.
1.40 + Index a file 'f', creating an index entry for a record after a given number
1.41 + of records, defined by 'interval', has been read since the last entry.
1.42 """
1.43
1.44 l = []
1.45 pos = 0
1.46
1.47 - for i, line in enumerate(f.xreadlines()):
1.48 - columns = line.split("\t")
1.49 + for i, record in enumerate(f.get_records()):
1.50 if i % interval == 0:
1.51 - l.append((columns[0], pos))
1.52 - pos += len(line)
1.53 + l.append((f.get_key(record), pos))
1.54 + pos += len(record)
1.55
1.56 return l
1.57
1.58 -def find_line_with_index(f, l, term):
1.59 +def find_with_index(f, l, term):
1.60
1.61 """
1.62 Find in file 'f', using the given index list 'l', the given 'term',
1.63 - returning a line employing the term or None if no such line was found.
1.64 + returning a record employing the term or None if no such record was found.
1.65 """
1.66
1.67 i = bisect.bisect_left(l, (term, None))
1.68 @@ -74,19 +91,18 @@
1.69 found, pos = l[i]
1.70
1.71 f.seek(pos)
1.72 - return find_line_in_file(f, term)
1.73 + return find_in_file(f, term)
1.74
1.75 -def find_line_in_file(f, term):
1.76 +def find_in_file(f, term):
1.77
1.78 """
1.79 - Find in file 'f' the given 'term', returning a line employing the term or
1.80 - None if no such line was found.
1.81 + Find in file 'f' the given 'term', returning a record employing the term or
1.82 + None if no such record was found.
1.83 """
1.84
1.85 - for line in f.xreadlines():
1.86 - columns = line.split("\t")
1.87 - if term == columns[0]:
1.88 - return line
1.89 + for record in f.get_records():
1.90 + if term == f.get_key(record):
1.91 + return record
1.92
1.93 return None
1.94
1.95 @@ -99,6 +115,6 @@
1.96 self.f = f
1.97
1.98 def find(self, term):
1.99 - return find_line_with_index(self.f, self.entries, term)
1.100 + return find_with_index(self.f, self.entries, term)
1.101
1.102 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/test_indexed.py Fri Sep 30 00:44:45 2011 +0200
2.2 +++ b/test_indexed.py Fri Sep 30 23:46:27 2011 +0200
2.3 @@ -7,16 +7,17 @@
2.4 terms = sys.argv[3:]
2.5
2.6 f = open(filename)
2.7 +tf = TextFile(f)
2.8 try:
2.9 t = time.time()
2.10 - l = index_by_lines(f, int(step))
2.11 + l = index_file(tf, int(step))
2.12 print "Indexed in %s seconds." % (time.time() - t)
2.13
2.14 # Now use the index.
2.15
2.16 for term in terms:
2.17 t = time.time()
2.18 - line = find_line_with_index(f, l, term)
2.19 + line = find_with_index(tf, l, term)
2.20 if line:
2.21 print "Found (at %s seconds)...\n%s" % (time.time() - t, line)
2.22
3.1 --- a/test_scan.py Fri Sep 30 00:44:45 2011 +0200
3.2 +++ b/test_scan.py Fri Sep 30 23:46:27 2011 +0200
3.3 @@ -7,12 +7,13 @@
3.4 terms = sys.argv[2:]
3.5
3.6 f = open(filename)
3.7 +tf = TextFile(f)
3.8 try:
3.9 for term in terms:
3.10 - f.seek(0)
3.11 + tf.seek(0)
3.12
3.13 t = time.time()
3.14 - line = find_line_in_file(f, term)
3.15 + line = find_in_file(tf, term)
3.16 if line:
3.17 print "Found (at %s seconds)...\n%s" % (time.time() - t, line)
3.18 finally: