1 #!/usr/bin/env python 2 3 """ 4 Simple indexing of sorted files. 5 6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 20 Motivations 21 ----------- 22 23 When searching files, the cost to scan an entire file typically exceeds the 24 cost to navigate to the appropriate region and to scan a smaller portion of the 25 file, especially for large files. At the same time, exotic data structures 26 encouraging multiple seeks and reads are likely to waste time compared to just 27 performing a single read operation, even if that operation involves a larger 28 quantity of data, at least for storage with hard disk access characteristics. 29 30 Potential Improvements 31 ---------------------- 32 33 Ideally, the acquisition of records should be done more generally than just 34 reading lines, and the selection of matches should involve more than just 35 selecting the first column. 36 """ 37 38 import bisect 39 40 def index_by_lines(f, interval): 41 42 """ 43 Index a file 'f', creating an index entry for a line after a given number, 44 defined by 'interval', has been read since the last entry. 45 """ 46 47 l = [] 48 pos = 0 49 50 for i, line in enumerate(f.xreadlines()): 51 columns = line.split("\t") 52 if i % interval == 0: 53 l.append((columns[0], pos)) 54 pos += len(line) 55 56 return l 57 58 def find_line_with_index(f, l, term): 59 60 """ 61 Find in file 'f', using the given index list 'l', the given 'term', 62 returning a line employing the term or None if no such line was found. 63 """ 64 65 i = bisect.bisect_left(l, (term, None)) 66 found, pos = l[i] 67 68 # Since the index is more coarse than the underlying file, the bisect left 69 # operation will most likely point to an index entry for later records than 70 # the desired one. 71 72 if found > term: 73 i = max(0, i - 1) 74 found, pos = l[i] 75 76 f.seek(pos) 77 return find_line_in_file(f, term) 78 79 def find_line_in_file(f, term): 80 81 """ 82 Find in file 'f' the given 'term', returning a line employing the term or 83 None if no such line was found. 84 """ 85 86 for line in f.xreadlines(): 87 columns = line.split("\t") 88 if term == columns[0]: 89 return line 90 91 return None 92 93 class Index: 94 95 "An index abstraction." 96 97 def __init__(self, entries, f): 98 self.entries = entries 99 self.f = f 100 101 def find(self, term): 102 return find_line_with_index(self.f, self.entries, term) 103 104 # vim: tabstop=4 expandtab shiftwidth=4