1 #!/usr/bin/env python 2 3 """ 4 Simple indexing of sorted files. 5 6 Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 20 Motivations 21 ----------- 22 23 When searching files, the cost to scan an entire file typically exceeds the 24 cost to navigate to the appropriate region and to scan a smaller portion of the 25 file, especially for large files. At the same time, exotic data structures 26 encouraging multiple seeks and reads are likely to waste time compared to just 27 performing a single read operation, even if that operation involves a larger 28 quantity of data, at least for storage with hard disk access characteristics. 29 30 Potential Improvements 31 ---------------------- 32 33 Ideally, the acquisition of records should be done more generally than just 34 reading lines, and the selection of matches should involve more than just 35 selecting the first column. 36 """ 37 38 import bisect 39 40 class TextFile: 41 42 "A wrapper around text files." 43 44 def __init__(self, f, keys=None, delimiter=None): 45 self.f = f 46 self.keys = keys or [0] 47 self.delimiter = delimiter 48 49 def seek(self, pos): 50 self.f.seek(pos) 51 52 def get_records(self): 53 return self.f.xreadlines() 54 55 def get_key(self, record): 56 values = record.split(self.delimiter) 57 return [values[key] for key in self.keys] 58 59 def index_file(f, interval): 60 61 """ 62 Index a file 'f', creating an index entry for a record after a given number, 63 defined by 'interval', have been read since the last entry. 64 """ 65 66 l = [] 67 pos = 0 68 69 current_key = None 70 start_pos = 0 71 72 for i, record in enumerate(f.get_records()): 73 key = f.get_key(record) 74 75 # Where duplicate keys are permitted, the first record employing the key 76 # must be available as an index entry. Otherwise, records preceding the 77 # one referenced by the entry may have the same key and be missed when 78 # seeking using the index. 79 80 if key != current_key: 81 current_key = key 82 start_pos = pos 83 84 if i % interval == 0: 85 l.append((current_key, start_pos)) 86 87 pos += len(record) 88 89 return l 90 91 def find_with_index(f, l, term): 92 93 """ 94 Find in file 'f', using the given index list 'l', the given 'term', 95 returning a record employing the term or None if no such record was found. 96 """ 97 98 i = bisect.bisect_left(l, (term, None)) 99 100 try: 101 found, pos = l[i] 102 except IndexError: 103 return None 104 105 # Since the index is more coarse than the underlying file, the bisect left 106 # operation will most likely point to an index entry for later records than 107 # the desired one. 108 109 if found > term: 110 i = max(0, i - 1) 111 found, pos = l[i] 112 113 f.seek(pos) 114 return find_in_file(f, term) 115 116 def find_in_file(f, term): 117 118 """ 119 Find in file 'f' the given 'term', returning a record employing the term or 120 None if no such record was found. 121 """ 122 123 for record in f.get_records(): 124 if term == f.get_key(record): 125 return record 126 127 return None 128 129 class Index: 130 131 "An index abstraction." 132 133 def __init__(self, entries, f): 134 self.entries = entries 135 self.f = f 136 137 def find(self, term): 138 return find_with_index(self.f, self.entries, term) 139 140 def groups(l, length): 141 142 "Split 'l' into groups of the given 'length'." 143 144 if length <= 0: 145 raise ValueError, "Groups must be greater than zero." 146 147 i = 0 148 g = [] 149 150 while i < len(l): 151 g.append(l[i:i+length]) 152 i += length 153 154 return g 155 156 # vim: tabstop=4 expandtab shiftwidth=4