1 #!/usr/bin/env python 2 3 """ 4 Simple indexing of sorted files. 5 6 Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 20 Motivations 21 ----------- 22 23 When searching files, the cost to scan an entire file typically exceeds the 24 cost to navigate to the appropriate region and to scan a smaller portion of the 25 file, especially for large files. At the same time, exotic data structures 26 encouraging multiple seeks and reads are likely to waste time compared to just 27 performing a single read operation, even if that operation involves a larger 28 quantity of data, at least for storage with hard disk access characteristics. 29 """ 30 31 import bisect 32 33 class TextFile: 34 35 "A wrapper around text files." 36 37 def __init__(self, f): 38 self.f = f 39 40 def seek(self, pos): 41 self.f.seek(pos) 42 43 def get_records(self): 44 return self.f.xreadlines() 45 46 class DelimitedRecord: 47 48 "An accessor using a delimiter to split a record." 49 50 def __init__(self, keys=None, delimiter=None): 51 self.keys = keys or [0] 52 self.delimiter = delimiter 53 54 def get_key(self, record): 55 values = record.split(self.delimiter) 56 return [values[key] for key in self.keys] 57 58 def make_index(reader, accessor, interval): 59 60 """ 61 Index a resource whose 'reader' provides records and whose 'accessor' can 62 yield the key for such records, creating an index entry for a record after a 63 given number of records, defined by 'interval', have been read since the 64 last entry was produced. 65 """ 66 67 l = [] 68 pos = 0 69 70 current_key = None 71 start_pos = 0 72 73 for i, record in enumerate(reader.get_records()): 74 key = accessor.get_key(record) 75 76 # Where duplicate keys are permitted, the first record employing the key 77 # must be available as an index entry. Otherwise, records preceding the 78 # one referenced by the entry may have the same key and be missed when 79 # seeking using the index. 80 81 if key != current_key: 82 current_key = key 83 start_pos = pos 84 85 if i % interval == 0: 86 l.append((current_key, start_pos)) 87 88 pos += len(record) 89 90 return l 91 92 def find_with_index(reader, accessor, l, term): 93 94 """ 95 Find in the resource whose 'reader' provides records and whose 'accessor' 96 can yield the key for such records, using the given index list 'l', the 97 given 'term', returning a record employing the term or None if no such 98 record was found. 99 """ 100 101 i = bisect.bisect_left(l, (term, None)) 102 103 try: 104 found, pos = l[i] 105 except IndexError: 106 return None 107 108 # Since the index is more coarse than the underlying file, the bisect left 109 # operation will most likely point to an index entry for later records than 110 # the desired one. 111 112 if found > term: 113 i = max(0, i - 1) 114 found, pos = l[i] 115 116 reader.seek(pos) 117 return find_in_file(reader, accessor, term) 118 119 def find_in_file(reader, accessor, term): 120 121 """ 122 Find in the resource whose 'reader' provides records and whose 'accessor' 123 can yield the key for such records, the given 'term', returning a record 124 employing the term or None if no such record was found. 125 """ 126 127 for record in reader.get_records(): 128 if term == accessor.get_key(record): 129 return record 130 131 return None 132 133 def groups(l, length): 134 135 "Split 'l' into groups of the given 'length'." 136 137 if length <= 0: 138 raise ValueError, "Groups must be greater than zero." 139 140 i = 0 141 g = [] 142 143 while i < len(l): 144 g.append(l[i:i+length]) 145 i += length 146 147 return g 148 149 # vim: tabstop=4 expandtab shiftwidth=4