1 #!/usr/bin/env python 2 3 """ 4 Simple indexing of sorted files. 5 6 Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 20 Motivations 21 ----------- 22 23 When searching files, the cost to scan an entire file typically exceeds the 24 cost to navigate to the appropriate region and to scan a smaller portion of the 25 file, especially for large files. At the same time, exotic data structures 26 encouraging multiple seeks and reads are likely to waste time compared to just 27 performing a single read operation, even if that operation involves a larger 28 quantity of data, at least for storage with hard disk access characteristics. 29 """ 30 31 from simplex.readers import * 32 import bisect 33 34 def make_index(reader, interval): 35 36 """ 37 Index a resource whose 'reader' provides records and keys, creating an index 38 entry for a record after a given number of records, defined by 'interval', 39 have been read since the last entry was produced. 40 """ 41 42 l = [] 43 pos = 0 44 45 current_key = None 46 start_pos = 0 47 48 for i, (key, record) in enumerate(reader.get_records()): 49 50 # Where duplicate keys are permitted, the first record employing the key 51 # must be available as an index entry. Otherwise, records preceding the 52 # one referenced by the entry may have the same key and be missed when 53 # seeking using the index. 54 55 if key != current_key: 56 current_key = key 57 start_pos = pos 58 59 if i % interval == 0: 60 l.append((current_key, start_pos)) 61 62 pos += len(record) 63 64 return l 65 66 def find_with_index(reader, l, term): 67 68 """ 69 In the resource whose 'reader' provides records and keys, using the given 70 index list 'l', find the given 'term', returning a record employing the term 71 or None if no such record was found. 72 """ 73 74 i = bisect.bisect_left(l, (term, None)) 75 76 try: 77 found, pos = l[i] 78 except IndexError: 79 return None 80 81 # Since the index is more coarse than the underlying file, the bisect left 82 # operation will most likely point to an index entry for later records than 83 # the desired one. 84 85 if found > term: 86 i = max(0, i - 1) 87 found, pos = l[i] 88 89 reader.seek(pos) 90 return find_in_file(reader, term) 91 92 def find_in_file(reader, term): 93 94 """ 95 In the resource whose 'reader' provides records and keys, find the given 96 'term', returning a record employing the term or None if no such record was 97 found. 98 """ 99 100 for key, record in reader.get_records(): 101 if term == key: 102 return record 103 104 # Short-circuit failed searches. 105 106 elif term < key: 107 return None 108 109 return None 110 111 def groups(l, length): 112 113 "Split 'l' into groups of the given 'length'." 114 115 if length <= 0: 116 raise ValueError, "Groups must be greater than zero." 117 118 i = 0 119 g = [] 120 121 while i < len(l): 122 g.append(l[i:i+length]) 123 i += length 124 125 return g 126 127 # vim: tabstop=4 expandtab shiftwidth=4