1 #!/usr/bin/env python 2 3 """ 4 Simple indexing of sorted files. 5 6 Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 20 Motivations 21 ----------- 22 23 When searching files, the cost to scan an entire file typically exceeds the 24 cost to navigate to the appropriate region and to scan a smaller portion of the 25 file, especially for large files. At the same time, exotic data structures 26 encouraging multiple seeks and reads are likely to waste time compared to just 27 performing a single read operation, even if that operation involves a larger 28 quantity of data, at least for storage with hard disk access characteristics. 29 """ 30 31 from simplex.readers import * 32 from simplex.iterators import * 33 from simplex.accessors import * 34 from simplex.state import * 35 36 import bisect 37 38 def make_index(reader, interval): 39 40 """ 41 Index a resource whose 'reader' provides records and keys, creating an index 42 entry for a record after a given number of records, defined by 'interval', 43 have been read since the last entry was produced. 44 """ 45 46 l = [] 47 pos = 0 48 49 current_key = None 50 start_pos = 0 51 52 for i, (key, record) in enumerate(reader): 53 54 # Where duplicate keys are permitted, the first record employing the key 55 # must be available as an index entry. Otherwise, records preceding the 56 # one referenced by the entry may have the same key and be missed when 57 # seeking using the index. 58 59 if key != current_key: 60 current_key = key 61 start_pos = pos 62 63 if i % interval == 0: 64 l.append((current_key, start_pos)) 65 66 pos += len(record) 67 68 return l 69 70 def find_with_index(reader, l, term): 71 72 """ 73 In the resource whose 'reader' provides records and keys, using the given 74 index list 'l', find the given 'term', returning a record employing the term 75 or None if no such record was found. 76 """ 77 78 i = bisect.bisect_left(l, (term, None)) 79 80 try: 81 found, pos = l[i] 82 except IndexError: 83 return None 84 85 # Since the index is more coarse than the underlying file, the bisect left 86 # operation will most likely point to an index entry for later records than 87 # the desired one. 88 89 if found > term: 90 i = max(0, i - 1) 91 found, pos = l[i] 92 93 reader.seek(pos) 94 return find_in_file(reader, term) 95 96 def find_in_file(reader, term): 97 98 """ 99 In the resource whose 'reader' provides records and keys, find the given 100 'term', returning a record employing the term or None if no such record was 101 found. 102 """ 103 104 for key, record in reader: 105 if term == key: 106 return record 107 108 # Short-circuit failed searches. 109 110 elif term < key: 111 return None 112 113 return None 114 115 def groups(l, length): 116 117 "Split 'l' into groups of the given 'length'." 118 119 if length <= 0: 120 raise ValueError, "Groups must be greater than zero." 121 122 i = 0 123 g = [] 124 125 while i < len(l): 126 g.append(l[i:i+length]) 127 i += length 128 129 return g 130 131 # vim: tabstop=4 expandtab shiftwidth=4