1 #!/usr/bin/env python 2 3 """ 4 Simple indexing of sorted files. 5 6 Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 20 Motivations 21 ----------- 22 23 When searching files, the cost to scan an entire file typically exceeds the 24 cost to navigate to the appropriate region and to scan a smaller portion of the 25 file, especially for large files. At the same time, exotic data structures 26 encouraging multiple seeks and reads are likely to waste time compared to just 27 performing a single read operation, even if that operation involves a larger 28 quantity of data, at least for storage with hard disk access characteristics. 29 """ 30 31 from simplex.readers import * 32 from simplex.indexers import * 33 import bisect 34 35 def find_with_index(reader, get_key, l, term): 36 37 """ 38 In the resource whose 'reader' provides records, using a 'get_key' operation 39 to yield the key for such records, and using the given index list 'l', find 40 the given 'term', returning a record employing the term or None if no such 41 record was found. 42 """ 43 44 i = bisect.bisect_left(l, (term, None)) 45 46 try: 47 found, pos = l[i] 48 except IndexError: 49 return None 50 51 # Since the index is more coarse than the underlying file, the bisect left 52 # operation will most likely point to an index entry for later records than 53 # the desired one. 54 55 if found > term: 56 i = max(0, i - 1) 57 found, pos = l[i] 58 59 reader.seek(pos) 60 return find_in_file(reader, get_key, term) 61 62 def find_in_file(reader, get_key, term): 63 64 """ 65 In the resource whose 'reader' provides records, using a 'get_key' operation 66 to yield the key for such records, find the given 'term', returning a record 67 employing the term or None if no such record was found. 68 """ 69 70 for record in reader: 71 key = get_key(record) 72 if term == key: 73 return record 74 75 # Short-circuit failed searches. 76 77 elif term < key: 78 return None 79 80 return None 81 82 def groups(l, length): 83 84 "Split 'l' into groups of the given 'length'." 85 86 if length <= 0: 87 raise ValueError, "Groups must be greater than zero." 88 89 i = 0 90 g = [] 91 92 while i < len(l): 93 g.append(l[i:i+length]) 94 i += length 95 96 return g 97 98 # vim: tabstop=4 expandtab shiftwidth=4