1 #!/usr/bin/env python 2 3 """ 4 Simple indexing of sorted files. 5 6 Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 20 Motivations 21 ----------- 22 23 When searching files, the cost to scan an entire file typically exceeds the 24 cost to navigate to the appropriate region and to scan a smaller portion of the 25 file, especially for large files. At the same time, exotic data structures 26 encouraging multiple seeks and reads are likely to waste time compared to just 27 performing a single read operation, even if that operation involves a larger 28 quantity of data, at least for storage with hard disk access characteristics. 29 """ 30 31 from simplex.readers import * 32 from simplex.indexers import * 33 import bisect 34 35 class IndexedFile: 36 37 "An indexed file referring to a sorted file." 38 39 def __init__(self, reader, index, get_key): 40 self.reader = reader 41 self.index = index 42 self.get_key = get_key 43 44 def find(self, term): 45 return find_with_index(self.reader, self.get_key, self.index, term) 46 47 def find_with_index(reader, get_key, l, term): 48 49 """ 50 In the resource whose 'reader' provides records, using a 'get_key' operation 51 to yield the key for such records, and using the given index list 'l', find 52 the given 'term', returning all records employing the term or an empty list 53 if no such records were found. 54 """ 55 56 i = bisect.bisect_left(l, (term, None)) 57 58 try: 59 found, pos = l[i] 60 except IndexError: 61 return [] 62 63 # Since the index is more coarse than the underlying file, the bisect left 64 # operation will most likely point to an index entry for later records than 65 # the desired one. 66 67 if found > term: 68 i = max(0, i - 1) 69 found, pos = l[i] 70 71 reader.seek(pos) 72 return find_in_file(reader, get_key, term) 73 74 def find_in_file(reader, get_key, term): 75 76 """ 77 In the resource whose 'reader' provides records, using a 'get_key' operation 78 to yield the key for such records, find the given 'term', returning all 79 records employing the term or an empty list if no such records were found. 80 """ 81 82 results = [] 83 84 for record in reader: 85 key = get_key(record) 86 if term == key: 87 results.append(record) 88 89 # Short-circuit failed searches. 90 91 elif term < key: 92 break 93 94 return results 95 96 def groups(l, length): 97 98 "Split 'l' into groups of the given 'length'." 99 100 if length <= 0: 101 raise ValueError, "Groups must be greater than zero." 102 103 i = 0 104 g = [] 105 106 while i < len(l): 107 g.append(l[i:i+length]) 108 i += length 109 110 return g 111 112 # vim: tabstop=4 expandtab shiftwidth=4