paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@0 | 3 | """ |
paul@0 | 4 | Simple indexing of sorted files. |
paul@0 | 5 | |
paul@1 | 6 | Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> |
paul@0 | 7 | |
paul@0 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@0 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@0 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@0 | 11 | version. |
paul@0 | 12 | |
paul@0 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT ANY |
paul@0 | 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A |
paul@0 | 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. |
paul@0 | 16 | |
paul@0 | 17 | You should have received a copy of the GNU General Public License along |
paul@0 | 18 | with this program. If not, see <http://www.gnu.org/licenses/>. |
paul@0 | 19 | |
paul@0 | 20 | Motivations |
paul@0 | 21 | ----------- |
paul@0 | 22 | |
paul@0 | 23 | When searching files, the cost to scan an entire file typically exceeds the |
paul@0 | 24 | cost to navigate to the appropriate region and to scan a smaller portion of the |
paul@0 | 25 | file, especially for large files. At the same time, exotic data structures |
paul@0 | 26 | encouraging multiple seeks and reads are likely to waste time compared to just |
paul@0 | 27 | performing a single read operation, even if that operation involves a larger |
paul@0 | 28 | quantity of data, at least for storage with hard disk access characteristics. |
paul@0 | 29 | """ |
paul@0 | 30 | |
paul@6 | 31 | from simplex.readers import * |
paul@0 | 32 | import bisect |
paul@0 | 33 | |
paul@17 | 34 | def make_index(reader, get_key, interval): |
paul@0 | 35 | |
paul@0 | 36 | """ |
paul@17 | 37 | Index a resource whose 'reader' provides records, using a 'get_key' |
paul@17 | 38 | operation to yield the key for such records, creating an index entry for a |
paul@17 | 39 | record after a given number of records, defined by 'interval', have been |
paul@17 | 40 | read since the last entry was produced. |
paul@0 | 41 | """ |
paul@0 | 42 | |
paul@0 | 43 | l = [] |
paul@0 | 44 | pos = 0 |
paul@0 | 45 | |
paul@2 | 46 | current_key = None |
paul@2 | 47 | start_pos = 0 |
paul@2 | 48 | |
paul@16 | 49 | for i, record in enumerate(reader): |
paul@17 | 50 | key = get_key(record) |
paul@2 | 51 | |
paul@2 | 52 | # Where duplicate keys are permitted, the first record employing the key |
paul@2 | 53 | # must be available as an index entry. Otherwise, records preceding the |
paul@2 | 54 | # one referenced by the entry may have the same key and be missed when |
paul@2 | 55 | # seeking using the index. |
paul@2 | 56 | |
paul@2 | 57 | if key != current_key: |
paul@2 | 58 | current_key = key |
paul@2 | 59 | start_pos = pos |
paul@2 | 60 | |
paul@0 | 61 | if i % interval == 0: |
paul@2 | 62 | l.append((current_key, start_pos)) |
paul@2 | 63 | |
paul@1 | 64 | pos += len(record) |
paul@0 | 65 | |
paul@0 | 66 | return l |
paul@0 | 67 | |
paul@17 | 68 | def find_with_index(reader, get_key, l, term): |
paul@0 | 69 | |
paul@0 | 70 | """ |
paul@17 | 71 | In the resource whose 'reader' provides records, using a 'get_key' operation |
paul@17 | 72 | to yield the key for such records, and using the given index list 'l', find |
paul@17 | 73 | the given 'term', returning a record employing the term or None if no such |
paul@4 | 74 | record was found. |
paul@0 | 75 | """ |
paul@0 | 76 | |
paul@0 | 77 | i = bisect.bisect_left(l, (term, None)) |
paul@3 | 78 | |
paul@3 | 79 | try: |
paul@3 | 80 | found, pos = l[i] |
paul@3 | 81 | except IndexError: |
paul@3 | 82 | return None |
paul@0 | 83 | |
paul@0 | 84 | # Since the index is more coarse than the underlying file, the bisect left |
paul@0 | 85 | # operation will most likely point to an index entry for later records than |
paul@0 | 86 | # the desired one. |
paul@0 | 87 | |
paul@0 | 88 | if found > term: |
paul@0 | 89 | i = max(0, i - 1) |
paul@0 | 90 | found, pos = l[i] |
paul@0 | 91 | |
paul@4 | 92 | reader.seek(pos) |
paul@17 | 93 | return find_in_file(reader, get_key, term) |
paul@0 | 94 | |
paul@17 | 95 | def find_in_file(reader, get_key, term): |
paul@0 | 96 | |
paul@0 | 97 | """ |
paul@17 | 98 | In the resource whose 'reader' provides records, using a 'get_key' operation |
paul@17 | 99 | to yield the key for such records, find the given 'term', returning a record |
paul@4 | 100 | employing the term or None if no such record was found. |
paul@0 | 101 | """ |
paul@0 | 102 | |
paul@16 | 103 | for record in reader: |
paul@17 | 104 | key = get_key(record) |
paul@7 | 105 | if term == key: |
paul@1 | 106 | return record |
paul@0 | 107 | |
paul@7 | 108 | # Short-circuit failed searches. |
paul@7 | 109 | |
paul@7 | 110 | elif term < key: |
paul@7 | 111 | return None |
paul@7 | 112 | |
paul@0 | 113 | return None |
paul@0 | 114 | |
paul@3 | 115 | def groups(l, length): |
paul@3 | 116 | |
paul@3 | 117 | "Split 'l' into groups of the given 'length'." |
paul@3 | 118 | |
paul@3 | 119 | if length <= 0: |
paul@3 | 120 | raise ValueError, "Groups must be greater than zero." |
paul@3 | 121 | |
paul@3 | 122 | i = 0 |
paul@3 | 123 | g = [] |
paul@3 | 124 | |
paul@3 | 125 | while i < len(l): |
paul@3 | 126 | g.append(l[i:i+length]) |
paul@3 | 127 | i += length |
paul@3 | 128 | |
paul@3 | 129 | return g |
paul@3 | 130 | |
paul@0 | 131 | # vim: tabstop=4 expandtab shiftwidth=4 |