paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@0 | 3 | """ |
paul@0 | 4 | Simple indexing of sorted files. |
paul@0 | 5 | |
paul@1 | 6 | Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> |
paul@0 | 7 | |
paul@0 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@0 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@0 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@0 | 11 | version. |
paul@0 | 12 | |
paul@0 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT ANY |
paul@0 | 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A |
paul@0 | 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. |
paul@0 | 16 | |
paul@0 | 17 | You should have received a copy of the GNU General Public License along |
paul@0 | 18 | with this program. If not, see <http://www.gnu.org/licenses/>. |
paul@0 | 19 | |
paul@0 | 20 | Motivations |
paul@0 | 21 | ----------- |
paul@0 | 22 | |
paul@0 | 23 | When searching files, the cost to scan an entire file typically exceeds the |
paul@0 | 24 | cost to navigate to the appropriate region and to scan a smaller portion of the |
paul@0 | 25 | file, especially for large files. At the same time, exotic data structures |
paul@0 | 26 | encouraging multiple seeks and reads are likely to waste time compared to just |
paul@0 | 27 | performing a single read operation, even if that operation involves a larger |
paul@0 | 28 | quantity of data, at least for storage with hard disk access characteristics. |
paul@0 | 29 | """ |
paul@0 | 30 | |
paul@6 | 31 | from simplex.readers import * |
paul@0 | 32 | import bisect |
paul@0 | 33 | |
paul@8 | 34 | def make_index(reader, interval): |
paul@0 | 35 | |
paul@0 | 36 | """ |
paul@8 | 37 | Index a resource whose 'reader' provides records and keys, creating an index |
paul@8 | 38 | entry for a record after a given number of records, defined by 'interval', |
paul@8 | 39 | have been read since the last entry was produced. |
paul@0 | 40 | """ |
paul@0 | 41 | |
paul@0 | 42 | l = [] |
paul@0 | 43 | pos = 0 |
paul@0 | 44 | |
paul@2 | 45 | current_key = None |
paul@2 | 46 | start_pos = 0 |
paul@2 | 47 | |
paul@8 | 48 | for i, (key, record) in enumerate(reader.get_records()): |
paul@2 | 49 | |
paul@2 | 50 | # Where duplicate keys are permitted, the first record employing the key |
paul@2 | 51 | # must be available as an index entry. Otherwise, records preceding the |
paul@2 | 52 | # one referenced by the entry may have the same key and be missed when |
paul@2 | 53 | # seeking using the index. |
paul@2 | 54 | |
paul@2 | 55 | if key != current_key: |
paul@2 | 56 | current_key = key |
paul@2 | 57 | start_pos = pos |
paul@2 | 58 | |
paul@0 | 59 | if i % interval == 0: |
paul@2 | 60 | l.append((current_key, start_pos)) |
paul@2 | 61 | |
paul@1 | 62 | pos += len(record) |
paul@0 | 63 | |
paul@0 | 64 | return l |
paul@0 | 65 | |
paul@8 | 66 | def find_with_index(reader, l, term): |
paul@0 | 67 | |
paul@0 | 68 | """ |
paul@8 | 69 | In the resource whose 'reader' provides records and keys, using the given |
paul@8 | 70 | index list 'l', find the given 'term', returning a record employing the term |
paul@8 | 71 | or None if no such record was found. |
paul@0 | 72 | """ |
paul@0 | 73 | |
paul@0 | 74 | i = bisect.bisect_left(l, (term, None)) |
paul@3 | 75 | |
paul@3 | 76 | try: |
paul@3 | 77 | found, pos = l[i] |
paul@3 | 78 | except IndexError: |
paul@3 | 79 | return None |
paul@0 | 80 | |
paul@0 | 81 | # Since the index is more coarse than the underlying file, the bisect left |
paul@0 | 82 | # operation will most likely point to an index entry for later records than |
paul@0 | 83 | # the desired one. |
paul@0 | 84 | |
paul@0 | 85 | if found > term: |
paul@0 | 86 | i = max(0, i - 1) |
paul@0 | 87 | found, pos = l[i] |
paul@0 | 88 | |
paul@4 | 89 | reader.seek(pos) |
paul@8 | 90 | return find_in_file(reader, term) |
paul@0 | 91 | |
paul@8 | 92 | def find_in_file(reader, term): |
paul@0 | 93 | |
paul@0 | 94 | """ |
paul@8 | 95 | In the resource whose 'reader' provides records and keys, find the given |
paul@8 | 96 | 'term', returning a record employing the term or None if no such record was |
paul@8 | 97 | found. |
paul@0 | 98 | """ |
paul@0 | 99 | |
paul@8 | 100 | for key, record in reader.get_records(): |
paul@7 | 101 | if term == key: |
paul@1 | 102 | return record |
paul@0 | 103 | |
paul@7 | 104 | # Short-circuit failed searches. |
paul@7 | 105 | |
paul@7 | 106 | elif term < key: |
paul@7 | 107 | return None |
paul@7 | 108 | |
paul@0 | 109 | return None |
paul@0 | 110 | |
paul@3 | 111 | def groups(l, length): |
paul@3 | 112 | |
paul@3 | 113 | "Split 'l' into groups of the given 'length'." |
paul@3 | 114 | |
paul@3 | 115 | if length <= 0: |
paul@3 | 116 | raise ValueError, "Groups must be greater than zero." |
paul@3 | 117 | |
paul@3 | 118 | i = 0 |
paul@3 | 119 | g = [] |
paul@3 | 120 | |
paul@3 | 121 | while i < len(l): |
paul@3 | 122 | g.append(l[i:i+length]) |
paul@3 | 123 | i += length |
paul@3 | 124 | |
paul@3 | 125 | return g |
paul@3 | 126 | |
paul@0 | 127 | # vim: tabstop=4 expandtab shiftwidth=4 |