paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@0 | 3 | """ |
paul@0 | 4 | Simple indexing of sorted files. |
paul@0 | 5 | |
paul@1 | 6 | Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> |
paul@0 | 7 | |
paul@0 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@0 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@0 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@0 | 11 | version. |
paul@0 | 12 | |
paul@0 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT ANY |
paul@0 | 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A |
paul@0 | 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. |
paul@0 | 16 | |
paul@0 | 17 | You should have received a copy of the GNU General Public License along |
paul@0 | 18 | with this program. If not, see <http://www.gnu.org/licenses/>. |
paul@0 | 19 | |
paul@0 | 20 | Motivations |
paul@0 | 21 | ----------- |
paul@0 | 22 | |
paul@0 | 23 | When searching files, the cost to scan an entire file typically exceeds the |
paul@0 | 24 | cost to navigate to the appropriate region and to scan a smaller portion of the |
paul@0 | 25 | file, especially for large files. At the same time, exotic data structures |
paul@0 | 26 | encouraging multiple seeks and reads are likely to waste time compared to just |
paul@0 | 27 | performing a single read operation, even if that operation involves a larger |
paul@0 | 28 | quantity of data, at least for storage with hard disk access characteristics. |
paul@0 | 29 | """ |
paul@0 | 30 | |
paul@0 | 31 | import bisect |
paul@0 | 32 | |
paul@1 | 33 | class TextFile: |
paul@1 | 34 | |
paul@1 | 35 | "A wrapper around text files." |
paul@1 | 36 | |
paul@3 | 37 | def __init__(self, f, keys=None, delimiter=None): |
paul@1 | 38 | self.f = f |
paul@3 | 39 | self.keys = keys or [0] |
paul@1 | 40 | self.delimiter = delimiter |
paul@1 | 41 | |
paul@1 | 42 | def seek(self, pos): |
paul@1 | 43 | self.f.seek(pos) |
paul@1 | 44 | |
paul@1 | 45 | def get_records(self): |
paul@1 | 46 | return self.f.xreadlines() |
paul@1 | 47 | |
paul@1 | 48 | def get_key(self, record): |
paul@3 | 49 | values = record.split(self.delimiter) |
paul@3 | 50 | return [values[key] for key in self.keys] |
paul@1 | 51 | |
paul@4 | 52 | def make_index(reader, accessor, interval): |
paul@0 | 53 | |
paul@0 | 54 | """ |
paul@4 | 55 | Index a resource whose 'reader' provides records and whose 'accessor' can |
paul@4 | 56 | yield the key for such records, creating an index entry for a record after a |
paul@4 | 57 | given number of records, defined by 'interval', have been read since the |
paul@4 | 58 | last entry was produced. |
paul@0 | 59 | """ |
paul@0 | 60 | |
paul@0 | 61 | l = [] |
paul@0 | 62 | pos = 0 |
paul@0 | 63 | |
paul@2 | 64 | current_key = None |
paul@2 | 65 | start_pos = 0 |
paul@2 | 66 | |
paul@4 | 67 | for i, record in enumerate(reader.get_records()): |
paul@4 | 68 | key = accessor.get_key(record) |
paul@2 | 69 | |
paul@2 | 70 | # Where duplicate keys are permitted, the first record employing the key |
paul@2 | 71 | # must be available as an index entry. Otherwise, records preceding the |
paul@2 | 72 | # one referenced by the entry may have the same key and be missed when |
paul@2 | 73 | # seeking using the index. |
paul@2 | 74 | |
paul@2 | 75 | if key != current_key: |
paul@2 | 76 | current_key = key |
paul@2 | 77 | start_pos = pos |
paul@2 | 78 | |
paul@0 | 79 | if i % interval == 0: |
paul@2 | 80 | l.append((current_key, start_pos)) |
paul@2 | 81 | |
paul@1 | 82 | pos += len(record) |
paul@0 | 83 | |
paul@0 | 84 | return l |
paul@0 | 85 | |
paul@4 | 86 | def find_with_index(reader, accessor, l, term): |
paul@0 | 87 | |
paul@0 | 88 | """ |
paul@4 | 89 | Find in the resource whose 'reader' provides records and whose 'accessor' |
paul@4 | 90 | can yield the key for such records, using the given index list 'l', the |
paul@4 | 91 | given 'term', returning a record employing the term or None if no such |
paul@4 | 92 | record was found. |
paul@0 | 93 | """ |
paul@0 | 94 | |
paul@0 | 95 | i = bisect.bisect_left(l, (term, None)) |
paul@3 | 96 | |
paul@3 | 97 | try: |
paul@3 | 98 | found, pos = l[i] |
paul@3 | 99 | except IndexError: |
paul@3 | 100 | return None |
paul@0 | 101 | |
paul@0 | 102 | # Since the index is more coarse than the underlying file, the bisect left |
paul@0 | 103 | # operation will most likely point to an index entry for later records than |
paul@0 | 104 | # the desired one. |
paul@0 | 105 | |
paul@0 | 106 | if found > term: |
paul@0 | 107 | i = max(0, i - 1) |
paul@0 | 108 | found, pos = l[i] |
paul@0 | 109 | |
paul@4 | 110 | reader.seek(pos) |
paul@4 | 111 | return find_in_file(reader, accessor, term) |
paul@0 | 112 | |
paul@4 | 113 | def find_in_file(reader, accessor, term): |
paul@0 | 114 | |
paul@0 | 115 | """ |
paul@4 | 116 | Find in the resource whose 'reader' provides records and whose 'accessor' |
paul@4 | 117 | can yield the key for such records, the given 'term', returning a record |
paul@4 | 118 | employing the term or None if no such record was found. |
paul@0 | 119 | """ |
paul@0 | 120 | |
paul@4 | 121 | for record in reader.get_records(): |
paul@4 | 122 | if term == accessor.get_key(record): |
paul@1 | 123 | return record |
paul@0 | 124 | |
paul@0 | 125 | return None |
paul@0 | 126 | |
paul@3 | 127 | def groups(l, length): |
paul@3 | 128 | |
paul@3 | 129 | "Split 'l' into groups of the given 'length'." |
paul@3 | 130 | |
paul@3 | 131 | if length <= 0: |
paul@3 | 132 | raise ValueError, "Groups must be greater than zero." |
paul@3 | 133 | |
paul@3 | 134 | i = 0 |
paul@3 | 135 | g = [] |
paul@3 | 136 | |
paul@3 | 137 | while i < len(l): |
paul@3 | 138 | g.append(l[i:i+length]) |
paul@3 | 139 | i += length |
paul@3 | 140 | |
paul@3 | 141 | return g |
paul@3 | 142 | |
paul@0 | 143 | # vim: tabstop=4 expandtab shiftwidth=4 |