1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/simplex/__init__.py Sat Oct 01 15:07:17 2011 +0200
1.3 @@ -0,0 +1,125 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Simple indexing of sorted files.
1.8 +
1.9 +Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 +
1.20 +You should have received a copy of the GNU General Public License along
1.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 +
1.23 +Motivations
1.24 +-----------
1.25 +
1.26 +When searching files, the cost to scan an entire file typically exceeds the
1.27 +cost to navigate to the appropriate region and to scan a smaller portion of the
1.28 +file, especially for large files. At the same time, exotic data structures
1.29 +encouraging multiple seeks and reads are likely to waste time compared to just
1.30 +performing a single read operation, even if that operation involves a larger
1.31 +quantity of data, at least for storage with hard disk access characteristics.
1.32 +"""
1.33 +
1.34 +from simplex.readers import *
1.35 +import bisect
1.36 +
1.37 +def make_index(reader, accessor, interval):
1.38 +
1.39 + """
1.40 + Index a resource whose 'reader' provides records and whose 'accessor' can
1.41 + yield the key for such records, creating an index entry for a record after a
1.42 + given number of records, defined by 'interval', have been read since the
1.43 + last entry was produced.
1.44 + """
1.45 +
1.46 + l = []
1.47 + pos = 0
1.48 +
1.49 + current_key = None
1.50 + start_pos = 0
1.51 +
1.52 + for i, record in enumerate(reader.get_records()):
1.53 + key = accessor.get_key(record)
1.54 +
1.55 + # Where duplicate keys are permitted, the first record employing the key
1.56 + # must be available as an index entry. Otherwise, records preceding the
1.57 + # one referenced by the entry may have the same key and be missed when
1.58 + # seeking using the index.
1.59 +
1.60 + if key != current_key:
1.61 + current_key = key
1.62 + start_pos = pos
1.63 +
1.64 + if i % interval == 0:
1.65 + l.append((current_key, start_pos))
1.66 +
1.67 + pos += len(record)
1.68 +
1.69 + return l
1.70 +
1.71 +def find_with_index(reader, accessor, l, term):
1.72 +
1.73 + """
1.74 + Find in the resource whose 'reader' provides records and whose 'accessor'
1.75 + can yield the key for such records, using the given index list 'l', the
1.76 + given 'term', returning a record employing the term or None if no such
1.77 + record was found.
1.78 + """
1.79 +
1.80 + i = bisect.bisect_left(l, (term, None))
1.81 +
1.82 + try:
1.83 + found, pos = l[i]
1.84 + except IndexError:
1.85 + return None
1.86 +
1.87 + # Since the index is more coarse than the underlying file, the bisect left
1.88 + # operation will most likely point to an index entry for later records than
1.89 + # the desired one.
1.90 +
1.91 + if found > term:
1.92 + i = max(0, i - 1)
1.93 + found, pos = l[i]
1.94 +
1.95 + reader.seek(pos)
1.96 + return find_in_file(reader, accessor, term)
1.97 +
1.98 +def find_in_file(reader, accessor, term):
1.99 +
1.100 + """
1.101 + Find in the resource whose 'reader' provides records and whose 'accessor'
1.102 + can yield the key for such records, the given 'term', returning a record
1.103 + employing the term or None if no such record was found.
1.104 + """
1.105 +
1.106 + for record in reader.get_records():
1.107 + if term == accessor.get_key(record):
1.108 + return record
1.109 +
1.110 + return None
1.111 +
1.112 +def groups(l, length):
1.113 +
1.114 + "Split 'l' into groups of the given 'length'."
1.115 +
1.116 + if length <= 0:
1.117 + raise ValueError, "Groups must be greater than zero."
1.118 +
1.119 + i = 0
1.120 + g = []
1.121 +
1.122 + while i < len(l):
1.123 + g.append(l[i:i+length])
1.124 + i += length
1.125 +
1.126 + return g
1.127 +
1.128 +# vim: tabstop=4 expandtab shiftwidth=4