simplex (annotate simplex/__init_

simplex

Annotated simplex/init.py

17:294a658846b6

2011-10-02

Paul Boddie

Changed the index-related functions to use a get_key callable instead of an object providing a get_key method.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Simple indexing of sorted files.
paul@0	5
paul@1	6	Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU General Public License as published by the Free Software
paul@0	10	Foundation; either version 3 of the License, or (at your option) any later
paul@0	11	version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@0	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@0	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@0	16
paul@0	17	You should have received a copy of the GNU General Public License along
paul@0	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19
paul@0	20	Motivations
paul@0	21	-----------
paul@0	22
paul@0	23	When searching files, the cost to scan an entire file typically exceeds the
paul@0	24	cost to navigate to the appropriate region and to scan a smaller portion of the
paul@0	25	file, especially for large files. At the same time, exotic data structures
paul@0	26	encouraging multiple seeks and reads are likely to waste time compared to just
paul@0	27	performing a single read operation, even if that operation involves a larger
paul@0	28	quantity of data, at least for storage with hard disk access characteristics.
paul@0	29	"""
paul@0	30
paul@6	31	from simplex.readers import *
paul@0	32	import bisect
paul@0	33
paul@17	34	def make_index(reader, get_key, interval):
paul@0	35
paul@0	36	"""
paul@17	37	Index a resource whose 'reader' provides records, using a 'get_key'
paul@17	38	operation to yield the key for such records, creating an index entry for a
paul@17	39	record after a given number of records, defined by 'interval', have been
paul@17	40	read since the last entry was produced.
paul@0	41	"""
paul@0	42
paul@0	43	l = []
paul@0	44	pos = 0
paul@0	45
paul@2	46	current_key = None
paul@2	47	start_pos = 0
paul@2	48
paul@16	49	for i, record in enumerate(reader):
paul@17	50	key = get_key(record)
paul@2	51
paul@2	52	# Where duplicate keys are permitted, the first record employing the key
paul@2	53	# must be available as an index entry. Otherwise, records preceding the
paul@2	54	# one referenced by the entry may have the same key and be missed when
paul@2	55	# seeking using the index.
paul@2	56
paul@2	57	if key != current_key:
paul@2	58	current_key = key
paul@2	59	start_pos = pos
paul@2	60
paul@0	61	if i % interval == 0:
paul@2	62	l.append((current_key, start_pos))
paul@2	63
paul@1	64	pos += len(record)
paul@0	65
paul@0	66	return l
paul@0	67
paul@17	68	def find_with_index(reader, get_key, l, term):
paul@0	69
paul@0	70	"""
paul@17	71	In the resource whose 'reader' provides records, using a 'get_key' operation
paul@17	72	to yield the key for such records, and using the given index list 'l', find
paul@17	73	the given 'term', returning a record employing the term or None if no such
paul@4	74	record was found.
paul@0	75	"""
paul@0	76
paul@0	77	i = bisect.bisect_left(l, (term, None))
paul@3	78
paul@3	79	try:
paul@3	80	found, pos = l[i]
paul@3	81	except IndexError:
paul@3	82	return None
paul@0	83
paul@0	84	# Since the index is more coarse than the underlying file, the bisect left
paul@0	85	# operation will most likely point to an index entry for later records than
paul@0	86	# the desired one.
paul@0	87
paul@0	88	if found > term:
paul@0	89	i = max(0, i - 1)
paul@0	90	found, pos = l[i]
paul@0	91
paul@4	92	reader.seek(pos)
paul@17	93	return find_in_file(reader, get_key, term)
paul@0	94
paul@17	95	def find_in_file(reader, get_key, term):
paul@0	96
paul@0	97	"""
paul@17	98	In the resource whose 'reader' provides records, using a 'get_key' operation
paul@17	99	to yield the key for such records, find the given 'term', returning a record
paul@4	100	employing the term or None if no such record was found.
paul@0	101	"""
paul@0	102
paul@16	103	for record in reader:
paul@17	104	key = get_key(record)
paul@7	105	if term == key:
paul@1	106	return record
paul@0	107
paul@7	108	# Short-circuit failed searches.
paul@7	109
paul@7	110	elif term < key:
paul@7	111	return None
paul@7	112
paul@0	113	return None
paul@0	114
paul@3	115	def groups(l, length):
paul@3	116
paul@3	117	"Split 'l' into groups of the given 'length'."
paul@3	118
paul@3	119	if length <= 0:
paul@3	120	raise ValueError, "Groups must be greater than zero."
paul@3	121
paul@3	122	i = 0
paul@3	123	g = []
paul@3	124
paul@3	125	while i < len(l):
paul@3	126	g.append(l[i:i+length])
paul@3	127	i += length
paul@3	128
paul@3	129	return g
paul@3	130
paul@0	131	# vim: tabstop=4 expandtab shiftwidth=4

simplex

Annotated simplex/__init__.py

Annotated simplex/init.py