simplex (annotate simplex/__init_

simplex

Annotated simplex/init.py

8:6e1d369de4b3

2011-10-01

Paul Boddie

Incorporated usage of accessors into readers so that they can provide records and keys directly.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Simple indexing of sorted files.
paul@0	5
paul@1	6	Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU General Public License as published by the Free Software
paul@0	10	Foundation; either version 3 of the License, or (at your option) any later
paul@0	11	version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@0	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@0	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@0	16
paul@0	17	You should have received a copy of the GNU General Public License along
paul@0	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19
paul@0	20	Motivations
paul@0	21	-----------
paul@0	22
paul@0	23	When searching files, the cost to scan an entire file typically exceeds the
paul@0	24	cost to navigate to the appropriate region and to scan a smaller portion of the
paul@0	25	file, especially for large files. At the same time, exotic data structures
paul@0	26	encouraging multiple seeks and reads are likely to waste time compared to just
paul@0	27	performing a single read operation, even if that operation involves a larger
paul@0	28	quantity of data, at least for storage with hard disk access characteristics.
paul@0	29	"""
paul@0	30
paul@6	31	from simplex.readers import *
paul@0	32	import bisect
paul@0	33
paul@8	34	def make_index(reader, interval):
paul@0	35
paul@0	36	"""
paul@8	37	Index a resource whose 'reader' provides records and keys, creating an index
paul@8	38	entry for a record after a given number of records, defined by 'interval',
paul@8	39	have been read since the last entry was produced.
paul@0	40	"""
paul@0	41
paul@0	42	l = []
paul@0	43	pos = 0
paul@0	44
paul@2	45	current_key = None
paul@2	46	start_pos = 0
paul@2	47
paul@8	48	for i, (key, record) in enumerate(reader.get_records()):
paul@2	49
paul@2	50	# Where duplicate keys are permitted, the first record employing the key
paul@2	51	# must be available as an index entry. Otherwise, records preceding the
paul@2	52	# one referenced by the entry may have the same key and be missed when
paul@2	53	# seeking using the index.
paul@2	54
paul@2	55	if key != current_key:
paul@2	56	current_key = key
paul@2	57	start_pos = pos
paul@2	58
paul@0	59	if i % interval == 0:
paul@2	60	l.append((current_key, start_pos))
paul@2	61
paul@1	62	pos += len(record)
paul@0	63
paul@0	64	return l
paul@0	65
paul@8	66	def find_with_index(reader, l, term):
paul@0	67
paul@0	68	"""
paul@8	69	In the resource whose 'reader' provides records and keys, using the given
paul@8	70	index list 'l', find the given 'term', returning a record employing the term
paul@8	71	or None if no such record was found.
paul@0	72	"""
paul@0	73
paul@0	74	i = bisect.bisect_left(l, (term, None))
paul@3	75
paul@3	76	try:
paul@3	77	found, pos = l[i]
paul@3	78	except IndexError:
paul@3	79	return None
paul@0	80
paul@0	81	# Since the index is more coarse than the underlying file, the bisect left
paul@0	82	# operation will most likely point to an index entry for later records than
paul@0	83	# the desired one.
paul@0	84
paul@0	85	if found > term:
paul@0	86	i = max(0, i - 1)
paul@0	87	found, pos = l[i]
paul@0	88
paul@4	89	reader.seek(pos)
paul@8	90	return find_in_file(reader, term)
paul@0	91
paul@8	92	def find_in_file(reader, term):
paul@0	93
paul@0	94	"""
paul@8	95	In the resource whose 'reader' provides records and keys, find the given
paul@8	96	'term', returning a record employing the term or None if no such record was
paul@8	97	found.
paul@0	98	"""
paul@0	99
paul@8	100	for key, record in reader.get_records():
paul@7	101	if term == key:
paul@1	102	return record
paul@0	103
paul@7	104	# Short-circuit failed searches.
paul@7	105
paul@7	106	elif term < key:
paul@7	107	return None
paul@7	108
paul@0	109	return None
paul@0	110
paul@3	111	def groups(l, length):
paul@3	112
paul@3	113	"Split 'l' into groups of the given 'length'."
paul@3	114
paul@3	115	if length <= 0:
paul@3	116	raise ValueError, "Groups must be greater than zero."
paul@3	117
paul@3	118	i = 0
paul@3	119	g = []
paul@3	120
paul@3	121	while i < len(l):
paul@3	122	g.append(l[i:i+length])
paul@3	123	i += length
paul@3	124
paul@3	125	return g
paul@3	126
paul@0	127	# vim: tabstop=4 expandtab shiftwidth=4

simplex

Annotated simplex/__init__.py

Annotated simplex/init.py