simplex (annotate simplex.py in dc3d2ee38bae)

simplex

Annotated simplex.py

4:dc3d2ee38bae

2011-10-01

Paul Boddie

Separated the reader and accessor concerns so that different objects can provide records from resources and access to the details within records.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Simple indexing of sorted files.
paul@0	5
paul@1	6	Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU General Public License as published by the Free Software
paul@0	10	Foundation; either version 3 of the License, or (at your option) any later
paul@0	11	version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@0	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@0	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@0	16
paul@0	17	You should have received a copy of the GNU General Public License along
paul@0	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19
paul@0	20	Motivations
paul@0	21	-----------
paul@0	22
paul@0	23	When searching files, the cost to scan an entire file typically exceeds the
paul@0	24	cost to navigate to the appropriate region and to scan a smaller portion of the
paul@0	25	file, especially for large files. At the same time, exotic data structures
paul@0	26	encouraging multiple seeks and reads are likely to waste time compared to just
paul@0	27	performing a single read operation, even if that operation involves a larger
paul@0	28	quantity of data, at least for storage with hard disk access characteristics.
paul@0	29	"""
paul@0	30
paul@0	31	import bisect
paul@0	32
paul@1	33	class TextFile:
paul@1	34
paul@1	35	"A wrapper around text files."
paul@1	36
paul@3	37	def __init__(self, f, keys=None, delimiter=None):
paul@1	38	self.f = f
paul@3	39	self.keys = keys or [0]
paul@1	40	self.delimiter = delimiter
paul@1	41
paul@1	42	def seek(self, pos):
paul@1	43	self.f.seek(pos)
paul@1	44
paul@1	45	def get_records(self):
paul@1	46	return self.f.xreadlines()
paul@1	47
paul@1	48	def get_key(self, record):
paul@3	49	values = record.split(self.delimiter)
paul@3	50	return [values[key] for key in self.keys]
paul@1	51
paul@4	52	def make_index(reader, accessor, interval):
paul@0	53
paul@0	54	"""
paul@4	55	Index a resource whose 'reader' provides records and whose 'accessor' can
paul@4	56	yield the key for such records, creating an index entry for a record after a
paul@4	57	given number of records, defined by 'interval', have been read since the
paul@4	58	last entry was produced.
paul@0	59	"""
paul@0	60
paul@0	61	l = []
paul@0	62	pos = 0
paul@0	63
paul@2	64	current_key = None
paul@2	65	start_pos = 0
paul@2	66
paul@4	67	for i, record in enumerate(reader.get_records()):
paul@4	68	key = accessor.get_key(record)
paul@2	69
paul@2	70	# Where duplicate keys are permitted, the first record employing the key
paul@2	71	# must be available as an index entry. Otherwise, records preceding the
paul@2	72	# one referenced by the entry may have the same key and be missed when
paul@2	73	# seeking using the index.
paul@2	74
paul@2	75	if key != current_key:
paul@2	76	current_key = key
paul@2	77	start_pos = pos
paul@2	78
paul@0	79	if i % interval == 0:
paul@2	80	l.append((current_key, start_pos))
paul@2	81
paul@1	82	pos += len(record)
paul@0	83
paul@0	84	return l
paul@0	85
paul@4	86	def find_with_index(reader, accessor, l, term):
paul@0	87
paul@0	88	"""
paul@4	89	Find in the resource whose 'reader' provides records and whose 'accessor'
paul@4	90	can yield the key for such records, using the given index list 'l', the
paul@4	91	given 'term', returning a record employing the term or None if no such
paul@4	92	record was found.
paul@0	93	"""
paul@0	94
paul@0	95	i = bisect.bisect_left(l, (term, None))
paul@3	96
paul@3	97	try:
paul@3	98	found, pos = l[i]
paul@3	99	except IndexError:
paul@3	100	return None
paul@0	101
paul@0	102	# Since the index is more coarse than the underlying file, the bisect left
paul@0	103	# operation will most likely point to an index entry for later records than
paul@0	104	# the desired one.
paul@0	105
paul@0	106	if found > term:
paul@0	107	i = max(0, i - 1)
paul@0	108	found, pos = l[i]
paul@0	109
paul@4	110	reader.seek(pos)
paul@4	111	return find_in_file(reader, accessor, term)
paul@0	112
paul@4	113	def find_in_file(reader, accessor, term):
paul@0	114
paul@0	115	"""
paul@4	116	Find in the resource whose 'reader' provides records and whose 'accessor'
paul@4	117	can yield the key for such records, the given 'term', returning a record
paul@4	118	employing the term or None if no such record was found.
paul@0	119	"""
paul@0	120
paul@4	121	for record in reader.get_records():
paul@4	122	if term == accessor.get_key(record):
paul@1	123	return record
paul@0	124
paul@0	125	return None
paul@0	126
paul@3	127	def groups(l, length):
paul@3	128
paul@3	129	"Split 'l' into groups of the given 'length'."
paul@3	130
paul@3	131	if length <= 0:
paul@3	132	raise ValueError, "Groups must be greater than zero."
paul@3	133
paul@3	134	i = 0
paul@3	135	g = []
paul@3	136
paul@3	137	while i < len(l):
paul@3	138	g.append(l[i:i+length])
paul@3	139	i += length
paul@3	140
paul@3	141	return g
paul@3	142
paul@0	143	# vim: tabstop=4 expandtab shiftwidth=4