iixr (annotate iixr.py in 5ef1ed194593)

iixr

Annotated iixr.py

3:5ef1ed194593

2009-08-25

Paul Boddie

Added end-of-file detection. Added term dictionary support, including term dictionary index file reading and writing.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@2	21	from os.path import commonprefix # to find common string prefixes
paul@3	22	from bisect import bisect_right # to find terms in the dictionary index
paul@2	23
paul@0	24	# Foundation classes.
paul@0	25
class File:

    "A minimal wrapper holding an open file object."

    def __init__(self, f):

        "Wrap the file object 'f' and initialise any per-file state."

        self.f = f
        self.reset()

    def reset(self):

        "Reset per-file state. The base class keeps none, so nothing happens."

        return None

    def close(self):

        "Close the wrapped file object."

        self.f.close()
paul@0	39
class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding: one byte
        giving the number of value bytes, followed by the value bytes
        themselves, least significant first. Zero is written as a length of one
        followed by a single zero byte.

        Raise ValueError if 'number' is negative or needs more than 255 bytes.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Special case: one byte containing zero.

        if number == 0:
            self.f.write(chr(1) + chr(0))
            return

        # Collect the digits from least to most significant. A separate
        # variable is consumed so that 'number' is still intact for the error
        # message below.

        digits = []
        remaining = number

        while remaining != 0:
            digits.append(chr(remaining & 255))
            remaining = remaining >> 8

        # Too large numbers are not supported: the length must fit in a byte.

        if len(digits) > 255:
            raise ValueError("Number %r is too large." % number)

        self.f.write(chr(len(digits)) + "".join(digits))

    def write_unsigned_byte(self, number):

        """
        Write 'number' to the file using a single byte.

        Raise ValueError if 'number' is outside the range 0 to 255.
        """

        if not (0 <= number <= 255):
            raise ValueError("Number %r is out of range." % number)

        self.f.write(chr(number))

    def write_string(self, s):

        """
        Write 's' to the file, preceded by a single byte recording its length.

        Raise ValueError if 's' is longer than 255 characters.
        """

        length = len(s)

        if length > 255:
            raise ValueError("String %r is too long." % s)

        self.write_unsigned_byte(length)
        self.f.write(s)
paul@2	99
class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a number from the file: one byte giving the number of value bytes,
        followed by the value bytes themselves, least significant first.

        Raise EOFError if the file ends before the whole number has been read.
        """

        nbytes = self.read_unsigned_byte()
        data = self.f.read(nbytes)

        # A short read means the record was cut off. Report it in the same way
        # as read_unsigned_byte does rather than failing on a missing index.

        if len(data) < nbytes:
            raise EOFError

        # Add each byte to the number, shifting by its position.

        number = 0

        for i, c in enumerate(data):
            number += ord(c) << (8 * i)

        return number

    def read_unsigned_byte(self):

        """
        Read a number from the file, consuming a single byte.

        Raise EOFError if no more data is available.
        """

        s = self.f.read(1)
        if not s:
            raise EOFError

        return ord(s)

    def read_string(self):

        """
        Read a string from the file: one byte giving its length, followed by
        the string data.

        Raise EOFError if the file ends before the whole string has been read.
        """

        length = self.read_unsigned_byte()
        s = self.f.read(length)

        # Do not hand back a silently truncated string.

        if len(s) < length:
            raise EOFError

        return s
paul@2	142
paul@0	143	# Specific classes.
paul@0	144
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number so that the next delta starts at zero."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.

        The record consists of the document number as a delta from the
        previously written document number, the number of positions, and then
        each position as a delta from the one before it.

        Raise ValueError, before anything is written, if 'docnum' is less than
        the previous document number or if 'positions' is not in ascending
        order.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Check the positions up front: a negative delta would otherwise only
        # be rejected after part of the record had already been written.

        last = 0
        for position in positions:
            if position < last:
                raise ValueError("Position %r is less than previous position %r." % (position, last))
            last = position

        # Write the document number delta.

        self.write_number(docnum - self.last_docnum)

        # Write the number of positions.

        self.write_number(len(positions))

        # Write the position deltas.

        last = 0
        for position in positions:
            self.write_number(position - last)
            last = position

        self.last_docnum = docnum

    def write_all_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file, returning the offset at which they
        were stored.
        """

        # Reset the writer and record the current file offset.

        self.reset()
        offset = self.f.tell()

        # Write the number of documents.

        self.write_number(len(doc_positions))

        # Write the positions.

        for docnum, positions in doc_positions:
            self.write_positions(docnum, positions)

        return offset
paul@0	200
class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Start document number deltas from zero again."

        self.last_docnum = 0

    def read_positions(self):

        """
        Read one position record, returning a tuple of the document number and
        the list of positions for that document.
        """

        # The document number is stored as a delta from the previous one.

        self.last_docnum += self.read_number()

        # The position count comes next, followed by that many deltas which are
        # accumulated to recover the absolute positions.

        count = self.read_number()

        positions = []
        current = 0

        for _ in range(count):
            current += self.read_number()
            positions.append(current)

        return self.last_docnum, positions

    def read_all_positions(self, offset):

        """
        Seek to 'offset' in the file and read all position records stored
        there, returning a list of (document number, position list) tuples.
        """

        self.reset()
        self.f.seek(offset)

        # The number of documents precedes the records themselves.

        ndocuments = self.read_number()

        return [self.read_positions() for _ in range(ndocuments)]
paul@0	258
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Forget the previously written term and offset."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        information file. Return the offset after the term information was
        written to the file.

        The term is stored as the length of the prefix it shares with the
        previously written term followed by the remaining suffix; the offset is
        stored as a delta from the previously written offset.

        Raise ValueError, before anything is written, if 'term' is longer than
        255 characters or 'offset' is less than the previous offset.
        """

        # Too long terms are not currently supported. This limit also keeps the
        # common prefix length within a single byte.

        if len(term) > 255:
            raise ValueError("Term %r is too long." % term)

        # Reject a negative offset delta up front: it would otherwise only be
        # detected after the prefix and suffix had already been written.

        if offset < self.last_offset:
            raise ValueError("Offset %r is less than previous offset %r." % (offset, self.last_offset))

        # Write the prefix length and term suffix.

        common = len(commonprefix([self.last_term, term]))

        self.write_unsigned_byte(common)
        self.write_string(term[common:])

        # Write the offset delta.

        self.write_number(offset - self.last_offset)

        self.last_term = term
        self.last_offset = offset

        return self.f.tell()
paul@3	296
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Forget the previously read term and offset."

        self.last_term, self.last_offset = "", 0

    def read_term(self):

        """
        Read the next entry from the term information file, returning a tuple
        of the term and its position file offset.
        """

        # The term is rebuilt from the leading part of the previous term plus
        # the stored suffix.

        prefix_length = self.read_unsigned_byte()
        tail = self.read_string()
        term = self.last_term[:prefix_length] + tail
        self.last_term = term

        # The offset is stored as a delta from the previous offset.

        offset = self.last_offset + self.read_number()
        self.last_offset = offset

        return term, offset

    def go_to_term(self, term, offset, info_offset):

        """
        Seek to 'info_offset' in the file and continue reading from there as if
        the entry for 'term' having 'offset' had just been read.
        """

        self.f.seek(info_offset)
        self.last_term, self.last_offset = term, offset
paul@3	331
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Forget the previously written term, offset and information offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, info_offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        dictionary index file, along with the 'info_offset' in the term
        information file. Return the offset after the entry was written to the
        index file, as TermWriter.write_term does for its own file.

        Raise ValueError, before anything is written, if 'info_offset' is less
        than the previously written information offset.
        """

        # Reject a negative delta up front: it would otherwise only be detected
        # after the term and offset had already been written.

        if info_offset < self.last_info_offset:
            raise ValueError("Offset %r is less than previous offset %r." % (info_offset, self.last_info_offset))

        TermWriter.write_term(self, term, offset)

        # Write the information file offset delta.

        self.write_number(info_offset - self.last_info_offset)
        self.last_info_offset = info_offset

        return self.f.tell()
paul@3	354
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Forget the previously read term, offset and information offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read the next entry from the term dictionary index file, returning a
        tuple of the term, its position file offset, and its term information
        file offset.
        """

        found, position_offset = TermReader.read_term(self)

        # The information file offset is stored as a delta after the entry.

        info_offset = self.last_info_offset + self.read_number()
        self.last_info_offset = info_offset

        return found, position_offset, info_offset
paul@3	377
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, interval):

        """
        Initialise the writer with the given 'info_writer' for the term
        information file and 'index_writer' for the term dictionary index file.
        Every 'interval' entries, starting with the first, an entry is also
        written to the index.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.interval = interval
        self.entry = 0

    def write_term(self, term, offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        information file and optionally to the index, making a dictionary entry.
        """

        # The information writer returns the offset following the new entry,
        # which is what the index records alongside the term.

        info_offset = self.info_writer.write_term(term, offset)

        if self.entry % self.interval == 0:
            self.index_writer.write_term(term, offset, info_offset)

        self.entry += 1

    def close(self):

        "Close both writers, even if closing the first one fails."

        try:
            self.info_writer.close()
        finally:
            self.index_writer.close()
paul@3	405
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader):

        """
        Initialise the reader with the given 'info_reader' for the term
        information file and 'index_reader' for the term dictionary index file.
        The whole index is read into memory immediately.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader

        # Read index entries until the index reader signals end-of-file.

        self.terms = []
        try:
            while 1:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # Large numbers for ordering purposes: the offsets of the last index
        # entry are used when building search keys in find. An empty index has
        # no last entry, so zero is used instead.

        if self.terms:
            self.max_offset = self.terms[-1][1]
            self.max_info_offset = self.terms[-1][2]
        else:
            self.max_offset = 0
            self.max_info_offset = 0

    def find(self, term):

        """
        Find the position file offset of 'term' from the term dictionary,
        returning None if the term is not present.
        """

        # Get the entry position providing the term or one preceding it.

        i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1

        # No index entry is at or before the term.

        if i == -1:
            return None

        found_term, offset, info_offset = self.terms[i]

        # Where the term is found immediately, return the offset.

        if term == found_term:
            return offset

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term, stopping at the first term that is not
        # less than it or at the end of the file.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset = self.info_reader.read_term()
        except EOFError:
            pass

        # If the term is found, return the offset.

        if term == found_term:
            return offset

        return None

    def close(self):

        "Close both readers, even if closing the first one fails."

        try:
            self.info_reader.close()
        finally:
            self.index_reader.close()
paul@3	465
paul@0	466	# vim: tabstop=4 expandtab shiftwidth=4