iixr (annotate iixr.py in 42cc066da2fd)

iixr

Annotated iixr.py

7:42cc066da2fd

2009-08-27

Paul Boddie

Added Unicode conversion, position sorting. Added an Index class which manages files within a directory. Added measures to close methods in order to tolerate repeated invocations.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@7	21	from os import mkdir # to determine whether to create indexes
paul@7	22	from os.path import exists, join
paul@2	23	from os.path import commonprefix # to find common string prefixes
paul@3	24	from bisect import bisect_right # to find terms in the dictionary index
paul@2	25
# Constants.

# Default indexing interval used by Index.get_writer: one term in every
# INTERVAL terms written to the term information file is also written to the
# term dictionary index.
INTERVAL = 100
paul@7	29
paul@0	30	# Foundation classes.
paul@0	31
class File:

    "A basic file abstraction."

    def __init__(self, f):

        "Wrap the file object 'f' and initialise any per-file state."

        self.f = f
        self.reset()

    def reset(self):

        "Initialise per-file state. Subclasses override this as required."

        return None

    def close(self):

        "Close the wrapped file, tolerating repeated invocations."

        # Nothing to do if the file has already been closed and released.

        if self.f is None:
            return

        self.f.close()
        self.f = None
paul@0	47
class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding: seven bits
        are stored per byte, least significant group first, and the top bit of
        a byte is set when further bytes follow. A ValueError is raised for
        negative numbers.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Special case: one byte containing zero.

        if number == 0:
            self.f.write(chr(0))
            return

        # Write the number from least to most significant digits.

        encoded = []

        while number != 0:
            lsd = number & 127
            number = number >> 7

            # Flag the byte as being followed by more significant digits.

            if number != 0:
                lsd |= 128

            encoded.append(chr(lsd))

        self.f.write("".join(encoded))

    def write_string(self, s):

        """
        Write 's' to the file, recording its length. Unicode objects are written
        as UTF-8. A ValueError is raised if the (encoded) string is longer than
        255 bytes.
        """

        # Convert Unicode objects to strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        length = len(s)

        if not (0 <= length <= 255):
            raise ValueError("String %r is too long." % s)

        self.write_number(length)
        self.f.write(s)
paul@2	97
class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a number from the file, decoding the variable length encoding of
        seven bits per byte, least significant group first. An EOFError is
        raised if the file ends before the number is complete.
        """

        # Read each byte, adding it to the number.

        shift = 0
        number = 0
        more = True

        while more:
            byte = self.f.read(1)
            if not byte:
                raise EOFError

            csd = ord(byte)

            # The top bit indicates that more bytes follow.

            more = (csd & 128) != 0
            if more:
                csd &= 127

            number += (csd << shift)
            shift += 7

        return number

    def read_string(self):

        """
        Read a string from the file, returning it as a Unicode object decoded
        from UTF-8. An EOFError is raised if the file ends before the recorded
        number of bytes has been read.
        """

        length = self.read_number()
        data = self.f.read(length)

        # A short read means that the file has been truncated: report this
        # instead of returning (or failing to decode) a partial string.

        if len(data) < length:
            raise EOFError

        # Convert strings to Unicode objects.

        return unicode(data, "utf-8")
paul@2	135
paul@0	136	# Specific classes.
paul@0	137
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Reset the document number against which deltas are calculated."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'. The document
        number is stored as a delta from the previously written number, and the
        positions are stored in ascending order as deltas from each other. The
        given 'positions' collection is not modified. A ValueError is raised if
        'docnum' is less than the previously written document number.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Write the document number delta.

        self.write_number(docnum - self.last_docnum)

        # Write the number of positions.

        self.write_number(len(positions))

        # Write the position deltas, working on a sorted copy so that the
        # caller's collection is left untouched and need not be a list.

        last = 0
        for position in sorted(positions):
            self.write_number(position - last)
            last = position

        self.last_docnum = docnum

    def write_all_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file, returning the offset at which they
        were stored.
        """

        # Reset the writer and record the current file offset.

        self.reset()
        offset = self.f.tell()

        # Write the number of documents.

        self.write_number(len(doc_positions))

        # Write the positions.

        for docnum, positions in doc_positions:
            self.write_positions(docnum, positions)

        return offset
paul@0	197
class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Reset the document number to which deltas are added."

        self.last_docnum = 0

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        # The document number is stored as a delta from the previous one.

        self.last_docnum += self.read_number()

        # The number of positions precedes the positions themselves.

        count = self.read_number()

        # Each position is stored as a delta from the one before it, so
        # accumulate the deltas to recover the absolute positions.

        positions = []
        current = 0

        for _ in range(count):
            current += self.read_number()
            positions.append(current)

        return self.last_docnum, positions

    def read_all_positions(self, offset):

        """
        Read all positions from 'offset', seeking to that position in the file
        before reading. A list of (document number, position list) tuples is
        returned.
        """

        self.reset()
        self.f.seek(offset)

        # The number of document records precedes the records themselves.

        ndocuments = self.read_number()

        return [self.read_positions() for _ in range(ndocuments)]
paul@0	255
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Reset the term and offset against which entries are encoded."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        information file. Return the offset after the term information was
        written to the file.

        The term is stored as the length of the prefix shared with the
        previously written term followed by the remaining suffix; the offset is
        stored as a delta from the previously written offset. A ValueError is
        raised, before anything is written, if the term is too long or if
        'offset' is less than the previously written offset.
        """

        # Too long terms are not currently supported.

        if len(term) > 255:
            raise ValueError("Term %r is too long." % term)

        # Offsets must not decrease since only non-negative deltas can be
        # written. Check this before writing to avoid leaving a partial entry.

        delta = offset - self.last_offset

        if delta < 0:
            raise ValueError("Offset %r is less than previous offset %r." % (offset, self.last_offset))

        # Find the prefix length and term suffix.

        common = len(commonprefix([self.last_term, term]))
        suffix = term[common:]

        # Encode the suffix here so that its stored length can be checked
        # before the prefix length has been written.

        if isinstance(suffix, unicode):
            suffix = suffix.encode("utf-8")

        if len(suffix) > 255:
            raise ValueError("Term %r is too long." % term)

        # Write the prefix length and term suffix.

        self.write_number(common)
        self.write_string(suffix)

        # Write the offset delta.

        self.write_number(delta)

        self.last_term = term
        self.last_offset = offset

        return self.f.tell()
paul@3	293
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Reset the term and offset from which entries are decoded."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read a term and its position file offset from the term information file,
        returning them as a (term, offset) tuple.
        """

        # An entry starts with the length of the prefix shared with the
        # previous term, followed by the rest of the term.

        prefix_length = self.read_number()
        ending = self.read_string()

        self.last_term = self.last_term[:prefix_length] + ending

        # The offset is stored as a delta from the previous offset.

        self.last_offset = self.last_offset + self.read_number()

        return self.last_term, self.last_offset

    def go_to_term(self, term, offset, info_offset):

        "Seek past the entry for 'term' having 'offset' to 'info_offset'."

        self.f.seek(info_offset)

        # Subsequent entries are decoded relative to the given entry.

        self.last_term, self.last_offset = term, offset
paul@3	328
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Reset the term, offset and information file offset state."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, info_offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        dictionary index file, along with the 'info_offset' in the term
        information file.
        """

        # The term and position file offset are written as for the term
        # information file.

        TermWriter.write_term(self, term, offset)

        # The information file offset follows as a delta from the previous one.

        info_delta = info_offset - self.last_info_offset
        self.write_number(info_delta)

        self.last_info_offset = info_offset
paul@3	351
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Reset the term, offset and information file offset state."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, and its term information file
        offset from the term dictionary index file, returning them as a tuple.
        """

        # The term and position file offset are read as for the term
        # information file.

        term, offset = TermReader.read_term(self)

        # The information file offset follows as a delta from the previous one.

        self.last_info_offset = self.last_info_offset + self.read_number()

        return (term, offset, self.last_info_offset)
paul@3	374
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_writer, interval):

        """
        Initialise the writer with the given 'info_writer' (term information),
        'index_writer' (term dictionary index) and 'position_writer' (position
        information), writing an index entry for the first term and for every
        term 'interval' entries after that.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_writer = position_writer
        self.interval = interval

        # The number of terms written so far.

        self.entry = 0

    def write_term(self, term, offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        information file and optionally to the index, making a dictionary entry.
        """

        # The information writer returns the offset following the new entry,
        # which is what the index records.

        info_offset = self.info_writer.write_term(term, offset)

        if self.entry % self.interval == 0:
            self.index_writer.write_term(term, offset, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        offset = self.position_writer.write_all_positions(doc_positions)
        self.write_term(term, offset)

    def close(self):

        """
        Close all the underlying writers. Every writer is closed even if closing
        an earlier one raises an exception.
        """

        try:
            self.info_writer.close()
        finally:
            try:
                self.index_writer.close()
            finally:
                self.position_writer.close()
paul@3	414
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader, position_reader):

        """
        Initialise the reader with the given 'info_reader' (term information),
        'index_reader' (term dictionary index) and 'position_reader' (position
        information). The entire index is read into memory; an empty index is
        permitted and yields a dictionary in which no term is found.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_reader = position_reader

        # Read index entries until the end of the index file is reached.

        self.terms = []
        try:
            while True:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # Large numbers for ordering purposes: these make a search key sort
        # after any index entry having the same term. With an empty index there
        # is nothing to compare against, so any value will do.

        if self.terms:
            self.max_offset = self.terms[-1][1]
            self.max_info_offset = self.terms[-1][2]
        else:
            self.max_offset = 0
            self.max_info_offset = 0

    def find_term(self, term):

        """
        Find the position file offset of 'term' from the term dictionary,
        returning None if the term is not present.
        """

        i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1

        # Get the entry position providing the term or one preceding it.
        # Without such an entry, the term precedes everything in the index.

        if i == -1:
            return None

        found_term, offset, info_offset = self.terms[i]

        # Where the term is found immediately, return the offset.

        if term == found_term:
            return offset

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term. Reaching the end of the file merely
        # means that the term is not present.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset = self.info_reader.read_term()
        except EOFError:
            pass

        # If the term is found, return the offset.

        if term == found_term:
            return offset

        return None

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or None if the term is not present.
        """

        offset = self.find_term(term)

        if offset is None:
            return None

        return self.position_reader.read_all_positions(offset)

    def close(self):

        """
        Close all the underlying readers. Every reader is closed even if closing
        an earlier one raises an exception.
        """

        try:
            self.info_reader.close()
        finally:
            try:
                self.index_reader.close()
            finally:
                self.position_reader.close()
paul@3	486
class IndexWriter:

    "Building term information and writing it to the term dictionary."

    def __init__(self, dict_writer):

        """
        Initialise the writer with the given 'dict_writer' to which the
        accumulated term information is written when the writer is closed.
        """

        self.dict_writer = dict_writer

        # A mapping from terms to mappings from document numbers to lists of
        # positions.

        self.terms = {}

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.
        """

        doc_positions = self.terms.setdefault(term, {})
        doc_positions.setdefault(docnum, []).append(position)

    def close(self):

        """
        Write the accumulated terms, in order, to the term dictionary and close
        it. Repeated invocations are tolerated. The term dictionary writer is
        closed even if writing fails, in which case the exception is propagated.
        """

        if self.dict_writer is None:
            return

        try:

            # Get the terms in order, and for each term the documents in order.

            for term in sorted(self.terms):
                doc_positions = sorted(self.terms[term].items())
                self.dict_writer.write_term_positions(term, doc_positions)

        finally:
            self.dict_writer.close()
            self.dict_writer = None
paul@7	530
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index using the directory given by 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def _open_files(self, mode):

        """
        Open the "terms", "index" and "positions" files within the index
        directory using the given 'mode', returning the file objects in that
        order. Should a file fail to open, those already opened are closed
        before the exception is propagated.
        """

        opened = []
        complete = False

        try:
            for name in ("terms", "index", "positions"):
                opened.append(open(join(self.pathname, name), mode))
            complete = True
        finally:
            if not complete:
                for f in opened:
                    f.close()

        return opened

    def get_writer(self, interval=INTERVAL):

        "Return a writer, optionally using the given indexing 'interval'."

        if not exists(self.pathname):
            mkdir(self.pathname)

        tdf, tdif, tpf = self._open_files("wb")

        info_writer = TermWriter(tdf)
        index_writer = TermIndexWriter(tdif)
        positions_writer = PositionWriter(tpf)

        dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)

        self.writer = IndexWriter(dict_writer)
        return self.writer

    def get_reader(self):

        """
        Return a reader for the index. An OSError is raised if the index
        directory does not exist.
        """

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        tdf, tdif, tpf = self._open_files("rb")

        # Constructing the dictionary reader reads the index file, so make sure
        # that the files are closed if this fails.

        reader = None

        try:
            reader = TermDictionaryReader(TermReader(tdf), TermIndexReader(tdif), PositionReader(tpf))
        finally:
            if reader is None:
                for f in (tdf, tdif, tpf):
                    f.close()

        self.reader = reader
        return self.reader

    def close(self):

        """
        Close any reader and writer obtained from the index, tolerating repeated
        invocations. The writer is closed even if closing the reader fails.
        """

        try:
            if self.reader is not None:
                self.reader.close()
                self.reader = None
        finally:
            if self.writer is not None:
                self.writer.close()
                self.writer = None
paul@6	587
paul@0	588	# vim: tabstop=4 expandtab shiftwidth=4