iixr (annotate iixr/terms.py in fad9698e2c46)

iixr

Annotated iixr/terms.py

44:fad9698e2c46

2009-09-15

Paul Boddie

Made iixr a package with several submodules.

paul@44	1	#!/usr/bin/env python
paul@44	2
paul@44	3	"""
paul@44	4	Specific classes for storing term information.
paul@44	5
paul@44	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@44	7
paul@44	8	This program is free software; you can redistribute it and/or modify it under
paul@44	9	the terms of the GNU General Public License as published by the Free Software
paul@44	10	Foundation; either version 3 of the License, or (at your option) any later
paul@44	11	version.
paul@44	12
paul@44	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@44	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@44	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@44	16
paul@44	17	You should have received a copy of the GNU General Public License along
paul@44	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@44	19	"""
paul@44	20
paul@44	21	from iixr.files import *
paul@44	22	from os.path import commonprefix # to find common string prefixes
paul@44	23	from bisect import bisect_right # to find terms in the dictionary index
paul@44	24
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Forget the previously written term and position file offset."

        self.last_offset = 0
        self.last_term = ""

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Append an entry for 'term' to the term information file.

        The entry consists of the length of the prefix that 'term' shares with
        the previously written term, the remaining suffix of 'term', the
        difference between the position file 'offset' and the previously
        written offset, the 'frequency' and the 'doc_frequency' (number of
        documents in which the term appears), in that order.

        Return the result of 'tell' once the entry has been written.
        """

        # Store only the part of the term differing from the previous term.

        shared = len(commonprefix([self.last_term, term]))
        self.write_number(shared)
        self.write_string(term[shared:])

        # Store the offset as a delta, then the two frequencies.

        for value in (offset - self.last_offset, frequency, doc_frequency):
            self.write_number(value)

        # Remember this entry so that the next one can be encoded against it.

        self.last_term, self.last_offset = term, offset

        return self.tell()
paul@44	66
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Forget the previously read term and position file offset."

        self.last_offset = 0
        self.last_term = ""

    def read_term(self):

        """
        Read the next entry from the term information file and return a tuple
        of the form (term, position file offset, frequency, document
        frequency).

        The term is rebuilt from the previously read term and the stored
        prefix length and suffix; the offset is rebuilt by adding the stored
        delta to the previously read offset.
        """

        # Rebuild the term: a prefix of the previous term plus a new suffix.

        prefix = self.last_term[:self.read_number()]
        self.last_term = prefix + self.read_string()

        # Rebuild the offset from the stored delta.

        delta = self.read_number()
        self.last_offset = self.last_offset + delta

        # The frequency is stored first, then the document frequency.

        frequency, doc_frequency = self.read_number(), self.read_number()

        return self.last_term, self.last_offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek to 'info_offset' and treat 'term' and 'offset' as the most
        recently read term and position file offset, so that the entries
        which follow can be decoded relative to them.
        """

        self.seek(info_offset)
        self.last_term, self.last_offset = term, offset
paul@44	113
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Reset the inherited state and the last information file offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write an entry for 'term' to the term dictionary index file: the
        details written by TermWriter.write_term ('term', position file
        'offset', 'frequency' and 'doc_frequency') followed by 'info_offset',
        the term information file offset, stored as the difference from the
        previously written 'info_offset'.
        """

        TermWriter.write_term(self, term, offset, frequency, doc_frequency)

        # Append the information file offset as a delta and remember it.

        info_delta = info_offset - self.last_info_offset
        self.write_number(info_delta)
        self.last_info_offset = info_offset
paul@44	136
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Reset the inherited state and the last information file offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read the next entry from the term dictionary index file and return a
        tuple of the form (term, position file offset, frequency, document
        frequency, term information file offset).
        """

        details = TermReader.read_term(self)

        # The information file offset follows, stored as a delta.

        self.last_info_offset += self.read_number()

        return details + (self.last_info_offset,)
paul@44	160
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Initialise the writer with an 'info_writer' for the term information
        file, an 'index_writer' for the term dictionary index file, and a
        'position_dict_writer' for the term positions. Every term whose entry
        number is a multiple of 'interval' (starting with the very first term)
        is also written to the index.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval

        # The number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file and, for every 'interval' entries, to the term
        dictionary index file as well. Return the value returned by the
        information writer: the offset after the term information was written
        to the file.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)

        # Index entries record the information file offset found after the
        # term's own entry.

        if self.entry % self.interval == 0:
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

        return info_offset

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        """
        Close the information, index and position writers. Each writer is
        closed even if closing an earlier one raises an exception; the
        exception then propagates once all writers have been attempted.
        """

        try:
            self.info_writer.close()
        finally:
            try:
                self.index_writer.close()
            finally:
                self.position_dict_writer.close()
paul@44	202
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader, position_dict_reader):

        """
        Initialise the reader with an 'info_reader' for the term information
        file, an 'index_reader' for the term dictionary index file, and a
        'position_dict_reader' for the term positions. All entries available
        from the 'index_reader' are read immediately and kept in memory.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_dict_reader = position_dict_reader

        # Read index entries until the index reader signals the end of file.

        self.terms = []
        try:
            while True:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # Large numbers for ordering purposes.

        if self.terms:
            self.max_offset = self.terms[-1][1] + 1
        else:
            self.max_offset = None

    def _find_closest_entry(self, term):

        """
        Find the index entry for 'term' or, failing that, the index entry most
        closely preceding 'term'. If no index entry precedes 'term', the very
        first index entry is used.

        Return the index entry consisting of a term, the position file offset,
        the term frequency, the document frequency, and the term details file
        offset. Return None if the index is empty.
        """

        # An empty index means an empty dictionary: there is no entry to give.

        if not self.terms:
            return None

        # Using the maximum offset places the probe after any entry having the
        # same term, so that the entry itself is selected.

        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1

        # Where no entry precedes the requested term (i == -1), the very first
        # entry is the closest.

        return self.terms[max(i, 0)]

    def _find_closest_term(self, term):

        """
        Find the offsets and frequencies of 'term' from the term dictionary or
        the closest term following it.

        Return the closest term (or the term itself), the position file offset,
        the term frequency, the document frequency, and the term details file
        offset (or None if the reader is already positioned). Return None
        instead of such a tuple if the dictionary is empty.
        """

        entry = self._find_closest_entry(term)

        if entry is None:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = entry

        # Where the term is found immediately, return the offset and
        # frequencies. If the term precedes the closest entry, return the
        # details of that entry.

        if term <= found_term:
            return found_term, offset, frequency, doc_frequency, info_offset

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term. Reaching the end of the file leaves
        # the details of the last term read.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
        except EOFError:
            pass

        return found_term, offset, frequency, doc_frequency, None

    def _find_term(self, term):

        """
        Find the position file offset and frequencies of 'term' from the term
        dictionary. Return a tuple of the form (offset, frequency, document
        frequency), or None if 'term' is not present.
        """

        closest = self._find_closest_term(term)

        if closest is None:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = closest

        # If the term is found, return the offset and frequencies.

        if term == found_term:
            return offset, frequency, doc_frequency

        return None

    def _get_positions(self, offset, doc_frequency):

        "Return what the position reader provides for 'offset' and 'doc_frequency'."

        return self.position_dict_reader.read_term_positions(offset, doc_frequency)

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind the information reader and return this object as the iterator."

        self.rewind()
        return self

    def next(self):

        "Return the result of 'read_term', ending the iteration at end of file."

        try:
            return self.read_term()
        except EOFError:
            raise StopIteration

    # Provide the iterator method name used by Python 3 alongside 'next'.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Rewind the information reader."

        self.info_reader.rewind()

    def read_term(self):

        """
        Return the next term, its frequency, its document frequency, and the
        documents and positions at which the term is found.
        """

        term, offset, frequency, doc_frequency = self.info_reader.read_term()
        positions = self._get_positions(offset, doc_frequency)
        return term, frequency, doc_frequency, positions

    # Query methods.

    def find_terms(self, term):

        """
        Return all terms whose values start with the value of 'term'. An empty
        list is returned if there are no such terms or if the dictionary is
        empty.
        """

        terms = []

        closest = self._find_closest_term(term)

        if closest is None:
            return terms

        found_term, offset, frequency, doc_frequency, info_offset = closest

        # Position the reader, if necessary.

        if info_offset is not None:
            self.info_reader.go_to_term(found_term, offset, info_offset)

        # Read and record terms.

        try:
            # Add the found term if it starts with the specified term.

            while found_term.startswith(term):
                terms.append(found_term)
                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()

        except EOFError:
            pass

        return terms

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or None if 'term' is not present.
        """

        t = self._find_term(term)
        if t is None:
            return None

        offset, frequency, doc_frequency = t
        return self._get_positions(offset, doc_frequency)

    def get_frequency(self, term):

        "Return the frequency of the given 'term', or None if it is not present."

        t = self._find_term(term)
        if t is None:
            return None

        offset, frequency, doc_frequency = t
        return frequency

    def get_document_frequency(self, term):

        """
        Return the document frequency of the given 'term', or None if it is
        not present.
        """

        t = self._find_term(term)
        if t is None:
            return None

        offset, frequency, doc_frequency = t
        return doc_frequency

    def close(self):

        """
        Close the information, index and position readers. Each reader is
        closed even if closing an earlier one raises an exception; the
        exception then propagates once all readers have been attempted.
        """

        try:
            self.info_reader.close()
        finally:
            try:
                self.index_reader.close()
            finally:
                self.position_dict_reader.close()
paul@44	394
paul@44	395	# vim: tabstop=4 expandtab shiftwidth=4