iixr (annotate iixr/terms.py in b75bd39cf61f)

iixr

Annotated iixr/terms.py

95:b75bd39cf61f

2011-02-12

Paul Boddie

Changed cache slicing to record pointer updating.

paul@44	1	#!/usr/bin/env python
paul@44	2
paul@44	3	"""
paul@44	4	Specific classes for storing term information.
paul@44	5
paul@89	6	Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
paul@44	7
paul@44	8	This program is free software; you can redistribute it and/or modify it under
paul@44	9	the terms of the GNU General Public License as published by the Free Software
paul@44	10	Foundation; either version 3 of the License, or (at your option) any later
paul@44	11	version.
paul@44	12
paul@44	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@44	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@44	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@44	16
paul@44	17	You should have received a copy of the GNU General Public License along
paul@44	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@44	19	"""
paul@44	20
paul@44	21	from iixr.files import *
paul@47	22	from iixr.positions import *
paul@60	23	from iixr.phrases import PhraseIterator
paul@44	24	from os.path import commonprefix # to find common string prefixes
paul@44	25	from bisect import bisect_right # to find terms in the dictionary index
paul@44	26
paul@44	27	class TermWriter(FileWriter):
paul@44	28
paul@44	29	"Writing term information to files."
paul@44	30
paul@44	31	def reset(self):
paul@90	32	self.end_record()
paul@44	33	self.last_term = ""
paul@44	34	self.last_offset = 0
paul@44	35
paul@44	36	def write_term(self, term, offset, frequency, doc_frequency):
paul@44	37
paul@44	38	"""
paul@44	39	Write the given 'term', its position file 'offset', its 'frequency' and
paul@44	40	its 'doc_frequency' (number of documents in which it appears) to the
paul@55	41	term information file.
paul@44	42	"""
paul@44	43
paul@75	44	if term <= self.last_term:
paul@75	45	raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
paul@75	46
paul@44	47	# Write the prefix length and term suffix.
paul@44	48
paul@44	49	common = len(commonprefix([self.last_term, term]))
paul@44	50	suffix = term[common:]
paul@44	51
paul@44	52	self.write_number(common)
paul@44	53	self.write_string(suffix)
paul@44	54
paul@44	55	# Write the offset delta.
paul@44	56	# Write the frequency.
paul@44	57	# Write the document frequency.
paul@44	58
paul@86	59	self.write_numbers((
paul@86	60	offset - self.last_offset,
paul@86	61	frequency,
paul@86	62	doc_frequency
paul@86	63	))
paul@44	64
paul@44	65	self.last_term = term
paul@44	66	self.last_offset = offset
paul@44	67
paul@44	68	class TermReader(FileReader):
paul@44	69
paul@44	70	"Reading term information from files."
paul@44	71
paul@44	72	def reset(self):
paul@44	73	self.last_term = ""
paul@44	74	self.last_offset = 0
paul@90	75	self.begin_record()
paul@44	76
paul@44	77	def read_term(self):
paul@44	78
paul@44	79	"""
paul@44	80	Read a term, its position file offset, its frequency and its document
paul@44	81	frequency from the term information file.
paul@44	82	"""
paul@44	83
paul@44	84	# Read the prefix length and term suffix.
paul@44	85
paul@44	86	common = self.read_number()
paul@44	87	suffix = self.read_string()
paul@44	88
paul@44	89	self.last_term = self.last_term[:common] + suffix
paul@44	90
paul@44	91	# Read the offset delta.
paul@44	92
paul@44	93	self.last_offset += self.read_number()
paul@44	94
paul@44	95	# Read the frequency.
paul@44	96
paul@44	97	frequency = self.read_number()
paul@44	98
paul@44	99	# Read the document frequency.
paul@44	100
paul@44	101	doc_frequency = self.read_number()
paul@44	102
paul@44	103	return self.last_term, self.last_offset, frequency, doc_frequency
paul@44	104
paul@44	105	def go_to_term(self, term, offset, info_offset):
paul@44	106
paul@44	107	"""
paul@44	108	Seek past the entry for 'term' having 'offset' to 'info_offset'. This
paul@44	109	permits the scanning for later terms from the specified term.
paul@44	110	"""
paul@44	111
paul@69	112	self.seek(info_offset)
paul@44	113	self.last_term = term
paul@44	114	self.last_offset = offset
paul@44	115
paul@44	116	class TermIndexWriter(TermWriter):
paul@44	117
paul@44	118	"Writing term dictionary index details to files."
paul@44	119
paul@44	120	def reset(self):
paul@44	121	TermWriter.reset(self)
paul@44	122	self.last_info_offset = 0
paul@44	123
paul@44	124	def write_term(self, term, offset, frequency, doc_frequency, info_offset):
paul@44	125
paul@44	126	"""
paul@44	127	Write the given 'term', its position file 'offset', its 'frequency' and
paul@44	128	its 'doc_frequency' to the term dictionary index file, along with the
paul@44	129	'info_offset' in the term information file.
paul@44	130	"""
paul@44	131
paul@90	132	TermWriter.write_term(self, term, offset, frequency, doc_frequency)
paul@44	133
paul@44	134	# Write the information file offset delta.
paul@44	135
paul@44	136	self.write_number(info_offset - self.last_info_offset)
paul@89	137
paul@44	138	self.last_info_offset = info_offset
paul@44	139
paul@44	140	class TermIndexReader(TermReader):
paul@44	141
paul@44	142	"Reading term dictionary index details from files."
paul@44	143
paul@44	144	def reset(self):
paul@44	145	TermReader.reset(self)
paul@44	146	self.last_info_offset = 0
paul@44	147
paul@44	148	def read_term(self):
paul@44	149
paul@44	150	"""
paul@44	151	Read a term, its position file offset, its frequency, its document
paul@44	152	frequency and a term information file offset from the term dictionary
paul@44	153	index file.
paul@44	154	"""
paul@44	155
paul@90	156	term, offset, frequency, doc_frequency = TermReader.read_term(self)
paul@44	157
paul@44	158	# Read the offset delta.
paul@44	159
paul@44	160	self.last_info_offset += self.read_number()
paul@44	161
paul@44	162	return term, offset, frequency, doc_frequency, self.last_info_offset
paul@44	163
paul@44	164	class TermDictionaryWriter:
paul@44	165
paul@44	166	"Writing term dictionaries."
paul@44	167
paul@44	168	def __init__(self, info_writer, index_writer, position_dict_writer, interval):
paul@44	169	self.info_writer = info_writer
paul@44	170	self.index_writer = index_writer
paul@44	171	self.position_dict_writer = position_dict_writer
paul@44	172	self.interval = interval
paul@44	173	self.entry = 0
paul@44	174
paul@91	175	self.index_writer.reset()
paul@91	176
paul@44	177	def _write_term(self, term, offset, frequency, doc_frequency):
paul@44	178
paul@44	179	"""
paul@44	180	Write the given 'term', its position file 'offset', its 'frequency' and
paul@44	181	its 'doc_frequency' (number of documents in which it appears) to the
paul@90	182	term information file. Return the offset before the term information was
paul@44	183	written to the file.
paul@44	184	"""
paul@44	185
paul@44	186	if self.entry % self.interval == 0:
paul@90	187	self.info_writer.reset()
paul@88	188	info_offset = self.info_writer.tell()
paul@44	189	self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
paul@44	190
paul@90	191	self.info_writer.write_term(term, offset, frequency, doc_frequency)
paul@44	192	self.entry += 1
paul@44	193
paul@44	194	def write_term_positions(self, term, doc_positions):
paul@44	195
paul@44	196	"""
paul@44	197	Write the given 'term' and the 'doc_positions' recording the documents
paul@44	198	and positions at which the term is found.
paul@44	199	"""
paul@44	200
paul@44	201	offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
paul@75	202
paul@75	203	if not frequency or not doc_frequency:
paul@75	204	raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions)
paul@75	205
paul@44	206	self._write_term(term, offset, frequency, doc_frequency)
paul@44	207
paul@44	208	def close(self):
paul@44	209	self.info_writer.close()
paul@44	210	self.index_writer.close()
paul@44	211	self.position_dict_writer.close()
paul@44	212
paul@44	213	class TermDictionaryReader:
paul@44	214
paul@44	215	"Reading term dictionaries."
paul@44	216
paul@44	217	def __init__(self, info_reader, index_reader, position_dict_reader):
paul@44	218	self.info_reader = info_reader
paul@44	219	self.index_reader = index_reader
paul@44	220	self.position_dict_reader = position_dict_reader
paul@91	221
paul@91	222	self.info_reader.reset()
paul@91	223	self.index_reader.reset()
paul@91	224
paul@90	225	self.entry = 0
paul@44	226	self.terms = []
paul@44	227	try:
paul@44	228	while 1:
paul@44	229	self.terms.append(self.index_reader.read_term())
paul@44	230	except EOFError:
paul@44	231	pass
paul@44	232
paul@44	233	# Large numbers for ordering purposes.
paul@44	234
paul@44	235	if self.terms:
paul@44	236	self.max_offset = self.terms[-1][1] + 1
paul@44	237	else:
paul@44	238	self.max_offset = None
paul@44	239
paul@44	240	def _find_closest_entry(self, term):
paul@44	241
paul@44	242	"""
paul@44	243	Find the offsets and frequencies of 'term' from the term dictionary or
paul@44	244	the closest term starting with the value of 'term'.
paul@44	245
paul@44	246	Return the closest index entry consisting of a term, the position file
paul@44	247	offset, the term frequency, the document frequency, and the term details
paul@44	248	file offset.
paul@44	249	"""
paul@44	250
paul@44	251	i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
paul@44	252
paul@44	253	# Get the entry position providing the term or one preceding it.
paul@44	254	# If no entry precedes the requested term, return the very first entry
paul@44	255	# as the closest.
paul@44	256
paul@44	257	if i == -1:
paul@90	258	self.entry = 0
paul@44	259	return self.terms[0]
paul@44	260	else:
paul@90	261	self.entry = i
paul@44	262	return self.terms[i]
paul@44	263
paul@44	264	def _find_closest_term(self, term):
paul@44	265
paul@44	266	"""
paul@44	267	Find the offsets and frequencies of 'term' from the term dictionary or
paul@44	268	the closest term starting with the value of 'term'.
paul@44	269
paul@44	270	Return the closest term (or the term itself), the position file offset,
paul@44	271	the term frequency, the document frequency, and the term details file
paul@44	272	offset (or None if the reader is already positioned).
paul@44	273	"""
paul@44	274
paul@44	275	found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
paul@44	276
paul@44	277	# Where the term is found immediately, return the offset and
paul@44	278	# frequencies. If the term does not appear, return the details of the
paul@44	279	# closest entry.
paul@44	280
paul@44	281	if term <= found_term:
paul@44	282	return found_term, offset, frequency, doc_frequency, info_offset
paul@44	283
paul@44	284	# Otherwise, seek past the index term's entry in the information file
paul@44	285	# and scan for the desired term.
paul@44	286
paul@44	287	else:
paul@90	288	# Reset the term and offset for the new page.
paul@90	289	self.info_reader.go_to_term("", 0, info_offset)
paul@44	290	try:
paul@44	291	while term > found_term:
paul@90	292	found_term, offset, frequency, doc_frequency = self._read_term()
paul@44	293	except EOFError:
paul@44	294	pass
paul@44	295
paul@44	296	return found_term, offset, frequency, doc_frequency, None
paul@44	297
paul@44	298	def _find_term(self, term):
paul@44	299
paul@44	300	"""
paul@44	301	Find the position file offset and frequency of 'term' from the term
paul@44	302	dictionary.
paul@44	303	"""
paul@44	304
paul@44	305	found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
paul@44	306
paul@44	307	# If the term is found, return the offset and frequencies.
paul@44	308
paul@44	309	if term == found_term:
paul@44	310	return offset, frequency, doc_frequency
paul@44	311	else:
paul@44	312	return None
paul@44	313
paul@81	314	def _get_term_and_positions(self, term, offset, frequency, doc_frequency):
paul@81	315
paul@81	316	"""
paul@81	317	Return the term plus positions details using the given 'term', 'offset',
paul@81	318	'frequency' and 'doc_frequency'.
paul@81	319	"""
paul@81	320
paul@81	321	return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency)
paul@81	322
paul@44	323	def _get_positions(self, offset, doc_frequency):
paul@58	324
paul@58	325	"""
paul@58	326	Obtain positions from the position index 'offset' expecting a number of
paul@58	327	documents equal to the given 'doc_frequency'.
paul@58	328	"""
paul@58	329
paul@69	330	return self.position_dict_reader.read_term_positions(offset, doc_frequency)
paul@44	331
paul@44	332	# Iterator convenience methods.
paul@44	333
paul@44	334	def __iter__(self):
paul@44	335	self.rewind()
paul@44	336	return self
paul@44	337
paul@44	338	def next(self):
paul@44	339	try:
paul@44	340	return self.read_term()
paul@44	341	except EOFError:
paul@44	342	raise StopIteration
paul@44	343
paul@44	344	# Sequential access methods.
paul@44	345
paul@44	346	def rewind(self):
paul@90	347	self.entry = 0
paul@44	348	self.info_reader.rewind()
paul@44	349
paul@44	350	def read_term(self):
paul@44	351
paul@44	352	"""
paul@44	353	Return the next term, its frequency, its document frequency, and the
paul@44	354	documents and positions at which the term is found.
paul@44	355	"""
paul@44	356
paul@90	357	return self._get_term_and_positions(*self._read_term())
paul@90	358
paul@90	359	def _read_term(self):
paul@90	360
paul@90	361	try:
paul@90	362	term, offset, frequency, doc_frequency = self.info_reader.read_term()
paul@90	363	except EOFError:
paul@90	364	self.entry += 1
paul@90	365	try:
paul@90	366	term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry]
paul@90	367	except IndexError:
paul@90	368	raise EOFError
paul@90	369	else:
paul@90	370	# Reset the term and offset for the new page.
paul@90	371
paul@90	372	self.info_reader.go_to_term("", 0, info_offset)
paul@90	373
paul@90	374	# Skip the term in the information file.
paul@90	375
paul@90	376	self.info_reader.read_term()
paul@90	377
paul@90	378	return term, offset, frequency, doc_frequency
paul@81	379
paul@81	380	def go_to_term(self, term):
paul@81	381
paul@81	382	"""
paul@81	383	Navigate to 'term' in the dictionary, returning the details from its
paul@81	384	entry. The returned details can be augmented with position information
paul@81	385	when presented to the _get_term_and_positions method.
paul@81	386	"""
paul@81	387
paul@81	388	found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
paul@81	389
paul@81	390	# Position the reader, if necessary.
paul@81	391
paul@81	392	if info_offset is not None:
paul@90	393
paul@90	394	# Reset the term and offset for the new page.
paul@90	395
paul@90	396	self.info_reader.go_to_term("", 0, info_offset)
paul@90	397
paul@90	398	# Skip the term in the information file.
paul@90	399
paul@90	400	self.info_reader.read_term()
paul@81	401
paul@81	402	return found_term, offset, frequency, doc_frequency
paul@44	403
paul@44	404	# Query methods.
paul@44	405
paul@72	406	def get_terms(self):
paul@72	407
paul@72	408	"Return a list of all terms."
paul@72	409
paul@74	410	return iter(self)
paul@72	411
paul@44	412	def find_terms(self, term):
paul@44	413
paul@44	414	"Return all terms whose values start with the value of 'term'."
paul@44	415
paul@44	416	terms = []
paul@44	417
paul@81	418	found_term, offset, frequency, doc_frequency = self.go_to_term(term)
paul@44	419
paul@44	420	# Read and record terms.
paul@44	421
paul@44	422	try:
paul@44	423	# Add the found term if it starts with the specified term.
paul@44	424
paul@44	425	while found_term.startswith(term):
paul@44	426	terms.append(found_term)
paul@90	427	found_term, offset, frequency, doc_frequency = self._read_term()
paul@44	428
paul@44	429	except EOFError:
paul@44	430	pass
paul@44	431
paul@44	432	return terms
paul@44	433
paul@44	434	def find_positions(self, term):
paul@44	435
paul@44	436	"Return the documents and positions at which the given 'term' is found."
paul@44	437
paul@44	438	t = self._find_term(term)
paul@44	439	if t is None:
paul@61	440	return []
paul@44	441	else:
paul@44	442	offset, frequency, doc_frequency = t
paul@44	443	return self._get_positions(offset, doc_frequency)
paul@44	444
paul@60	445	def find_common_positions(self, terms):
paul@60	446
paul@60	447	"""
paul@60	448	Return the documents and positions at which all the given 'terms' are
paul@60	449	found, where only common documents are returned.
paul@60	450	"""
paul@60	451
paul@60	452	return PhraseIterator([self.find_positions(term) for term in terms])
paul@60	453
paul@44	454	def get_frequency(self, term):
paul@44	455
paul@44	456	"Return the frequency of the given 'term'."
paul@44	457
paul@44	458	t = self._find_term(term)
paul@44	459	if t is None:
paul@44	460	return None
paul@44	461	else:
paul@44	462	offset, frequency, doc_frequency = t
paul@44	463	return frequency
paul@44	464
paul@44	465	def get_document_frequency(self, term):
paul@44	466
paul@44	467	"Return the document frequency of the given 'term'."
paul@44	468
paul@44	469	t = self._find_term(term)
paul@44	470	if t is None:
paul@44	471	return None
paul@44	472	else:
paul@44	473	offset, frequency, doc_frequency = t
paul@44	474	return doc_frequency
paul@44	475
paul@44	476	def close(self):
paul@44	477	self.info_reader.close()
paul@44	478	self.index_reader.close()
paul@44	479	self.position_dict_reader.close()
paul@44	480
paul@44	481	# vim: tabstop=4 expandtab shiftwidth=4