iixr (annotate iixr/terms.py in 1f3986bca1a3)

iixr

Annotated iixr/terms.py

89:1f3986bca1a3

2011-02-07

Paul Boddie

Introduced record-oriented reading and writing of files where an array is populated in a single read from a file or flushed to a buffer in a single write operation. Moved various data representation operations into the data module, removing explicit object size concerns from the higher-level modules, replacing them with usage of adder and subtractor functions where appropriate. Made the vint caches lists instead of dictionaries. Enforced tuples as the input representation of serialised sequence values.

paul@44	1	#!/usr/bin/env python
paul@44	2
paul@44	3	"""
paul@44	4	Specific classes for storing term information.
paul@44	5
paul@89	6	Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
paul@44	7
paul@44	8	This program is free software; you can redistribute it and/or modify it under
paul@44	9	the terms of the GNU General Public License as published by the Free Software
paul@44	10	Foundation; either version 3 of the License, or (at your option) any later
paul@44	11	version.
paul@44	12
paul@44	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@44	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@44	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@44	16
paul@44	17	You should have received a copy of the GNU General Public License along
paul@44	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@44	19	"""
paul@44	20
paul@44	21	from iixr.files import *
paul@47	22	from iixr.positions import *
paul@60	23	from iixr.phrases import PhraseIterator
paul@44	24	from os.path import commonprefix # to find common string prefixes
paul@44	25	from bisect import bisect_right # to find terms in the dictionary index
paul@44	26
class TermWriter(FileWriter):

    """
    Writing term information to files.

    Terms must be written in strictly ascending order. Each term is stored as
    the length of the prefix it shares with the previously written term plus
    the remaining suffix, and each position file offset is stored as the
    difference from the previously written offset.
    """

    def reset(self):

        "Forget the previously written term and offset."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file, bracketed by begin_record and end_record.

        Raise ValueError if 'term' does not sort after the previous term.
        """

        self.begin_record()
        self._write_term(term, offset, frequency, doc_frequency)
        self.end_record()

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Performs the term writing for 'write_term' without beginning or ending
        a record, so that subclasses can append further fields to the entry.
        """

        # Entries are prefix-encoded against the previous term, so terms must
        # be supplied in strictly ascending order.

        if term <= self.last_term:
            raise ValueError("Term %r precedes the previous term %r." % (term, self.last_term))

        # Write the prefix length and term suffix.

        common = len(commonprefix([self.last_term, term]))
        suffix = term[common:]

        self.write_number(common)
        self.write_string(suffix)

        # Write the offset delta.
        # Write the frequency.
        # Write the document frequency.

        self.write_numbers((
            offset - self.last_offset,
            frequency,
            doc_frequency
            ))

        # Remember this entry as the basis for encoding the next one.

        self.last_term = term
        self.last_offset = offset
paul@44	74
class TermReader(FileReader):

    """
    Reading term information from files.

    The reader keeps the most recently read term and position file offset,
    since each stored entry only holds a shared prefix length, a suffix and an
    offset delta relative to the entry before it.
    """

    def reset(self):

        "Forget the previously read term and offset."

        self.last_offset = 0
        self.last_term = ""

    def read_term(self):

        """
        Read the next entry from the term information file, returning a tuple
        of the term, its position file offset, its frequency and its document
        frequency. The record is ended even if reading fails.
        """

        self.begin_record()
        try:
            entry = self._read_term()
        finally:
            self.end_record()
        return entry

    def _read_term(self):

        """
        Performs the term reading for 'read_term' without beginning or ending
        a record, so that subclasses can read further fields from the entry.
        """

        # The term is rebuilt from the retained prefix of the previous term
        # and the stored suffix.

        prefix_length = self.read_number()
        tail = self.read_string()
        term = self.last_term[:prefix_length] + tail
        self.last_term = term

        # The offset is stored as a delta from the previous offset.

        offset = self.last_offset + self.read_number()
        self.last_offset = offset

        # The frequency and the document frequency follow.

        frequency = self.read_number()
        doc_frequency = self.read_number()

        return term, offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek to 'info_offset', treating 'term' and its position file 'offset'
        as the entry just read. This positions the reader past the entry for
        'term' so that later terms can be scanned from there.
        """

        self.seek(info_offset)

        # Subsequent entries are decoded relative to these values.

        self.last_term, self.last_offset = term, offset
paul@44	131
class TermIndexWriter(TermWriter):

    """
    Writing term dictionary index details to files.

    Index entries extend the term information entries with a term information
    file offset, stored as a delta from the previously written one.
    """

    def reset(self):

        "Forget the previously written term, offset and information file offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write 'term' together with its position file 'offset', 'frequency' and
        'doc_frequency' to the term dictionary index file, followed by the
        'info_offset' locating the entry in the term information file.
        """

        self.begin_record()

        # The common fields are written exactly as in the information file.

        TermWriter._write_term(self, term, offset, frequency, doc_frequency)

        # The information file offset is appended as a delta.

        info_delta = info_offset - self.last_info_offset
        self.write_number(info_delta)

        self.end_record()

        self.last_info_offset = info_offset
paul@44	157
class TermIndexReader(TermReader):

    """
    Reading term dictionary index details from files.

    Index entries extend the term information entries with a term information
    file offset, stored as a delta from the previously read one.
    """

    def reset(self):

        "Forget the previously read term, offset and information file offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency, its document
        frequency and a term information file offset from the term dictionary
        index file, returning them as a tuple in that order.

        As with TermReader.read_term, the record is ended even if reading
        fails part of the way through an entry.
        """

        self.begin_record()
        try:
            term, offset, frequency, doc_frequency = TermReader._read_term(self)

            # Read the information file offset delta.

            self.last_info_offset += self.read_number()
        finally:
            self.end_record()

        return term, offset, frequency, doc_frequency, self.last_info_offset
paul@44	183
class TermDictionaryWriter:

    """
    Writing term dictionaries.

    A dictionary is written through three collaborating writers: one for the
    term information file, one for the index over that file, and one for the
    positions of each term.
    """

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Initialise the writer with the given 'info_writer', 'index_writer' and
        'position_dict_writer'. Every 'interval' entries, starting with the
        first, an entry is also written to the index.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval

        # The number of entries written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. Where the entry falls on the index interval,
        also write it to the index along with the information file position
        reported after the entry was written.
        """

        self.info_writer.write_term(term, offset, frequency, doc_frequency)

        if self.entry % self.interval == 0:

            # The recorded position lies after the entry just written.

            info_offset = self.info_writer.tell()
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.

        Raise ValueError if the position writer reports a zero frequency or
        document frequency for 'doc_positions'.
        """

        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)

        if not frequency or not doc_frequency:
            raise ValueError("Term %r has no occurrences recorded: %r" % (term, doc_positions))

        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        "Close all three writers, attempting each even if an earlier one fails."

        try:
            self.info_writer.close()
        finally:
            try:
                self.index_writer.close()
            finally:
                self.position_dict_writer.close()
paul@44	230
class TermDictionaryReader:

    """
    Reading term dictionaries.

    The complete index is loaded when the reader is created. Lookups find the
    nearest index entry by bisection and then scan the term information file
    from that entry.
    """

    def __init__(self, info_reader, index_reader, position_dict_reader):

        """
        Initialise the reader with the given 'info_reader', 'index_reader' and
        'position_dict_reader', reading every entry from the index.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_dict_reader = position_dict_reader

        # Read index entries until the index reader signals the end of input.

        self.terms = []
        try:
            while 1:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # Large numbers for ordering purposes: an offset beyond that of the
        # last index entry makes a search key sort after an entry for the
        # same term.

        if self.terms:
            self.max_offset = self.terms[-1][1] + 1
        else:
            self.max_offset = None

    def _find_closest_entry(self, term):

        """
        Find the index entry for 'term' or, failing that, the index entry
        closest to it.

        Return the closest index entry consisting of a term, the position file
        offset, the term frequency, the document frequency, and the term details
        file offset.

        Raise IndexError if the index is empty.
        """

        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1

        # Get the entry position providing the term or one preceding it.
        # If no entry precedes the requested term, return the very first entry
        # as the closest.

        if i == -1:
            return self.terms[0]
        else:
            return self.terms[i]

    def _find_closest_term(self, term):

        """
        Find the offsets and frequencies of 'term' from the term dictionary or
        of the closest term found when scanning for it.

        Return the closest term (or the term itself), the position file offset,
        the term frequency, the document frequency, and the term details file
        offset (or None if the reader is already positioned).

        Raise IndexError if the index is empty.
        """

        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)

        # Where the term is found immediately, return the offset and
        # frequencies. If the term does not appear, return the details of the
        # closest entry.

        if term <= found_term:
            return found_term, offset, frequency, doc_frequency, info_offset

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term.

        else:
            self.info_reader.go_to_term(found_term, offset, info_offset)
            try:
                while term > found_term:
                    found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
            except EOFError:

                # The end of the file was reached: the last term read is the
                # closest available.

                pass

            return found_term, offset, frequency, doc_frequency, None

    def _find_term(self, term):

        """
        Find the position file offset and frequencies of 'term' from the term
        dictionary, returning a tuple of the offset, frequency and document
        frequency, or None if the term is not present.
        """

        # An empty dictionary contains no terms at all.

        if not self.terms:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)

        # If the term is found, return the offset and frequencies.

        if term == found_term:
            return offset, frequency, doc_frequency
        else:
            return None

    def _get_term_and_positions(self, term, offset, frequency, doc_frequency):

        """
        Return the term plus positions details using the given 'term', 'offset',
        'frequency' and 'doc_frequency'.
        """

        return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency)

    def _get_positions(self, offset, doc_frequency):

        """
        Obtain positions from the position index 'offset' expecting a number of
        documents equal to the given 'doc_frequency'.
        """

        return self.position_dict_reader.read_term_positions(offset, doc_frequency)

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind the term information reader and return this object as an iterator."

        self.rewind()
        return self

    def next(self):

        "Return the next entry as provided by read_term, stopping at the end of the file."

        try:
            return self.read_term()
        except EOFError:
            raise StopIteration

    # Support the iterator protocol of Python 3 as well as that of Python 2.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Rewind the term information reader."

        self.info_reader.rewind()

    def read_term(self):

        """
        Return the next term, its frequency, its document frequency, and the
        documents and positions at which the term is found.
        """

        term, offset, frequency, doc_frequency = self.info_reader.read_term()
        return self._get_term_and_positions(term, offset, frequency, doc_frequency)

    def go_to_term(self, term):

        """
        Navigate to 'term' in the dictionary, returning the details from its
        entry. The returned details can be augmented with position information
        when presented to the _get_term_and_positions method.

        Raise IndexError if the index is empty.
        """

        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)

        # Position the reader, if necessary.

        if info_offset is not None:
            self.info_reader.go_to_term(found_term, offset, info_offset)

        return found_term, offset, frequency, doc_frequency

    # Query methods.

    def get_terms(self):

        """
        Return an iterator over all entries in the dictionary, starting from
        the first. Each item is a tuple as returned by read_term.
        """

        return iter(self)

    def find_terms(self, term):

        "Return all terms whose values start with the value of 'term'."

        terms = []

        # An empty dictionary cannot provide any matching terms.

        if not self.terms:
            return terms

        found_term, offset, frequency, doc_frequency = self.go_to_term(term)

        # Read and record terms.

        try:
            # Add the found term if it starts with the specified term.

            while found_term.startswith(term):
                terms.append(found_term)
                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()

        except EOFError:
            pass

        return terms

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or an empty list if the term is not present.
        """

        t = self._find_term(term)
        if t is None:
            return []
        else:
            offset, frequency, doc_frequency = t
            return self._get_positions(offset, doc_frequency)

    def find_common_positions(self, terms):

        """
        Return the documents and positions at which all the given 'terms' are
        found, where only common documents are returned.
        """

        return PhraseIterator([self.find_positions(term) for term in terms])

    def get_frequency(self, term):

        "Return the frequency of the given 'term', or None if it is not present."

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return frequency

    def get_document_frequency(self, term):

        "Return the document frequency of the given 'term', or None if it is not present."

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return doc_frequency

    def close(self):

        "Close all three readers, attempting each even if an earlier one fails."

        try:
            self.info_reader.close()
        finally:
            try:
                self.index_reader.close()
            finally:
                self.position_dict_reader.close()
paul@44	463
paul@44	464	# vim: tabstop=4 expandtab shiftwidth=4