iixr (annotate iixr/terms.py in 4c35f0aa339c)

iixr

Annotated iixr/terms.py

88:4c35f0aa339c

2011-02-03

Paul Boddie

Changed the files to have an internal array for reading and writing data.

paul@44	1	#!/usr/bin/env python
paul@44	2
paul@44	3	"""
paul@44	4	Specific classes for storing term information.
paul@44	5
paul@69	6	Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
paul@44	7
paul@44	8	This program is free software; you can redistribute it and/or modify it under
paul@44	9	the terms of the GNU General Public License as published by the Free Software
paul@44	10	Foundation; either version 3 of the License, or (at your option) any later
paul@44	11	version.
paul@44	12
paul@44	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@44	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@44	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@44	16
paul@44	17	You should have received a copy of the GNU General Public License along
paul@44	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@44	19	"""
paul@44	20
paul@44	21	from iixr.files import *
paul@47	22	from iixr.positions import *
paul@60	23	from iixr.phrases import PhraseIterator
paul@44	24	from os.path import commonprefix # to find common string prefixes
paul@44	25	from bisect import bisect_right # to find terms in the dictionary index
paul@44	26
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        """
        Initialise the state used to encode each entry relative to the
        previously written one: no previous term and a position file offset of
        zero.
        """

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file.

        Terms must be presented in strictly ascending order: a ValueError is
        raised if 'term' is less than or equal to the previously written term.
        """

        # The call form of raise is used because it is accepted by both
        # Python 2 and Python 3, unlike "raise ValueError, message".

        if term <= self.last_term:
            raise ValueError("Term %r precedes the previous term %r." % (term, self.last_term))

        # Write the prefix length and term suffix: only the part of the term
        # differing from the previous term is stored.

        common = len(commonprefix([self.last_term, term]))
        suffix = term[common:]

        self.write_number(common)
        self.write_string(suffix)

        # Write the offset delta.
        # Write the frequency.
        # Write the document frequency.

        self.write_numbers((
            offset - self.last_offset,
            frequency,
            doc_frequency
            ))

        # Remember this entry as the basis for encoding the next one.

        self.last_term = term
        self.last_offset = offset
paul@44	66
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Start decoding with no previous term and a zero position offset."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read the next entry from the term information file, returning a tuple
        of the term, its position file offset, its frequency and its document
        frequency.
        """

        # The term is stored as the number of leading characters shared with
        # the previous term, followed by the remaining characters.

        shared = self.read_number()
        term = self.last_term[:shared] + self.read_string()
        self.last_term = term

        # The position file offset is stored as a delta from the previous one.

        position_offset = self.last_offset + self.read_number()
        self.last_offset = position_offset

        # The frequency and then the document frequency follow.

        frequency = self.read_number()
        doc_frequency = self.read_number()

        return term, position_offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek to 'info_offset' and adopt 'term' and 'offset' as the previous
        entry's details, so that entries following that term can be read and
        decoded from there.
        """

        self.seek(info_offset)
        self.last_term, self.last_offset = term, offset
paul@44	113
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Reset the inherited state and the previous information file offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write an index entry for 'term': the same details as are written to the
        term information file ('offset', 'frequency', 'doc_frequency'),
        followed by the 'info_offset' locating the term in that file.
        """

        TermWriter.write_term(self, term, offset, frequency, doc_frequency)

        # The information file offset is stored as a delta from the offset
        # recorded in the previous index entry.

        delta = info_offset - self.last_info_offset
        self.write_number(delta)
        self.last_info_offset = info_offset
paul@44	136
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Reset the inherited state and the previous information file offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read the next index entry, returning a tuple of the term, its position
        file offset, its frequency, its document frequency and its offset in
        the term information file.
        """

        details = TermReader.read_term(self)

        # The information file offset is stored as a delta from the previous
        # entry's offset.

        info_offset = self.last_info_offset + self.read_number()
        self.last_info_offset = info_offset

        return details + (info_offset,)
paul@44	160
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Initialise the writer with an 'info_writer' for the term information
        file, an 'index_writer' for the term dictionary index file, a
        'position_dict_writer' for position data, and the 'interval' (in
        terms) at which index entries are written.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval

        # The number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. For every term whose entry number is a multiple
        of the interval (including the very first term), also write an index
        entry recording the information file offset reached after the term was
        written. Nothing is returned.
        """

        self.info_writer.write_term(term, offset, frequency, doc_frequency)

        if self.entry % self.interval == 0:

            # The offset is taken after writing so that a reader seeking to it
            # continues with the term following this one.

            info_offset = self.info_writer.tell()
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.

        A ValueError is raised if the position writer reports a zero frequency
        or document frequency for the term.
        """

        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)

        # The call form of raise is used because it is accepted by both
        # Python 2 and Python 3, unlike "raise ValueError, message".

        if not frequency or not doc_frequency:
            raise ValueError("Term %r has no occurrences recorded: %r" % (term, doc_positions))

        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        "Close the information, index and position writers."

        self.info_writer.close()
        self.index_writer.close()
        self.position_dict_writer.close()
paul@44	207
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader, position_dict_reader):

        """
        Initialise the reader with an 'info_reader' for the term information
        file, an 'index_reader' for the term dictionary index file and a
        'position_dict_reader' for position data. All index entries are read
        into memory immediately.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_dict_reader = position_dict_reader

        # Load every index entry: the index reader signals the end of its data
        # by raising EOFError.

        self.terms = []
        try:
            while True:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # Large numbers for ordering purposes: an offset exceeding that of any
        # index entry makes a search key sort after an entry for the same term.

        if self.terms:
            self.max_offset = self.terms[-1][1] + 1
        else:
            self.max_offset = None

    def _find_closest_entry(self, term):

        """
        Find the index entry for 'term' or, failing that, the index entry
        closest to it.

        Return the closest index entry consisting of a term, the position file
        offset, the term frequency, the document frequency, and the term details
        file offset.

        The index must not be empty: IndexError is raised otherwise.
        """

        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1

        # Get the entry position providing the term or one preceding it.
        # If no entry precedes the requested term, return the very first entry
        # as the closest.

        return self.terms[max(i, 0)]

    def _find_closest_term(self, term):

        """
        Find the offsets and frequencies of 'term' from the term dictionary or
        the closest term to it.

        Return the closest term (or the term itself), the position file offset,
        the term frequency, the document frequency, and the term details file
        offset (or None if the reader is already positioned).

        The index must not be empty: IndexError is raised otherwise.
        """

        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)

        # Where the term is found immediately, return the offset and
        # frequencies. If the term precedes every index entry, return the
        # details of the first entry.

        if term <= found_term:
            return found_term, offset, frequency, doc_frequency, info_offset

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term. Reaching the end of the file leaves
        # the details of the last term read.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
        except EOFError:
            pass

        return found_term, offset, frequency, doc_frequency, None

    def _find_term(self, term):

        """
        Find the position file offset, frequency and document frequency of
        'term' from the term dictionary, returning None if the term is not
        present.
        """

        # An empty index has no closest entry to start from.

        if not self.terms:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)

        # If the term is found, return the offset and frequencies.

        if term == found_term:
            return offset, frequency, doc_frequency
        else:
            return None

    def _get_term_and_positions(self, term, offset, frequency, doc_frequency):

        """
        Return the term plus positions details using the given 'term', 'offset',
        'frequency' and 'doc_frequency'.
        """

        return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency)

    def _get_positions(self, offset, doc_frequency):

        """
        Obtain positions from the position index 'offset' expecting a number of
        documents equal to the given 'doc_frequency'.
        """

        return self.position_dict_reader.read_term_positions(offset, doc_frequency)

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind the term information reader and return this object."

        self.rewind()
        return self

    def next(self):

        "Return the details of the next term, ending iteration at end of file."

        try:
            return self.read_term()
        except EOFError:
            raise StopIteration

    # Support the Python 3 iterator protocol as well as the Python 2 one.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Rewind the term information reader."

        self.info_reader.rewind()

    def read_term(self):

        """
        Return the next term, its frequency, its document frequency, and the
        documents and positions at which the term is found.
        """

        term, offset, frequency, doc_frequency = self.info_reader.read_term()
        return self._get_term_and_positions(term, offset, frequency, doc_frequency)

    def go_to_term(self, term):

        """
        Navigate to 'term' in the dictionary, returning the details from its
        entry. The returned details can be augmented with position information
        when presented to the _get_term_and_positions method.

        IndexError is raised if the dictionary index is empty.
        """

        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)

        # Position the reader, if necessary.

        if info_offset is not None:
            self.info_reader.go_to_term(found_term, offset, info_offset)

        return found_term, offset, frequency, doc_frequency

    # Query methods.

    def get_terms(self):

        """
        Return an iterator over all terms, starting from the beginning of the
        term information. Each item provides a term, its frequency, its
        document frequency and its positions.
        """

        return iter(self)

    def find_terms(self, term):

        "Return all terms whose values start with the value of 'term'."

        terms = []

        # An empty index means there is nothing to navigate to.

        if not self.terms:
            return terms

        found_term, offset, frequency, doc_frequency = self.go_to_term(term)

        # Read and record terms.

        try:
            # Add the found term if it starts with the specified term.

            while found_term.startswith(term):
                terms.append(found_term)
                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()

        except EOFError:
            pass

        return terms

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or an empty list if the term is not present.
        """

        t = self._find_term(term)
        if t is None:
            return []
        else:
            offset, frequency, doc_frequency = t
            return self._get_positions(offset, doc_frequency)

    def find_common_positions(self, terms):

        """
        Return the documents and positions at which all the given 'terms' are
        found, where only common documents are returned.
        """

        return PhraseIterator([self.find_positions(term) for term in terms])

    def get_frequency(self, term):

        "Return the frequency of the given 'term', or None if it is not present."

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return frequency

    def get_document_frequency(self, term):

        """
        Return the document frequency of the given 'term', or None if it is not
        present.
        """

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return doc_frequency

    def close(self):

        "Close the information, index and position readers."

        self.info_reader.close()
        self.index_reader.close()
        self.position_dict_reader.close()
paul@44	440
paul@44	441	# vim: tabstop=4 expandtab shiftwidth=4