iixr (annotate iixr.py in 628c5c388109)

iixr

Annotated iixr.py

40:628c5c388109

2009-09-12

Paul Boddie

Moved cache-affected writing methods into the FileWriter class. Fixed cache flushing in FileWriter to use the cache length, not the number of list elements. Introduced caching into the FileReader class. Introduced a seek method into FileReader in order to work with the caching, altering iterator construction.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@19	21	from os import dup, fdopen # independent iterator access to files
paul@12	22	from os import listdir, mkdir # index and partition discovery
paul@14	23	from os import remove, rename # partition manipulation
paul@7	24	from os.path import exists, join
paul@2	25	from os.path import commonprefix # to find common string prefixes
paul@3	26	from bisect import bisect_right # to find terms in the dictionary index
paul@10	27	import bz2, zlib # for field compression
paul@33	28	from itermerge import itermerge
paul@2	29
paul@21	30	try:
paul@21	31	set
paul@21	32	except NameError:
paul@21	33	from sets import Set as set
paul@21	34
paul@7	35	# Constants.
paul@7	36
# Intervals between index entries for terms, documents and fields.

TERM_INTERVAL = 100
DOCUMENT_INTERVAL = 100
FIELD_INTERVAL = 100
FLUSH_INTERVAL = 10000

# Names of the files making up the term and field data.

TERM_FILENAMES = ("terms", "terms_index", "positions", "positions_index")
FIELD_FILENAMES = ("fields", "fields_index")

# Compression functions in order of preference, each with the flag character
# recorded alongside the data, and the matching decompression functions keyed
# by that flag.

compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress),
    ]
decompressors = dict([
    ("b", bz2.decompress),
    ("z", zlib.decompress),
    ])
paul@10	47
paul@35	48	# Utility functions.
paul@35	49
try:
    from vint import vint as _vint

    def vint(number):

        """
        Write 'number' as a variable-length integer, using the optional 'vint'
        extension module to do the encoding.

        Raise ValueError if 'number' is negative.
        """

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        return _vint(number)

except ImportError:

    def vint(number):

        """
        Write 'number' as a variable-length integer, returning a string.

        Each character holds seven bits of the number, least significant group
        first, with the top bit set on every character except the last.

        Raise ValueError if 'number' is negative.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Special case: one byte containing a 7-bit number.

        if number < 128:
            return chr(number)

        # Write the number from least to most significant digits.

        encoded = []

        while number != 0:
            lsd = number & 127
            number >>= 7

            # Flag the presence of further digits.

            if number != 0:
                lsd |= 128
            encoded.append(chr(lsd))

        return "".join(encoded)
paul@35	92
paul@0	93	# Foundation classes.
paul@0	94
class File:

    "A basic file abstraction wrapping an open file object."

    def __init__(self, f):

        "Initialise the abstraction using the open file object 'f'."

        self.f = f
        self.reset()

    def reset(self):

        "To be used to reset the state of the reader or writer between records."

        pass

    def rewind(self):

        "Seek to the start of the file and reset any per-record state."

        self.seek(0)
        self.reset()

    def seek(self, offset):

        "To be defined by readers."

        pass

    def flush(self):

        "To be defined by writers."

        pass

    def close(self):

        """
        Flush any pending data and close the underlying file. Closing an
        object which has already been closed does nothing.

        Any exception raised when flushing is propagated, but only after the
        file has been closed.
        """

        if self.f is None:
            return

        # Release the file handle even if flushing fails, so that a failed
        # write cannot leave the file open.

        try:
            self.flush()
        finally:
            f, self.f = self.f, None
            f.close()
paul@0	130
class FileWriter(File):

    """
    Writing basic data types to files.

    Written data is collected in a cache and only passed to the underlying file
    once at least 'cache_limit' characters have accumulated, or when 'flush' is
    called (which 'close' does).
    """

    # The amount of cached data causing the cache to be written to the file.
    # Subclasses or instances may override this.

    cache_limit = 1000

    def __init__(self, f):

        "Initialise the writer using the open file object 'f'."

        File.__init__(self, f)
        self.cache = []
        self.cache_length = 0

    def write_number(self, number):

        "Write 'number' to the file using a variable length encoding."

        self.write(vint(number))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file, recording its length and compressing the string
        if 'compress' is set to a true value.

        With compression requested, the data is preceded by a flag character
        identifying the compressor used, or "-" where no compressor produced
        a shorter string. Without compression, no flag is written.
        """

        # Convert Unicode objects to strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        # Compress the string if requested.

        if compress:
            for flag, fn in compressors:
                cs = fn(s)

                # Take the first string shorter than the original.

                if len(cs) < len(s):
                    s = cs
                    break
            else:
                flag = "-"

        else:
            flag = ""

        # Write the length of the data before the data itself.

        self.write(flag + vint(len(s)) + s)

    # Cache-affected methods.

    def write(self, s):

        "Add 's' to the cache, flushing the cache once it is large enough."

        self.cache.append(s)
        self.cache_length += len(s)
        if self.cache_length >= self.cache_limit:
            self.flush()

    def tell(self):

        """
        Return the file position at which the next written data will appear,
        taking account of data still held in the cache.
        """

        return self.f.tell() + self.cache_length

    def flush(self):

        "Write any cached data to the underlying file and empty the cache."

        # Avoid a pointless write where nothing has been cached.

        if self.cache:
            self.f.write("".join(self.cache))
            self.cache = []
            self.cache_length = 0
paul@40	195
class FileReader(File):

    """
    Reading basic data types from files.

    Data is read from the underlying file in larger pieces and held in a cache
    from which individual read requests are satisfied.
    """

    def __init__(self, f):

        "Initialise the reader using the open file object 'f'."

        File.__init__(self, f)
        self.cache = ""
        self.cache_length = 0

    def read_number(self):

        """
        Read a number stored using a variable length encoding from the file.
        Raise EOFError where the data runs out.
        """

        number = 0
        shift = 0
        read = self.read

        # Each byte provides seven bits, least significant group first, with
        # the top bit indicating that more bytes follow.

        try:
            while 1:
                csd = ord(read(1))
                if csd & 128:
                    number += ((csd & 127) << shift)
                    shift += 7
                else:
                    return number + (csd << shift)

        # An empty read result cannot be converted by ord.

        except TypeError:
            raise EOFError

    def read_string(self, decompress=0):

        """
        Read a string from the file, decompressing the stored data if
        'decompress' is set to a true value. A Unicode object is returned.
        """

        # A compression flag only precedes the data if compression was
        # requested when writing.

        if decompress:
            flag = self.read(1)
        else:
            flag = "-"

        data = self.read(self.read_number())

        # Perform decompression if applicable.

        if flag != "-":
            data = decompressors[flag](data)

        # Convert strings to Unicode objects.

        return unicode(data, "utf-8")

    # Cache-affected methods.

    def read(self, n):

        """
        Return up to 'n' characters, topping up the cache from the file where
        it cannot satisfy the request.
        """

        shortfall = n - self.cache_length
        if shortfall > 0:
            fetched = self.f.read(max(shortfall, 1000))
            self.cache += fetched
            self.cache_length += len(fetched)

        taken, self.cache = self.cache[:n], self.cache[n:]
        self.cache_length -= len(taken)
        return taken

    def tell(self):

        """
        Return the file position of the next unread data, taking account of
        data already read into the cache.
        """

        return self.f.tell() - self.cache_length

    def seek(self, offset):

        "Move to 'offset' in the file, discarding any cached data."

        self.f.seek(offset)
        self.cache = ""
        self.cache_length = 0
paul@40	276
class FileOpener:

    "Opening files using their filenames."

    def __init__(self, filename):

        "Remember the 'filename' of the file to be opened."

        self.filename = filename

    def open(self, mode):

        "Return a newly opened file object for the file using 'mode'."

        return open(self.filename, mode)

    def close(self):

        "Do nothing: the opener itself holds no open file."

        pass
paul@34	289
paul@9	290	# Specific classes for storing term and position information.
paul@0	291
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number so that the next one is written in full."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.
        Return the offset of the written record.

        The 'positions' list is sorted in place. ValueError is raised if
        'docnum' is less than the previously written document number.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Record the offset of this record.

        offset = self.tell()

        # Make sure that the positions are sorted.

        positions.sort()

        # Encode each position as the difference from its predecessor.

        deltas = []
        previous = 0

        for position in positions:
            deltas.append(vint(position - previous))
            previous = position

        # The record consists of the document number delta, the number of
        # positions, and then the position deltas.

        record = [vint(docnum - self.last_docnum), vint(len(positions))]
        record.extend(deltas)
        self.write("".join(record))

        self.last_docnum = docnum
        return offset
paul@0	334
class PositionOpener(FileOpener):

    "Reading position information from files."

    def read_term_positions(self, offset, count):

        """
        Return an iterator reading positions from 'offset', seeking to that
        position in the file before reading. The number of documents available
        for reading is limited to 'count'.
        """

        # Open the file afresh so that the iterator has its own file object.

        return PositionIterator(self.open("rb"), offset, count)
paul@19	351
class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        "Forget the last document number and position file offset."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum, 'pos_offset' and document 'count' to the
        position index file. Return the offset of the written record.
        """

        # Record the offset of this record.

        offset = self.tell()

        # Encode the document number delta.

        docnum_delta = vint(docnum - self.last_docnum)
        self.last_docnum = docnum

        # Encode the position file offset delta.

        pos_offset_delta = vint(pos_offset - self.last_pos_offset)
        self.last_pos_offset = pos_offset

        # Write both deltas followed by the document count.

        self.write(docnum_delta + pos_offset_delta + vint(count))

        return offset
paul@19	391
class PositionIndexOpener(FileOpener):

    "Reading position index information from files."

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator reading index entries from 'offset', seeking to that
        position in the file before reading. The number of documents available
        for reading is limited to 'doc_frequency'.
        """

        # Open the file afresh so that the iterator has its own file object.

        return PositionIndexIterator(self.open("rb"), offset, doc_frequency)
paul@34	408
paul@34	409	# Iterators for position-related files.
paul@34	410
class IteratorBase:

    "Common support for iterators yielding a limited number of documents."

    def __init__(self, count):

        "Initialise the iterator to provide 'count' documents."

        self.replenish(count)

    def replenish(self, count):

        "Permit a further 'count' documents to be read."

        self.count = count
        self.read_documents = 0

    # Sequence methods.

    def __len__(self):
        return self.count

    def sort(self):

        "Do nothing: stored document positions are already sorted."

        pass

    # Iterator methods.

    def __iter__(self):
        return self
paul@34	428
class PositionIterator(FileReader, IteratorBase):

    "Iterating over document positions."

    def __init__(self, f, offset, count):

        """
        Initialise the iterator to read 'count' documents from the file object
        'f' starting at 'offset'.
        """

        FileReader.__init__(self, f)
        IteratorBase.__init__(self, count)
        self.seek(offset)

    def reset(self):

        "Forget the last document number read."

        self.last_docnum = 0

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        # The document number is stored as a delta from the last number.

        self.last_docnum += self.read_number()

        # The number of positions precedes the positions themselves.

        remaining = self.read_number()

        # Each position is stored as a delta from the previous position, so
        # accumulate the deltas to obtain absolute positions.

        positions = []
        position = 0

        while remaining > 0:
            position += self.read_number()
            positions.append(position)
            remaining -= 1

        return self.last_docnum, positions

    def next(self):

        "Read positions for a single document."

        if self.read_documents >= self.count:
            raise StopIteration

        self.read_documents += 1
        return self.read_positions()
paul@34	476
class PositionIndexIterator(FileReader, IteratorBase):

    "Iterating over position index entries."

    def __init__(self, f, offset, count):

        """
        Initialise the iterator to read index entries covering 'count'
        documents from the file object 'f' starting at 'offset'.
        """

        FileReader.__init__(self, f)
        IteratorBase.__init__(self, count)
        self.seek(offset)

        # No section has been read yet.

        self.section_count = 0

    def reset(self):

        "Forget the last document number and position file offset read."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read a document number, a position file offset for the position index
        file, and the number of documents in a section of that file.
        """

        # Both the document number and the offset are stored as deltas.

        self.last_docnum += self.read_number()
        self.last_pos_offset += self.read_number()

        # The document count is stored as it is.

        section_count = self.read_number()

        return self.last_docnum, self.last_pos_offset, section_count

    def next(self):

        """
        Read the next index entry, first accounting for the documents covered
        by the previously returned entry.
        """

        self.read_documents += self.section_count

        if self.read_documents >= self.count:
            raise StopIteration

        entry = self.read_positions()
        self.section_count = entry[2]
        return entry
paul@19	522
class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Initialise the writer with a 'position_writer', a
        'position_index_writer' and the 'interval' (number of documents) at
        which index entries are written.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file.

        Add some records to the index, making dictionary entries.

        Return a tuple containing the offset of the written data, the frequency
        (number of positions), and document frequency (number of documents) for
        the term involved.
        """

        position_writer = self.position_writer
        index_writer = self.position_index_writer
        interval = self.interval

        # Reset the writers.

        position_writer.reset()
        index_writer.reset()

        index_offset = None
        frequency = 0
        count = 0

        # Details of the first document in the section being written.

        section_docnum = None
        section_offset = None

        doc_positions.sort()

        for docnum, positions in doc_positions:
            pos_offset = position_writer.write_positions(docnum, positions)

            # Retain the first record offset for a subsequent index entry.

            if section_offset is None:
                section_offset = pos_offset
                section_docnum = docnum

            frequency += len(positions)
            count += 1

            # Only every {interval} entries is an index entry written.

            if count % interval != 0:
                continue

            entry_offset = index_writer.write_positions(section_docnum, section_offset, interval)

            # Remember the first index entry offset.

            if index_offset is None:
                index_offset = entry_offset

            section_offset = None
            section_docnum = None

            # Reset the position writer so that position readers accessing
            # a section start with the correct document number.

            position_writer.reset()

        # Finish writing an index entry for the remaining documents.

        if section_offset is not None:
            entry_offset = index_writer.write_positions(section_docnum, section_offset, count % interval)

            # Remember the first index entry offset.

            if index_offset is None:
                index_offset = entry_offset

        return index_offset, frequency, count

    def close(self):

        "Close both of the underlying writers."

        self.position_writer.close()
        self.position_index_writer.close()
paul@19	607
class PositionDictionaryReader:

    "Reading position dictionaries."

    def __init__(self, position_opener, position_index_opener):

        """
        Initialise the reader with a 'position_opener' and a
        'position_index_opener' providing access to the respective files.
        """

        self.position_opener = position_opener
        self.position_index_opener = position_index_opener

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator for dictionary entries starting at 'offset' with the
        given 'doc_frequency'.
        """

        iterator = PositionDictionaryIterator(
            self.position_opener, self.position_index_opener,
            offset, doc_frequency
            )
        return iterator

    def close(self):

        "Do nothing: the reader itself holds no open files."

        pass
paul@19	628
class PositionDictionaryIterator:

    "Iteration over position dictionary entries."

    def __init__(self, position_opener, position_index_opener, offset, doc_frequency):

        """
        Initialise the iterator using the 'position_opener' to obtain position
        file iterators and the 'position_index_opener' to obtain an index
        iterator starting at 'offset' and covering 'doc_frequency' documents.
        """

        self.position_opener = position_opener
        self.doc_frequency = doc_frequency
        self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
        self.iterator = None

        # Remember the last values.

        self.found_docnum = None
        self.found_positions = None

        # Maintain state for the next index entry, if read.

        self.next_docnum = None
        self.next_pos_offset = None
        self.next_section_count = None

        # Initialise the current index entry and current position file iterator.

        self._next_section()
        self._init_section()

    # Sequence methods.

    def __len__(self):
        return self.doc_frequency

    def sort(self):

        "Do nothing, permitting the iterator to be used where a list is sorted."

        pass

    # Iterator methods.

    def __iter__(self):
        return self

    def next(self):

        """
        Attempt to get the next document record from the section in the
        positions file.
        """

        # Return any visited but unrequested record.

        if self.found_docnum is not None:
            pending = self.found_docnum, self.found_positions
            self.found_docnum = None
            self.found_positions = None
            return pending

        # Or search for the next record.

        while 1:

            # Either return the next record.

            try:
                return self.iterator.next()

            # Or, where a section is finished, get the next section and try
            # again. Where no section remains, the index iterator's
            # StopIteration ends this iteration as well.

            except StopIteration:

                # Where a section follows, update the index iterator, but keep
                # reading using the same file iterator (since the data should
                # just follow on from the last section).

                self._next_section()
                self.iterator.replenish(self.section_count)

                # Reset the state of the iterator to make sure that document
                # numbers are correct.

                self.iterator.reset()

    def from_document(self, docnum):

        """
        Attempt to navigate to a positions entry for the given 'docnum',
        returning the positions for 'docnum', or None otherwise.
        """

        # Return any unrequested document positions.

        if docnum == self.found_docnum:
            return self.found_positions

        # Read ahead in the index until the next entry refers to a document
        # later than the desired document.

        try:
            if self.next_docnum is None:
                self._read_ahead()

            # Read until the next entry is after the desired document number,
            # or until the end of the results.

            while self.next_docnum <= docnum:
                self._next_read_section()
                if self.docnum >= docnum:
                    break
                self._read_ahead()

        except StopIteration:
            pass

        # Navigate in the position file to the document.

        self._init_section()

        try:
            while 1:
                candidate_docnum, candidate_positions = self.iterator.next()

                # Return the desired document positions or None (retaining the
                # positions for the document immediately after).

                if docnum == candidate_docnum:
                    return candidate_positions

                if docnum < candidate_docnum:
                    self.found_docnum = candidate_docnum
                    self.found_positions = candidate_positions
                    return None

        except StopIteration:
            return None

    # Internal methods.

    def _read_ahead(self):

        "Read the next index entry without making it the current one."

        self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()

    def _next_section(self):

        "Attempt to get the next section in the index."

        if self.next_docnum is not None:
            self._next_read_section()
        else:
            self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()

    def _next_read_section(self):

        """
        Make the next index entry the current one without reading from the
        index.
        """

        self.docnum = self.next_docnum
        self.pos_offset = self.next_pos_offset
        self.section_count = self.next_section_count

        self.next_docnum = None
        self.next_pos_offset = None
        self.next_section_count = None

    def _init_section(self):

        "Initialise the iterator for the section in the position file."

        # Release any iterator in use for a previous section.

        if self.iterator is not None:
            self.iterator.close()

        self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)

    def close(self):

        "Close the position file iterator and the index iterator."

        if self.iterator is not None:
            self.iterator.close()
            self.iterator = None

        if self.index_iterator is not None:
            self.index_iterator.close()
            self.index_iterator = None
paul@0	792
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Forget the last term and position file offset written."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. Return the offset after the term information was
        written to the file.
        """

        # Only the part of the term differing from the last term is written,
        # preceded by the length of the part in common.

        shared = len(commonprefix([self.last_term, term]))
        remainder = term[shared:]

        self.write_number(shared)
        self.write_string(remainder)

        # The offset is written as a delta from the last offset, followed by
        # the frequency and the document frequency.

        self.write_number(offset - self.last_offset)
        self.write_number(frequency)
        self.write_number(doc_frequency)

        self.last_term, self.last_offset = term, offset

        return self.tell()
paul@3	834
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Forget the last term and position file offset read."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency and its document
        frequency from the term information file.
        """

        # The term is stored as the length of the prefix shared with the last
        # term together with the differing suffix.

        shared = self.read_number()
        remainder = self.read_string()
        self.last_term = self.last_term[:shared] + remainder

        # The offset is stored as a delta from the last offset.

        self.last_offset += self.read_number()

        # The frequency and document frequency follow.

        frequency = self.read_number()
        doc_frequency = self.read_number()

        return self.last_term, self.last_offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
        permits the scanning for later terms from the specified term.
        """

        self.seek(info_offset)

        # Subsequent entries are stored relative to this term and offset.

        self.last_term, self.last_offset = term, offset
paul@3	881
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Forget the last term details and term information file offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' to the term dictionary index file, along with the
        'info_offset' in the term information file.
        """

        TermWriter.write_term(self, term, offset, frequency, doc_frequency)

        # The information file offset is written as a delta from the last one.

        info_offset_delta = info_offset - self.last_info_offset
        self.write_number(info_offset_delta)
        self.last_info_offset = info_offset
paul@3	904
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Forget the last term details and term information file offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency, its document
        frequency and a term information file offset from the term dictionary
        index file.
        """

        term, offset, frequency, doc_frequency = TermReader.read_term(self)

        # The information file offset is stored as a delta from the last one.

        info_offset = self.last_info_offset + self.read_number()
        self.last_info_offset = info_offset

        return term, offset, frequency, doc_frequency, info_offset
paul@3	928
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Initialise the writer with an 'info_writer' for term information, an
        'index_writer' for the term dictionary index, a 'position_dict_writer'
        for positions, and the 'interval' (number of terms) at which index
        entries are written.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval

        # The number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. Return the offset after the term information was
        written to the file.

        The first term and every term at the configured interval thereafter
        is also written to the index, along with the returned offset.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)

        if self.entry % self.interval == 0:
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

        # Provide the offset as promised by the documentation.

        return info_offset

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        "Close all of the underlying writers."

        self.info_writer.close()
        self.index_writer.close()
        self.position_dict_writer.close()
paul@3	970
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader, position_dict_reader):

        """
        Initialise the reader with the 'info_reader' providing every term, the
        'index_reader' providing the sampled term index, and the
        'position_dict_reader' providing position data. The whole term index
        is read into memory immediately.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_dict_reader = position_dict_reader

        # Load the index entries; the reader signals the end of its data by
        # raising EOFError.

        self.terms = []
        try:
            while 1:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # Large numbers for ordering purposes: one more than the position file
        # offset of the last index entry.

        if self.terms:
            self.max_offset = self.terms[-1][1] + 1
        else:
            self.max_offset = None

    def _find_closest_entry(self, term):

        """
        Find the offsets and frequencies of 'term' from the term dictionary or
        the closest term starting with the value of 'term'.

        Return the closest index entry consisting of a term, the position file
        offset, the term frequency, the document frequency, and the term details
        file offset. Return None where the index holds no entries at all.
        """

        # An empty index offers nothing to return (and indexing it would fail).

        if not self.terms:
            return None

        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1

        # Get the entry position providing the term or one preceding it.
        # If no entry precedes the requested term, return the very first entry
        # as the closest.

        if i == -1:
            return self.terms[0]
        else:
            return self.terms[i]

    def _find_closest_term(self, term):

        """
        Find the offsets and frequencies of 'term' from the term dictionary or
        the closest term starting with the value of 'term'.

        Return the closest term (or the term itself), the position file offset,
        the term frequency, the document frequency, and the term details file
        offset (or None if the reader is already positioned). Return None
        instead of such a tuple where the index holds no entries.
        """

        entry = self._find_closest_entry(term)
        if entry is None:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = entry

        # Where the term is found immediately, return the offset and
        # frequencies. If the term does not appear, return the details of the
        # closest entry.

        if term <= found_term:
            return found_term, offset, frequency, doc_frequency, info_offset

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term. Reaching the end of the file leaves
        # the details of the last term read in place.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
        except EOFError:
            pass

        return found_term, offset, frequency, doc_frequency, None

    def _find_term(self, term):

        """
        Find the position file offset and frequency of 'term' from the term
        dictionary, returning a tuple of the offset, frequency and document
        frequency, or None if the term is not present.
        """

        closest = self._find_closest_term(term)
        if closest is None:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = closest

        # If the term is found, return the offset and frequencies.

        if term == found_term:
            return offset, frequency, doc_frequency
        else:
            return None

    def _get_positions(self, offset, doc_frequency):

        """
        Return the positions found at the given position file 'offset' for a
        term appearing in 'doc_frequency' documents.
        """

        return self.position_dict_reader.read_term_positions(offset, doc_frequency)

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind the reader and return it as its own iterator."

        self.rewind()
        return self

    def next(self):

        "Return the next term record, ending iteration at the end of the data."

        try:
            return self.read_term()
        except EOFError:
            raise StopIteration

    # Sequential access methods.

    def rewind(self):

        "Rewind the term information reader."

        self.info_reader.rewind()

    def read_term(self):

        """
        Return the next term, its frequency, its document frequency, and the
        documents and positions at which the term is found.
        """

        term, offset, frequency, doc_frequency = self.info_reader.read_term()
        positions = self._get_positions(offset, doc_frequency)
        return term, frequency, doc_frequency, positions

    # Query methods.

    def find_terms(self, term):

        "Return all terms whose values start with the value of 'term'."

        terms = []

        closest = self._find_closest_term(term)

        # An empty dictionary cannot provide any terms.

        if closest is None:
            return terms

        found_term, offset, frequency, doc_frequency, info_offset = closest

        # Position the reader, if necessary.

        if info_offset is not None:
            self.info_reader.go_to_term(found_term, offset, info_offset)

        # Read and record terms.

        try:
            # Add the found term if it starts with the specified term.

            while found_term.startswith(term):
                terms.append(found_term)
                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()

        except EOFError:
            pass

        return terms

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or None if the term is not present.
        """

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return self._get_positions(offset, doc_frequency)

    def get_frequency(self, term):

        "Return the frequency of the given 'term', or None if it is not present."

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return frequency

    def get_document_frequency(self, term):

        """
        Return the document frequency of the given 'term', or None if it is not
        present.
        """

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return doc_frequency

    def close(self):

        "Close the term information, term index and position readers."

        self.info_reader.close()
        self.index_reader.close()
        self.position_dict_reader.close()
paul@3	1162
paul@9	1163	# Specific classes for storing document information.
paul@9	1164
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Forget the last document number so that deltas start again from zero."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Store the 'fields' of the document numbered 'docnum', where 'fields' is
        a list of (integer identifier, string value) pairs. The record holds
        the document number as a delta from the previously written document,
        the number of fields, and then each identifier and value.

        Return the offset at which the record starts.
        """

        # Note where the record begins before anything is written.

        start = self.tell()

        # Document number delta, then the field count.

        docnum_delta = docnum - self.last_docnum
        self.write_number(docnum_delta)
        self.write_number(len(fields))

        # Each field: identifier followed by the value.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
        return start
paul@8	1198
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Forget the last document number so that deltas start again from zero."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read the next record from the file, returning a tuple of the document
        number and a list of (identifier, value) pairs for its fields.
        """

        # The stored number is a delta from the previous document number.

        self.last_docnum += self.read_number()

        # The field count precedes the fields themselves.

        count = self.read_number()

        fields = []
        for _ in range(count):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read the fields stored at the given 'offset', treating them as those of
        the document numbered 'docnum'. Afterwards, sequential reading with
        'read_fields' continues with the documents following 'docnum'.
        """

        self.seek(offset)

        # The number computed while reading is based on a stale delta, so only
        # the fields are kept and the known document number is reinstated.

        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
paul@12	1246
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Forget the last document number and offset used for delta encoding."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Record that the fields of the document numbered 'docnum' are stored at
        the given 'offset' in the fields file. Both values are written as
        differences from the previously written entry.
        """

        docnum_delta = docnum - self.last_docnum
        offset_delta = offset - self.last_offset

        self.write_number(docnum_delta)
        self.write_number(offset_delta)

        # Remember this entry as the basis for the next deltas.

        self.last_docnum, self.last_offset = docnum, offset
paul@9	1269
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Forget the last document number and offset used for delta decoding."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read the next index entry, returning a tuple of the document number and
        the offset of its fields in the fields file.
        """

        # Both stored numbers are deltas from the previous entry; the document
        # number comes first.

        docnum_delta = self.read_number()
        self.last_docnum += docnum_delta

        offset_delta = self.read_number()
        self.last_offset += offset_delta

        return self.last_docnum, self.last_offset
paul@9	1288
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the 'field_writer' receiving every document,
        the 'field_index_writer' receiving a sample of the documents, and the
        'interval' selecting which entries are sampled.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # Number of documents written so far.

        self.entry = 0

    def write_fields(self, docnum, fields):

        """
        Write the 'fields' of the document numbered 'docnum', adding an index
        entry for it where the entry number is a multiple of the interval.
        """

        stored_at = self.field_writer.write_fields(docnum, fields)

        position_in_interval = self.entry % self.interval
        if position_in_interval == 0:
            self.field_index_writer.write_document(docnum, stored_at)

        self.entry += 1

    def close(self):

        "Close the field writer and then the field index writer."

        self.field_writer.close()
        self.field_index_writer.close()
paul@9	1313
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the 'field_reader' providing every document
        and the 'field_index_reader' providing the sampled document index,
        which is read into memory immediately.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Collect index entries until the index reader reports the end of its
        # data by raising EOFError.

        self.docs = entries = []
        read_entry = self.field_index_reader.read_document
        try:
            while True:
                entries.append(read_entry())
        except EOFError:
            pass

        # A value used for ordering purposes: the offset of the last entry.

        self.max_offset = self.docs[-1][1] if self.docs else None

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind the reader and return it as its own iterator."

        self.rewind()
        return self

    def next(self):

        "Return the next document record, ending iteration at the end of the data."

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Sequential access methods.

    def rewind(self):

        "Rewind the field reader."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Return the fields of the document with the given 'docnum', or None if
        no such document is stored.
        """

        # Locate the last index entry not exceeding the requested document.

        index = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # No entry at or before the document: it cannot be present.

        if index < 0:
            return None

        indexed_docnum, offset = self.docs[index]

        # Start reading at the indexed document.

        found_docnum, fields = self.field_reader.read_document_fields(indexed_docnum, offset)

        # Step through later documents until the requested one is reached or
        # passed, or the data runs out.

        try:
            while found_docnum < docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        if found_docnum != docnum:
            return None

        return fields

    def close(self):

        "Close the field reader and then the field index reader."

        self.field_reader.close()
        self.field_index_reader.close()
paul@8	1396
paul@12	1397	# Dictionary merging classes.
paul@12	1398
class Merger:

    "Merge files."

    def __init__(self, writer, readers):

        """
        Initialise the merger with the 'writer' receiving the merged data and
        the 'readers' providing the data to be merged.
        """

        self.readers = readers
        self.writer = writer

    def close(self):

        "Close every reader and then the writer."

        for source in self.readers:
            source.close()

        self.writer.close()
paul@13	1411
class TermDictionaryMerger(Merger):

    "Merge term and position files."

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the writer.
        Consecutive records for the same term are combined so that each term is
        written once with the merged positions of all its records.
        """

        pending_term = None
        pending_positions = []

        for term, frequency, doc_frequency, positions in itermerge(self.readers):

            # A different term ends the current group: write out what has been
            # gathered (nothing on the very first record) and start afresh.

            if term != pending_term:
                if pending_positions:
                    self.writer.write_term_positions(pending_term, itermerge(pending_positions))
                pending_term = term
                pending_positions = [positions]

            # The same term again: add its positions to the group.

            else:
                pending_positions.append(positions)

        # Write the final group once the readers are exhausted.

        if pending_positions:
            self.writer.write_term_positions(pending_term, itermerge(pending_positions))
paul@12	1436
class FieldDictionaryMerger(Merger):

    "Merge field files."

    def merge(self):

        """
        Merge fields from the readers, sending them to the writer. Each record
        produced by merging the readers is written as it is obtained.
        """

        write_fields = self.writer.write_fields

        for record in itermerge(self.readers):
            docnum, fields = record
            write_fields(docnum, fields)
paul@13	1449
paul@13	1450	# Utility functions.
paul@13	1451
def get_term_writer(pathname, partition, interval, doc_interval):

    """
    Return a term dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval' for terms and 'doc_interval' for document position records.

    Four files are opened for binary writing: the terms, the terms index, the
    positions and the positions index.
    """

    # Term information and its index.

    info_writer = TermWriter(open(join(pathname, "terms-%s" % partition), "wb"))
    index_writer = TermIndexWriter(open(join(pathname, "terms_index-%s" % partition), "wb"))

    # Positions and their index, combined into a position dictionary.

    positions_writer = PositionWriter(open(join(pathname, "positions-%s" % partition), "wb"))
    positions_index_writer = PositionIndexWriter(open(join(pathname, "positions_index-%s" % partition), "wb"))

    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)

    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
paul@13	1475
def get_field_writer(pathname, partition, interval):

    """
    Return a field dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.

    Two files are opened for binary writing: the fields and the fields index.
    """

    field_writer = FieldWriter(open(join(pathname, "fields-%s" % partition), "wb"))
    field_index_writer = FieldIndexWriter(open(join(pathname, "fields_index-%s" % partition), "wb"))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
paul@13	1491
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.

    The terms and terms index files are opened for binary reading here; the
    positions and positions index files are only identified by pathname and
    handed to opener objects.
    """

    # Term information and its index.

    info_reader = TermReader(open(join(pathname, "terms-%s" % partition), "rb"))
    index_reader = TermIndexReader(open(join(pathname, "terms_index-%s" % partition), "rb"))

    # Openers for the positions and their index.

    positions_opener = PositionOpener(join(pathname, "positions-%s" % partition))
    positions_index_opener = PositionIndexOpener(join(pathname, "positions_index-%s" % partition))

    positions_dict_reader = PositionDictionaryReader(positions_opener, positions_index_opener)

    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
paul@14	1511
def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.

    Two files are opened for binary reading: the fields and the fields index.
    """

    field_reader = FieldReader(open(join(pathname, "fields-%s" % partition), "rb"))
    field_index_reader = FieldIndexReader(open(join(pathname, "fields_index-%s" % partition), "rb"))

    return FieldDictionaryReader(field_reader, field_index_reader)
paul@14	1526
def rename_files(pathname, names, from_partition, to_partition):

    """
    Rename, for each of the given 'names', the file under 'pathname' labelled
    with 'from_partition' so that it is labelled with 'to_partition' instead.
    """

    for name in names:
        source = join(pathname, "%s-%s" % (name, from_partition))
        target = join(pathname, "%s-%s" % (name, to_partition))
        rename(source, target)
paul@14	1530
def rename_term_files(pathname, from_partition, to_partition):

    """
    Rename the files named in TERM_FILENAMES under 'pathname' from the
    'from_partition' label to the 'to_partition' label.
    """

    names = TERM_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
paul@14	1533
def rename_field_files(pathname, from_partition, to_partition):

    """
    Rename the files named in FIELD_FILENAMES under 'pathname' from the
    'from_partition' label to the 'to_partition' label.
    """

    names = FIELD_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
paul@14	1536
def remove_files(pathname, names, partition):

    """
    Remove, for each of the given 'names', the file under 'pathname' labelled
    with the given 'partition'.
    """

    for name in names:
        target = join(pathname, "%s-%s" % (name, partition))
        remove(target)
paul@14	1540
def remove_term_files(pathname, partition):

    """
    Remove the files named in TERM_FILENAMES under 'pathname' that are
    labelled with the given 'partition'.
    """

    names = TERM_FILENAMES
    remove_files(pathname, names, partition)
paul@14	1543
def remove_field_files(pathname, partition):

    """
    Remove the files named in FIELD_FILENAMES under 'pathname' that are
    labelled with the given 'partition'.
    """

    names = FIELD_FILENAMES
    remove_files(pathname, names, partition)
paul@14	1546
paul@8	1547	# High-level classes.
paul@8	1548
class Document:

    "A container of document information."

    def __init__(self, docnum):

        "Initialise an empty document having the given 'docnum'."

        self.docnum = docnum

        # Mapping from terms to lists of positions.

        self.terms = {}

        # List of (identifier, value) pairs.

        self.fields = []

    def add_position(self, term, position):

        """
        Record that the given 'term' occurs at the given 'position', appending
        to any positions already recorded for the term.
        """

        try:
            positions = self.terms[term]
        except KeyError:
            positions = self.terms[term] = []

        positions.append(position)

    def add_field(self, identifier, value):

        """
        Add a field having the given 'identifier' and 'value', the value being
        stored in its Unicode string form.
        """

        text = unicode(value) # convert to string
        self.fields.append((identifier, text))

    def set_fields(self, fields):

        """
        Replace the document's fields with the given 'fields': a list of tuples
        each containing an integer identifier and a string value. The list is
        stored as given, without copying or conversion.
        """

        self.fields = fields
paul@28	1581
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Initialise the writer for the index directory 'pathname', using the
        given indexing 'interval' and 'doc_interval' for the dictionaries, and
        flushing accumulated data to a new partition after every
        'flush_interval' documents (a false value disables such flushing).
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # Labels of the next partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Accumulated data: terms map to {docnum: positions} mappings and
        # document numbers map to field lists.

        self.terms = {}
        self.docs = {}

        # Documents added since the last automatic flush.

        self.doc_counter = 0

    def add_document(self, doc):

        """
        Add the given document 'doc', updating the document counter and flushing
        terms and fields if appropriate.
        """

        for term, positions in doc.terms.items():
            self.terms.setdefault(term, {})[doc.docnum] = positions

        self.docs[doc.docnum] = doc.fields

        self.doc_counter += 1
        if self.flush_interval and self.doc_counter >= self.flush_interval:
            self.flush_terms()
            self.flush_fields()
            self.doc_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        # Get the terms in order. Using sorted avoids relying on keys
        # returning a sortable list.

        all_terms = self.terms
        terms = sorted(all_terms)

        dict_writer = self.get_term_writer()

        for term in terms:
            doc_positions = list(all_terms[term].items())
            dict_writer.write_term_positions(term, doc_positions)

        dict_writer.close()

        # Start accumulating afresh for the next partition.

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        # Get the documents in order. Document numbers are unique dictionary
        # keys, so the field lists never take part in the comparison.

        docs = sorted(self.docs.items())

        field_dict_writer = self.get_field_writer()

        for docnum, fields in docs:
            field_dict_writer.write_fields(docnum, fields)

        field_dict_writer.close()

        # Start accumulating afresh for the next partition.

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any terms and fields not yet written to a partition."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
paul@10	1677
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        """
        Open the term and field dictionaries of the "merged" partition found
        under the given 'pathname'.
        """

        partition = "merged"
        self.dict_reader = get_term_reader(pathname, partition)
        self.field_dict_reader = get_field_reader(pathname, partition)

    # Term dictionary queries.

    def find_terms(self, term):

        "Return the result of asking the term dictionary for terms matching 'term'."

        terms = self.dict_reader
        return terms.find_terms(term)

    def find_positions(self, term):

        "Return the result of asking the term dictionary for the positions of 'term'."

        terms = self.dict_reader
        return terms.find_positions(term)

    def get_frequency(self, term):

        "Return the result of asking the term dictionary for the frequency of 'term'."

        terms = self.dict_reader
        return terms.get_frequency(term)

    def get_document_frequency(self, term):

        """
        Return the result of asking the term dictionary for the document
        frequency of 'term'.
        """

        terms = self.dict_reader
        return terms.get_document_frequency(term)

    # Field dictionary queries.

    def get_fields(self, docnum):

        "Return the result of asking the field dictionary for the fields of 'docnum'."

        fields = self.field_dict_reader
        return fields.get_fields(docnum)

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        self.dict_reader.close()
        self.field_dict_reader.close()
paul@10	1704
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index residing in the directory with the given 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'. The index directory is created if
        it does not already exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, merging the index partitions first.
        The 'partition' argument is accepted but the reader always uses the
        merged partition. Raise OSError if the index directory does not exist.
        """

        # Check before merging, since merging lists the directory and would
        # otherwise fail first with a less helpful error.

        self._check_pathname()

        # Ensure that only one partition exists.

        self.merge()
        return self._get_reader(partition)

    def _check_pathname(self):

        "Raise OSError if the index directory does not exist."

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

    def _get_reader(self, partition):

        """
        Return a reader for the index without merging anything. Raise OSError
        if the index directory does not exist.
        """

        self._check_pathname()

        self.reader = IndexReader(self.pathname)
        return self.reader

    def merge(self):

        "Merge/optimise index partitions."

        self.merge_terms()
        self.merge_fields()

    def _merge_partitions(self, prefix, open_reader, open_writer, merger_class, rename_partition, remove_partition):

        """
        Merge the partitions whose principal files have names starting with
        'prefix' into a single "merged" partition.

        The 'open_reader' callable is invoked with the index pathname and a
        partition label; 'open_writer' is invoked without arguments to obtain
        the writer for the "merged" partition; 'merger_class' is instantiated
        with the writer and readers; 'rename_partition' and 'remove_partition'
        rename and remove the files of a partition.
        """

        # Discover the partitions without opening anything.

        partitions = [filename[len(prefix):] for filename in listdir(self.pathname)
            if filename.startswith(prefix)]

        if len(partitions) > 1:

            # Move any existing merged partition out of the way before readers
            # are created: readers may refer to files by pathname (position
            # data is handed to openers as pathnames), and the "merged" names
            # are about to be reused for the output.

            if "merged" in partitions:
                rename_partition(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                if "old-merged" not in partitions:
                    partitions.append("old-merged")

            readers = [open_reader(self.pathname, partition) for partition in partitions]

            # Write directly to a dictionary.

            merger = merger_class(open_writer(), readers)
            merger.merge()
            merger.close()

            # Remove old files.

            for partition in partitions:
                remove_partition(self.pathname, partition)

        # A lone partition only needs to carry the "merged" label; no files
        # need to be opened for that.

        elif len(partitions) == 1:
            partition = partitions[0]
            if partition != "merged":
                rename_partition(self.pathname, partition, "merged")

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval'.
        """

        def open_writer():
            return get_term_writer(self.pathname, "merged", interval, doc_interval)

        self._merge_partitions("terms-", get_term_reader, open_writer,
            TermDictionaryMerger, rename_term_files, remove_term_files)

    def merge_fields(self, interval=FIELD_INTERVAL):

        "Merge field dictionaries using the given indexing 'interval'."

        def open_writer():
            return get_field_writer(self.pathname, "merged", interval)

        self._merge_partitions("fields-", get_field_reader, open_writer,
            FieldDictionaryMerger, rename_field_files, remove_field_files)

    def close(self):

        "Close and discard any reader and writer obtained from this index."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	1835
paul@0	1836	# vim: tabstop=4 expandtab shiftwidth=4