iixr (annotate iixr.py in dec7d2ac1cc9)

iixr

Annotated iixr.py

29:dec7d2ac1cc9

2009-09-06

Paul Boddie

Made separate methods for sorted and unsorted position data.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@19	21	from os import dup, fdopen # independent iterator access to files
paul@12	22	from os import listdir, mkdir # index and partition discovery
paul@14	23	from os import remove, rename # partition manipulation
paul@7	24	from os.path import exists, join
paul@2	25	from os.path import commonprefix # to find common string prefixes
paul@3	26	from bisect import bisect_right # to find terms in the dictionary index
paul@12	27	from bisect import insort_right # to maintain a sorted list of data for merging
paul@10	28	import bz2, zlib # for field compression
paul@2	29
# Provide a 'set' name on interpreters that lack the built-in type.

try:
    set
except NameError:
    from sets import Set as set

# Constants.

# Default intervals (presumably the spacing of index entries and the flush
# threshold - confirm against the code that uses them).

TERM_INTERVAL = 100
DOCUMENT_INTERVAL = 100
FIELD_INTERVAL = 100
FLUSH_INTERVAL = 1000000

# Names of the files making up the term and field data.

TERM_FILENAMES = ("terms", "terms_index", "positions", "positions_index")
FIELD_FILENAMES = ("fields", "fields_index")

# Compression functions paired with the one-character flag recorded alongside
# compressed data, together with the matching decompression functions. The
# compressors are tried in the order given here.

compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress),
    ]

decompressors = {
    "b" : bz2.decompress,
    "z" : zlib.decompress,
    }
paul@10	47
paul@0	48	# Foundation classes.
paul@0	49
class File:

    "A thin wrapper around a file object 'f', with resettable per-record state."

    def __init__(self, f):

        "Wrap the file object 'f' and establish the initial state."

        self.f = f
        self.reset()

    def reset(self):

        """
        Reset the state of the reader or writer between records. This base
        implementation has no state and does nothing.
        """

        pass

    def rewind(self):

        "Seek to the start of the file and reset the state."

        self.f.seek(0)
        self.reset()

    def close(self):

        "Close the underlying file. Closing an already closed file is harmless."

        if self.f is None:
            return

        self.f.close()
        self.f = None
paul@0	72
class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding: seven bits
        are stored per byte, least significant group first, and the top bit of
        a byte is set when further bytes follow.

        Raise ValueError if 'number' is negative.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Zero is stored as a single zero byte.

        if number == 0:
            self.f.write(chr(0))
            return

        # Emit the groups from least to most significant.

        chunks = []
        remaining = number

        while remaining:
            low_bits = remaining & 127
            remaining >>= 7

            # Flag the byte if more groups are still to come.

            if remaining:
                low_bits |= 128

            chunks.append(chr(low_bits))

        self.f.write("".join(chunks))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file, preceded by its length. Unicode objects are
        encoded as UTF-8 first.

        If 'compress' is set to a true value, a one-character flag is written
        before the length: the flag of the first compressor producing a result
        shorter than the original, or "-" if none does (in which case the data
        is stored uncompressed).
        """

        # Convert Unicode objects to strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        # Compress the string if requested, recording what was done.

        if compress:
            flag = "-"

            for candidate_flag, compress_fn in compressors:
                packed = compress_fn(s)

                # Take the first result shorter than the original.

                if len(packed) < len(s):
                    flag, s = candidate_flag, packed
                    break

            self.f.write(flag)

        # Write the length of the data before the data itself.

        self.write_number(len(s))
        self.f.write(s)
paul@2	141
class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a variable length number from the file and return it. Each byte
        contributes seven bits, least significant group first; a set top bit
        indicates that another byte follows.

        Raise EOFError if the file ends before the number is complete.
        """

        value = 0
        shift = 0

        while 1:
            byte = self.f.read(1)
            if not byte:
                raise EOFError

            bits = ord(byte)
            value += (bits & 127) << shift
            shift += 7

            # A clear top bit marks the final byte of the number.

            if not bits & 128:
                return value

    def read_string(self, decompress=0):

        """
        Read a length-prefixed string from the file and return it as a Unicode
        object decoded from UTF-8.

        If 'decompress' is set to a true value, a one-character flag is read
        before the length and, unless it is "-", the stored data is
        decompressed using the function registered for that flag.
        """

        # Only data written with compression enabled carries a flag.

        flag = "-"
        if decompress:
            flag = self.f.read(1)

        length = self.read_number()
        data = self.f.read(length)

        # Perform decompression if applicable.

        if flag != "-":
            data = decompressors[flag](data)

        # Convert strings to Unicode objects.

        return unicode(data, "utf-8")
paul@2	196
paul@9	197	# Specific classes for storing term and position information.
paul@0	198
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number so that the next delta starts from zero."

        self.last_docnum = 0

    def write_sorted_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions', which must
        already be in ascending order. The document number and the positions
        are each stored as differences from their predecessors.

        Return the offset of the written record. Raise ValueError if 'docnum'
        is less than the previously written document number.
        """

        previous_docnum = self.last_docnum

        if docnum < previous_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, previous_docnum))

        # Note where this record starts.

        record_start = self.f.tell()

        # Write the document number delta followed by the number of positions.

        self.write_number(docnum - previous_docnum)
        self.write_number(len(positions))

        # Write each position as a delta from the one before it.

        previous = 0
        for current in positions:
            self.write_number(current - previous)
            previous = current

        self.last_docnum = docnum

        return record_start

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions', sorting the
        supplied 'positions' list in place beforehand.

        Return the offset of the written record.
        """

        positions.sort()
        return self.write_sorted_positions(docnum, positions)
paul@29	251
class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Forget the last document number so that the next delta starts from zero."

        self.last_docnum = 0

    def read_positions(self):

        """
        Read one record, returning a tuple of the document number and the list
        of absolute positions reconstructed from the stored deltas.
        """

        # The document number is stored as a delta from the previous one.

        self.last_docnum += self.read_number()

        # The number of positions precedes the position deltas.

        remaining = self.read_number()

        positions = []
        position = 0

        while remaining > 0:
            position += self.read_number()
            positions.append(position)
            remaining -= 1

        return self.last_docnum, positions

    def read_term_positions(self, offset, count):

        """
        Return a PositionIterator reading from 'offset' in the file, limited to
        'count' documents.
        """

        # Give the iterator its own file object on a duplicated descriptor.
        # NOTE(review): descriptors made by dup() share the underlying file
        # offset with the original - confirm that iterators over the same file
        # are never advanced in an interleaved fashion.

        duplicate = fdopen(dup(self.f.fileno()), "rb")
        duplicate.seek(offset)
        return PositionIterator(duplicate, count)
paul@19	298
class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        "Forget the last document number and position file offset."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum', 'pos_offset' and document 'count' to the
        position index file. The document number and position file offset are
        stored as differences from the previously written values.

        Return the offset of the written index entry.
        """

        entry_start = self.f.tell()

        # Document number delta.

        self.write_number(docnum - self.last_docnum)
        self.last_docnum = docnum

        # Position file offset delta.

        self.write_number(pos_offset - self.last_pos_offset)
        self.last_pos_offset = pos_offset

        # Document count.

        self.write_number(count)

        return entry_start
paul@19	333
class PositionIndexReader(FileReader):

    "Reading position index information from files."

    def reset(self):

        "Forget the last document number and position file offset."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read one index entry, returning a tuple of the document number, the
        position file offset, and the number of documents recorded for the
        entry. The first two values are reconstructed from stored deltas.
        """

        self.last_docnum += self.read_number()
        self.last_pos_offset += self.read_number()
        section_count = self.read_number()

        return self.last_docnum, self.last_pos_offset, section_count

    def read_term_positions(self, offset, doc_frequency):

        """
        Return a PositionIndexIterator reading from 'offset' in the file,
        limited to 'doc_frequency' documents.
        """

        # Give the iterator its own file object on a duplicated descriptor.

        duplicate = fdopen(dup(self.f.fileno()), "rb")
        duplicate.seek(offset)
        return PositionIndexIterator(duplicate, doc_frequency)
paul@19	376
paul@21	377	# Iterators for position-related files.
paul@21	378
paul@21	379	class IteratorBase:
paul@21	380
paul@21	381	def __init__(self, count):
paul@21	382	self.replenish(count)
paul@21	383
paul@21	384	def replenish(self, count):
paul@21	385	self.count = count
paul@21	386	self.read_documents = 0
paul@21	387
paul@21	388	def __len__(self):
paul@21	389	return self.count
paul@21	390
paul@21	391	def sort(self):
paul@21	392	pass # Stored document positions are already sorted.
paul@21	393
paul@21	394	def __iter__(self):
paul@21	395	return self
paul@21	396
class PositionIterator(PositionReader, IteratorBase):

    "Iterating over document positions."

    def __init__(self, f, count):

        "Read from the file object 'f', yielding at most 'count' documents."

        PositionReader.__init__(self, f)
        IteratorBase.__init__(self, count)

    def next(self):

        """
        Return the document number and positions for the next document, or
        raise StopIteration once the available documents have been read.
        """

        if self.read_documents >= self.count:
            raise StopIteration

        self.read_documents += 1
        return self.read_positions()
paul@21	414
class PositionIndexIterator(PositionIndexReader, IteratorBase):

    "Iterating over position index entries."

    def __init__(self, f, count):

        "Read from the file object 'f', covering at most 'count' documents."

        PositionIndexReader.__init__(self, f)
        IteratorBase.__init__(self, count)

        # Number of documents covered by the most recently returned entry.

        self.section_count = 0

    def next(self):

        """
        Return the next index entry as a tuple of document number, position
        file offset and document count, or raise StopIteration once the entries
        returned so far account for all of the available documents.
        """

        # Account for the documents covered by the previous entry.

        self.read_documents += self.section_count

        if self.read_documents >= self.count:
            raise StopIteration

        entry = self.read_positions()
        docnum, pos_offset, self.section_count = entry
        return entry
paul@19	434
class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Use 'position_writer' for position records and 'position_index_writer'
        for index entries, adding an index entry for every 'interval'
        documents written.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_sorted_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file, in the order given.

        An index entry is written for each group of 'interval' documents, plus
        one for any documents left over at the end.

        Return a tuple containing the offset of the first index entry written
        (None if nothing was written), the frequency (number of positions), and
        the document frequency (number of documents) for the term involved.
        """

        position_writer = self.position_writer
        index_writer = self.position_index_writer

        # Start from a clean state in both files.

        position_writer.reset()
        index_writer.reset()

        index_offset = None
        frequency = 0

        # Details of the section currently being accumulated.

        section_docnum = None
        section_offset = None
        section_size = 0

        for docnum, positions in doc_positions:
            pos_offset = position_writer.write_positions(docnum, positions)

            # The first record of a section supplies the index entry details.

            if section_offset is None:
                section_offset = pos_offset
                section_docnum = docnum

            frequency += len(positions)
            section_size += 1

            if section_size != self.interval:
                continue

            # The section is full: write its index entry.

            entry_offset = index_writer.write_positions(section_docnum, section_offset, self.interval)

            if index_offset is None:
                index_offset = entry_offset

            section_offset = None
            section_docnum = None
            section_size = 0

            # Reset the position writer so that position readers accessing
            # a section start with the correct document number.

            position_writer.reset()

        # Write an index entry for any remaining documents.

        if section_offset is not None:
            entry_offset = index_writer.write_positions(section_docnum, section_offset, section_size)

            if index_offset is None:
                index_offset = entry_offset

        return index_offset, frequency, len(doc_positions)

    def write_term_positions(self, doc_positions):

        """
        Sort 'doc_positions' - a collection of tuples of the form (document
        number, position list) - in place and write it to the file.

        Return the same tuple as write_sorted_term_positions: the offset of the
        first index entry, the frequency, and the document frequency.
        """

        doc_positions.sort()
        return self.write_sorted_term_positions(doc_positions)

    def close(self):

        "Close both underlying writers."

        self.position_writer.close()
        self.position_index_writer.close()
paul@19	534
class PositionDictionaryReader:

    "Reading position dictionaries."

    def __init__(self, position_reader, position_index_reader):

        """
        Use 'position_reader' for position records and 'position_index_reader'
        for index entries.
        """

        self.position_reader = position_reader
        self.position_index_reader = position_index_reader

    def read_term_positions(self, offset, doc_frequency):

        """
        Return a PositionDictionaryIterator over the entries starting at
        'offset' in the index, covering 'doc_frequency' documents.
        """

        return PositionDictionaryIterator(
            self.position_reader, self.position_index_reader,
            offset, doc_frequency
            )

    def close(self):

        "Close both underlying readers."

        self.position_reader.close()
        self.position_index_reader.close()
paul@19	556
paul@19	557	class PositionDictionaryIterator:
paul@19	558
paul@19	559	"Iteration over position dictionary entries."
paul@19	560
paul@19	561	def __init__(self, position_reader, position_index_reader, offset, doc_frequency):
paul@19	562	self.position_reader = position_reader
paul@20	563	self.doc_frequency = doc_frequency
paul@21	564	self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
paul@19	565
paul@22	566	# Remember the last values.
paul@22	567
paul@22	568	self.found_docnum, self.found_positions = None, None
paul@22	569
paul@21	570	# Maintain state for the next index entry, if read.
paul@21	571
paul@21	572	self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
paul@21	573
paul@21	574	# Initialise the current index entry and current position file iterator.
paul@21	575
paul@21	576	self._next_section()
paul@21	577	self._init_section()
paul@0	578
paul@20	579	def __len__(self):
paul@20	580	return self.doc_frequency
paul@20	581
paul@20	582	def sort(self):
paul@20	583	pass
paul@20	584
paul@18	585	def __iter__(self):
paul@18	586	return self
paul@18	587
paul@18	588	def next(self):
paul@0	589
paul@21	590	"""
paul@21	591	Attempt to get the next document record from the section in the
paul@21	592	positions file.
paul@21	593	"""
paul@19	594
paul@22	595	# Return any visited but unrequested record.
paul@22	596
paul@22	597	if self.found_docnum is not None:
paul@22	598	t = self.found_docnum, self.found_positions
paul@22	599	self.found_docnum, self.found_positions = None, None
paul@22	600	return t
paul@22	601
paul@22	602	# Or search for the next record.
paul@22	603
paul@19	604	while 1:
paul@19	605
paul@19	606	# Either return the next record.
paul@19	607
paul@19	608	try:
paul@19	609	return self.iterator.next()
paul@0	610
paul@19	611	# Or, where a section is finished, get the next section and try again.
paul@19	612
paul@19	613	except StopIteration:
paul@20	614
paul@20	615	# Where a section follows, update the index iterator, but keep
paul@20	616	# reading using the same file iterator (since the data should
paul@20	617	# just follow on from the last section).
paul@20	618
paul@21	619	self._next_section()
paul@19	620	self.iterator.replenish(self.section_count)
paul@19	621
paul@22	622	# Reset the state of the iterator to make sure that document
paul@22	623	# numbers are correct.
paul@22	624
paul@22	625	self.iterator.reset()
paul@22	626
paul@22	627	def from_document(self, docnum):
paul@21	628
paul@21	629	"""
paul@21	630	Attempt to navigate to a positions entry for the given 'docnum',
paul@22	631	returning the positions for 'docnum', or None otherwise.
paul@21	632	"""
paul@21	633
paul@22	634	# Return any unrequested document positions.
paul@22	635
paul@22	636	if docnum == self.found_docnum:
paul@22	637	return self.found_positions
paul@22	638
paul@21	639	# Read ahead in the index until the next entry refers to a document
paul@21	640	# later than the desired document.
paul@21	641
paul@21	642	try:
paul@21	643	if self.next_docnum is None:
paul@21	644	self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
paul@21	645
paul@22	646	# Read until the next entry is after the desired document number,
paul@22	647	# or until the end of the results.
paul@22	648
paul@22	649	while self.next_docnum <= docnum:
paul@21	650	self._next_read_section()
paul@22	651	if self.docnum < docnum:
paul@22	652	self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
paul@22	653	else:
paul@22	654	break
paul@21	655
paul@21	656	except StopIteration:
paul@21	657	pass
paul@21	658
paul@21	659	# Navigate in the position file to the document.
paul@21	660
paul@21	661	self._init_section()
paul@19	662
paul@21	663	try:
paul@21	664	while 1:
paul@22	665	found_docnum, found_positions = self.iterator.next()
paul@22	666
paul@24	667	# Return the desired document positions or None (retaining the
paul@24	668	# positions for the document immediately after).
paul@22	669
paul@21	670	if docnum == found_docnum:
paul@22	671	return found_positions
paul@23	672	elif docnum < found_docnum:
paul@22	673	self.found_docnum, self.found_positions = found_docnum, found_positions
paul@21	674	return None
paul@22	675
paul@21	676	except StopIteration:
paul@21	677	return None
paul@21	678
paul@21	679	# Internal methods.
paul@21	680
paul@21	681	def _next_section(self):
paul@21	682
paul@21	683	"Attempt to get the next section in the index."
paul@21	684
paul@21	685	if self.next_docnum is None:
paul@21	686	self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
paul@21	687	else:
paul@21	688	self._next_read_section()
paul@21	689
paul@21	690	def _next_read_section(self):
paul@21	691
paul@21	692	"""
paul@21	693	Make the next index entry the current one without reading from the
paul@21	694	index.
paul@21	695	"""
paul@21	696
paul@21	697	self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
paul@22	698	self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
paul@21	699
paul@21	700	def _init_section(self):
paul@21	701
paul@21	702	"Initialise the iterator for the section in the position file."
paul@21	703
paul@19	704	self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
paul@0	705
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Forget the last term and position file offset."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. The term is stored as the length of the prefix
        it shares with the previous term plus the remaining suffix, and the
        offset as a difference from the previous offset.

        Return the offset after the term information was written to the file.
        """

        # Prefix length and term suffix.

        shared = len(commonprefix([self.last_term, term]))

        self.write_number(shared)
        self.write_string(term[shared:])

        # Offset delta, frequency and document frequency, in that order.

        for number in (offset - self.last_offset, frequency, doc_frequency):
            self.write_number(number)

        self.last_term = term
        self.last_offset = offset

        return self.f.tell()
paul@3	747
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Forget the last term and position file offset."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read one entry from the term information file, returning a tuple of the
        term, its position file offset, its frequency and its document
        frequency. The term and offset are reconstructed from the previous
        entry's values.
        """

        # The term shares a prefix of the given length with the previous term.

        shared = self.read_number()
        suffix = self.read_string()
        term = self.last_term = self.last_term[:shared] + suffix

        # The offset is stored as a delta.

        self.last_offset += self.read_number()

        frequency = self.read_number()
        doc_frequency = self.read_number()

        return term, self.last_offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
        permits the scanning for later terms from the specified term, since the
        given values become the basis for decoding the entries that follow.
        """

        self.f.seek(info_offset)
        self.last_term, self.last_offset = term, offset
paul@3	794
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Forget the last term, position file offset and information file offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' to the term dictionary index file, followed by the
        'info_offset' in the term information file, stored as a difference from
        the previously written information file offset.
        """

        TermWriter.write_term(self, term, offset, frequency, doc_frequency)

        # Information file offset delta.

        delta = info_offset - self.last_info_offset
        self.write_number(delta)
        self.last_info_offset = info_offset
paul@3	817
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Forget the last term, position file offset and information file offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read one entry from the term dictionary index file, returning a tuple
        of the term, its position file offset, its frequency, its document
        frequency and a term information file offset.
        """

        term, offset, frequency, doc_frequency = TermReader.read_term(self)

        # The information file offset is stored as a delta.

        self.last_info_offset += self.read_number()
        info_offset = self.last_info_offset

        return term, offset, frequency, doc_frequency, info_offset
paul@3	841
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Use 'info_writer' for term information, 'index_writer' for the term
        index and 'position_dict_writer' for positions, adding every term whose
        entry number is a multiple of 'interval' to the index.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval

        # Number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file, also writing an index entry where the term falls
        on an interval boundary. Nothing is returned.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)

        on_boundary = self.entry % self.interval == 0
        if on_boundary:
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        details = self.position_dict_writer.write_term_positions(doc_positions)
        offset, frequency, doc_frequency = details
        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        "Close all of the underlying writers."

        self.info_writer.close()
        self.index_writer.close()
        self.position_dict_writer.close()
paul@3	883
paul@3	884	class TermDictionaryReader:
paul@3	885
paul@3	886	"Reading term dictionaries."
paul@3	887
paul@22	888	def __init__(self, info_reader, index_reader, position_dict_reader):
paul@3	889	self.info_reader = info_reader
paul@3	890	self.index_reader = index_reader
paul@22	891	self.position_dict_reader = position_dict_reader
paul@3	892
paul@3	893	self.terms = []
paul@3	894	try:
paul@3	895	while 1:
paul@3	896	self.terms.append(self.index_reader.read_term())
paul@3	897	except EOFError:
paul@3	898	pass
paul@3	899
paul@3	900	# Large numbers for ordering purposes.
paul@3	901
paul@14	902	self.max_offset = self.terms[-1][1] + 1
paul@3	903
paul@25	904	def _find_closest_entry(self, term):
paul@3	905
paul@11	906	"""
paul@25	907	Find the offsets and frequencies of 'term' from the term dictionary or
paul@25	908	the closest term starting with the value of 'term'.
paul@25	909
paul@25	910	Return the closest index entry consisting of a term, the position file
paul@25	911	offset, the term frequency, the document frequency, and the term details
paul@25	912	file offset.
paul@11	913	"""
paul@3	914
paul@14	915	i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
paul@3	916
paul@3	917	# Get the entry position providing the term or one preceding it.
paul@25	918	# If no entry precedes the requested term, return the very first entry
paul@25	919	# as the closest.
paul@3	920
paul@3	921	if i == -1:
paul@25	922	return self.terms[0]
paul@25	923	else:
paul@25	924	return self.terms[i]
paul@25	925
paul@25	926	def _find_closest_term(self, term):
paul@25	927
paul@25	928	"""
paul@25	929	Find the offsets and frequencies of 'term' from the term dictionary or
paul@25	930	the closest term starting with the value of 'term'.
paul@25	931
paul@25	932	Return the closest term (or the term itself), the position file offset,
paul@25	933	the term frequency, the document frequency, and the term details file
paul@25	934	offset (or None if the reader is already positioned).
paul@25	935	"""
paul@25	936
paul@25	937	found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
paul@3	938
paul@19	939	# Where the term is found immediately, return the offset and
paul@25	940	# frequencies. If the term does not appear, return the details of the
paul@25	941	# closest entry.
paul@25	942
paul@25	943	if term <= found_term:
paul@25	944	return found_term, offset, frequency, doc_frequency, info_offset
paul@3	945
paul@3	946	# Otherwise, seek past the index term's entry in the information file
paul@3	947	# and scan for the desired term.
paul@3	948
paul@3	949	else:
paul@3	950	self.info_reader.go_to_term(found_term, offset, info_offset)
paul@3	951	try:
paul@3	952	while term > found_term:
paul@19	953	found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
paul@3	954	except EOFError:
paul@3	955	pass
paul@3	956
paul@25	957	return found_term, offset, frequency, doc_frequency, None
paul@25	958
paul@25	959	def _find_term(self, term):
paul@25	960
paul@25	961	"""
paul@25	962	Find the position file offset and frequency of 'term' from the term
paul@25	963	dictionary.
paul@25	964	"""
paul@25	965
paul@25	966	found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
paul@25	967
paul@25	968	# If the term is found, return the offset and frequencies.
paul@25	969
paul@25	970	if term == found_term:
paul@25	971	return offset, frequency, doc_frequency
paul@25	972	else:
paul@25	973	return None
paul@25	974
paul@25	975	def _get_positions(self, offset, doc_frequency):
paul@25	976	return self.position_dict_reader.read_term_positions(offset, doc_frequency)
paul@25	977
paul@25	978	# Sequential access methods.
paul@3	979
def rewind(self):

    "Rewind the term information reader, as used before sequential reading."

    info_reader = self.info_reader
    info_reader.rewind()
paul@12	982
def read_term(self):

    """
    Read the next record from the term information reader.

    Return a tuple containing the term, its frequency, its document
    frequency, and the documents and positions at which the term is found.
    Any EOFError raised by the underlying reader is not caught here.
    """

    record = self.info_reader.read_term()
    term, offset, frequency, doc_frequency = record

    return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency)
paul@12	993
paul@25	994	# Query methods.
paul@25	995
def find_terms(self, term):

    "Return all terms whose values start with the value of 'term'."

    found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)

    reader = self.info_reader

    # A details file offset means that the reader has not been positioned
    # yet, so move it to the found term before reading onwards.

    if info_offset is not None:
        reader.go_to_term(found_term, offset, info_offset)

    # Collect terms for as long as they carry the requested prefix, stopping
    # quietly at the end of the file.

    matches = []

    try:
        while found_term.startswith(term):
            matches.append(found_term)
            found_term, offset, frequency, doc_frequency = reader.read_term()
    except EOFError:
        pass

    return matches
paul@25	1022
def find_positions(self, term):

    """
    Return the documents and positions at which the given 'term' is found,
    or None if the term is not present in the dictionary.
    """

    details = self._find_term(term)

    if details is None:
        return None

    offset, frequency, doc_frequency = details
    return self._get_positions(offset, doc_frequency)
paul@5	1033
def get_frequency(self, term):

    """
    Return the frequency of the given 'term', or None if the term is not
    present in the dictionary.
    """

    details = self._find_term(term)

    if details is None:
        return None

    offset, frequency, doc_frequency = details
    return frequency
paul@11	1044
def get_document_frequency(self, term):

    """
    Return the document frequency of the given 'term', or None if the term
    is not present in the dictionary.
    """

    details = self._find_term(term)

    if details is None:
        return None

    offset, frequency, doc_frequency = details
    return doc_frequency
paul@19	1055
def close(self):

    """
    Close the information reader, the index reader and the position
    dictionary reader, in that order.
    """

    info_reader = self.info_reader
    info_reader.close()

    index_reader = self.index_reader
    index_reader.close()

    position_dict_reader = self.position_dict_reader
    position_dict_reader.close()
paul@3	1060
paul@9	1061	# Specific classes for storing document information.
paul@9	1062
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Start document number deltas from zero."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write a record for the document with the given 'docnum' containing
        the given 'fields': a list of (integer identifier, string value)
        pairs.

        The record consists of the document number expressed as a delta from
        the previously written document number, the number of fields, and
        then each identifier followed by its value.

        Return the file offset at which the record starts.
        """

        # Remember where this record begins before anything is written.

        start = self.f.tell()

        # Document numbers are stored relative to the previous record.

        delta = docnum - self.last_docnum
        self.write_number(delta)

        # The field count precedes the fields themselves.

        self.write_number(len(fields))

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
        return start
paul@8	1096
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Start document number deltas from zero."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read the next record from the file.

        Return a tuple containing the document number and a list of field
        (identifier, value) pairs. The document number is obtained by adding
        the stored delta to the previously read document number.
        """

        # Apply the stored delta to get the document number.

        self.last_docnum += self.read_number()

        # The field count tells us how many pairs follow.

        remaining = self.read_number()
        fields = []

        while remaining > 0:
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))
            remaining -= 1

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning
        for later documents.

        The document number computed from the stored delta is discarded,
        since the delta is relative to a record which has not been read, and
        'docnum' is recorded instead so that following records decode
        correctly.
        """

        self.f.seek(offset)
        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
paul@12	1144
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Start document number and offset deltas from zero."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for
        the document are stored in the fields file. Both values are written
        as deltas from the previously written entry.
        """

        docnum_delta = docnum - self.last_docnum
        offset_delta = offset - self.last_offset

        self.write_number(docnum_delta)
        self.write_number(offset_delta)

        # Remember the absolute values for the next entry's deltas.

        self.last_docnum, self.last_offset = docnum, offset
paul@9	1167
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Start document number and offset deltas from zero."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read the next index entry, returning a tuple containing a document
        number and the fields file offset recorded for it. Both are rebuilt
        by adding the stored deltas to the previously read values.
        """

        docnum_delta = self.read_number()
        self.last_docnum += docnum_delta

        offset_delta = self.read_number()
        self.last_offset += offset_delta

        return self.last_docnum, self.last_offset
paul@9	1186
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' for the fields
        file, 'field_index_writer' for its index, and the 'interval' between
        indexed documents.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # The number of documents written so far.

        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Every 'interval' documents, starting with the first, an index entry
        # is added.

        if not self.entry % self.interval:
            self.field_index_writer.write_document(docnum, offset)

        self.entry = self.entry + 1

    def close(self):

        "Close the field writer and then the field index writer."

        self.field_writer.close()
        self.field_index_writer.close()
paul@9	1211
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' for the fields
        file and 'field_index_reader' for its index. The whole index is read
        into memory immediately, up to the EOFError marking its end.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        self.docs = []
        try:
            while 1:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes: the last indexed offset is
        # used in the probe given to bisect_right so that the probe is placed
        # after an entry for the same document number. An empty index has no
        # last entry, and any value will do since nothing can be found.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = 0

    def rewind(self):

        "Rewind the field reader, as used before sequential reading."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning
        the list of fields or None if the document is not present.
        """

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # Get the entry position providing the document or one preceding it.
        # Without such an entry (including where the index is empty), the
        # document cannot be present.

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary, stopping at the end of the
        # file.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields
        else:
            return None

    def close(self):

        "Close the field reader and then the field index reader."

        self.field_reader.close()
        self.field_index_reader.close()
paul@8	1275
paul@12	1276	# Dictionary merging classes.
paul@12	1277
class Merger:

    "Merge files."

    def __init__(self, writer, readers):

        """
        Initialise the merger with the 'writer' receiving merged data and the
        'readers' providing the data to be merged.
        """

        self.readers = readers
        self.writer = writer

    def close(self):

        "Close every reader and then the writer."

        for source in self.readers:
            source.close()

        self.writer.close()
paul@13	1290
class TermDictionaryMerger(Merger):

    "Merge term and position files."

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the
        writer.

        One pending entry per reader is kept in a sorted list. The smallest
        term is written out, combined with any other pending entries for the
        same term, and each reader which contributed is then asked for its
        next entry.
        """

        entries = []

        # Get the first entries from the readers.

        for partition, reader in enumerate(self.readers):
            reader.rewind()
            self._queue_next(entries, partition)

        # While entries are available, write them out in order, merging where
        # appropriate.

        while entries:
            term, doc_positions, partition = entries[0]
            to_update = [partition]

            nentries = len(entries)
            i = 1

            # Find other entries for the term. Since the entries are sorted,
            # they directly follow the first one.

            while i < nentries:
                other_term, other_doc_positions, other_partition = entries[i]

                # For such entries, merge the positions.

                if other_term == term:
                    doc_positions = self.merge_positions(doc_positions, other_doc_positions)
                    to_update.append(other_partition)
                    i += 1
                else:
                    break

            # Write the combined term details.

            self.writer.write_term_positions(term, doc_positions)

            # Update the entries from the affected readers.

            del entries[:i]

            for partition in to_update:
                self._queue_next(entries, partition)

    def _queue_next(self, entries, partition):

        """
        Read the next term from the reader for the given 'partition' and
        insert it into the sorted 'entries'. Nothing is added if the reader
        is exhausted.
        """

        try:
            term, frequency, doc_frequency, positions = self.readers[partition].read_term()
            insort_right(entries, (term, positions, partition))
        except EOFError:
            pass

    def merge_positions(self, doc_positions, other_doc_positions):

        """
        Merge 'doc_positions' with 'other_doc_positions' so that common
        document records contain positions from both collections.

        Both arguments are sequences of (document number, positions) pairs.
        Return the merged pairs in no particular order.
        """

        doc_position_dict = dict(doc_positions)

        for docnum, positions in other_doc_positions:

            # The 'in' operator is used in preference to the deprecated
            # has_key method.

            if docnum in doc_position_dict:
                doc_position_dict[docnum] += positions
            else:
                doc_position_dict[docnum] = positions

        return doc_position_dict.items()
paul@13	1369
class FieldDictionaryMerger(Merger):

    "Merge field files."

    def merge(self):

        """
        Merge fields from the readers, sending them to the writer.

        One pending record per reader is kept in a sorted list. The record
        with the lowest document number is written out, extended by the
        fields of any other pending records for the same document, and each
        reader which contributed is then asked for its next record.
        """

        pending = []

        # Prime the list with the first record from each reader.

        for partition, reader in enumerate(self.readers):
            reader.rewind()

            try:
                docnum, fields = reader.read_fields()
                insort_right(pending, (docnum, fields, partition))
            except EOFError:
                pass

        # Write records out in document order until every reader is
        # exhausted.

        while pending:
            docnum, fields, partition = pending[0]
            consumed = [partition]

            # Records for the same document directly follow the first one in
            # the sorted list; their fields are appended to those collected
            # so far.

            taken = 1

            while taken < len(pending) and pending[taken][0] == docnum:
                other_docnum, other_fields, other_partition = pending[taken]
                fields += other_fields
                consumed.append(other_partition)
                taken += 1

            # Write the combined document details.

            self.writer.write_fields(docnum, fields)

            # Replace the written records with the next ones from the readers
            # which provided them.

            del pending[:taken]

            for partition in consumed:
                try:
                    docnum, fields = self.readers[partition].read_fields()
                    insort_right(pending, (docnum, fields, partition))
                except EOFError:
                    pass
paul@13	1431
paul@13	1432	# Utility functions.
paul@13	1433
def get_term_writer(pathname, partition, interval, doc_interval):

    """
    Return a term dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval' for terms and 'doc_interval' for document position records.

    Four files are opened for binary writing: the terms file, the terms
    index, the positions file and the positions index.
    """

    # Term information and its index.

    info_writer = TermWriter(open(join(pathname, "terms-%s" % partition), "wb"))
    index_writer = TermIndexWriter(open(join(pathname, "terms_index-%s" % partition), "wb"))

    # Position data and its index, combined in a position dictionary writer.

    positions_writer = PositionWriter(open(join(pathname, "positions-%s" % partition), "wb"))
    positions_index_writer = PositionIndexWriter(open(join(pathname, "positions_index-%s" % partition), "wb"))

    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)

    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
paul@13	1457
def get_field_writer(pathname, partition, interval):

    """
    Return a field dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.

    The fields file and the fields index are opened for binary writing.
    """

    field_writer = FieldWriter(open(join(pathname, "fields-%s" % partition), "wb"))
    field_index_writer = FieldIndexWriter(open(join(pathname, "fields_index-%s" % partition), "wb"))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
paul@13	1473
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.

    Four files are opened for binary reading: the terms file, the terms
    index, the positions file and the positions index.
    """

    # Term information and its index.

    info_reader = TermReader(open(join(pathname, "terms-%s" % partition), "rb"))
    index_reader = TermIndexReader(open(join(pathname, "terms_index-%s" % partition), "rb"))

    # Position data and its index, combined in a position dictionary reader.

    positions_reader = PositionReader(open(join(pathname, "positions-%s" % partition), "rb"))
    positions_index_reader = PositionIndexReader(open(join(pathname, "positions_index-%s" % partition), "rb"))

    positions_dict_reader = PositionDictionaryReader(positions_reader, positions_index_reader)

    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
paul@14	1496
def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.

    The fields file and the fields index are opened for binary reading.
    """

    field_reader = FieldReader(open(join(pathname, "fields-%s" % partition), "rb"))
    field_index_reader = FieldIndexReader(open(join(pathname, "fields_index-%s" % partition), "rb"))

    return FieldDictionaryReader(field_reader, field_index_reader)
paul@14	1511
def rename_files(pathname, names, from_partition, to_partition):

    """
    Rename, under the given 'pathname', the file for each of the given
    'names' from its 'from_partition' label to its 'to_partition' label.
    """

    for name in names:
        source = join(pathname, "%s-%s" % (name, from_partition))
        target = join(pathname, "%s-%s" % (name, to_partition))
        rename(source, target)
paul@14	1515
def rename_term_files(pathname, from_partition, to_partition):

    """
    Rename the term dictionary files (those named in TERM_FILENAMES) under
    'pathname' from 'from_partition' to 'to_partition'.
    """

    names = TERM_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
paul@14	1518
def rename_field_files(pathname, from_partition, to_partition):

    """
    Rename the field dictionary files (those named in FIELD_FILENAMES) under
    'pathname' from 'from_partition' to 'to_partition'.
    """

    names = FIELD_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
paul@14	1521
def remove_files(pathname, names, partition):

    """
    Remove, under the given 'pathname', the file for each of the given
    'names' labelled with the given 'partition'.
    """

    for name in names:
        target = join(pathname, "%s-%s" % (name, partition))
        remove(target)
paul@14	1525
def remove_term_files(pathname, partition):

    """
    Remove the term dictionary files (those named in TERM_FILENAMES) under
    'pathname' for the given 'partition'.
    """

    names = TERM_FILENAMES
    remove_files(pathname, names, partition)
paul@14	1528
def remove_field_files(pathname, partition):

    """
    Remove the field dictionary files (those named in FIELD_FILENAMES) under
    'pathname' for the given 'partition'.
    """

    names = FIELD_FILENAMES
    remove_files(pathname, names, partition)
paul@14	1531
paul@8	1532	# High-level classes.
paul@8	1533
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.

    Positions and fields are collected in memory and flushed to numbered
    partitions, either when 'flush_interval' additions have been counted or
    when the writer is closed.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Initialise the writer for the index directory 'pathname', using the
        indexing 'interval', the 'doc_interval' for document position
        records, and the 'flush_interval' giving the number of additions
        after which data is flushed. A false 'flush_interval' (such as zero)
        disables automatic flushing.
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # The numbers of the next partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Pending data: term -> docnum -> positions, and docnum -> fields.

        self.terms = {}
        self.docs = {}

        # Additions since the last automatic flush.

        self.position_counter = 0
        self.field_counter = 0

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.
        """

        # setdefault creates the nested containers on first use, replacing
        # the deprecated has_key tests.

        self.terms.setdefault(term, {}).setdefault(docnum, []).append(position)

        self.position_counter += 1
        if self.flush_interval and self.position_counter >= self.flush_interval:
            self.flush_terms()
            self.position_counter = 0

    def add_field(self, docnum, identifier, value):

        """
        Add for the document with the given 'docnum' a field having the given
        'identifier' and 'value'.
        """

        self.docs.setdefault(docnum, []).append((identifier, unicode(value))) # convert to string

        self.field_counter += 1
        if self.flush_interval and self.field_counter >= self.flush_interval:
            self.flush_fields()
            self.field_counter = 0

    def set_fields(self, docnum, fields):

        """
        Add for the document with the given 'docnum' the given 'fields': a list
        of tuples each containing an integer identifier and a string value.
        Any fields already pending for the document are replaced.
        """

        self.docs[docnum] = fields

        self.field_counter += len(fields)
        if self.flush_interval and self.field_counter >= self.flush_interval:
            self.flush_fields()
            self.field_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        # Get the terms in order. The items are copied into a list so that
        # sorting does not depend on items() returning one.

        terms = list(self.terms.items())
        terms.sort()

        dict_writer = self.get_term_writer()

        for term, doc_positions in terms:
            doc_positions = list(doc_positions.items())
            dict_writer.write_term_positions(term, doc_positions)

        dict_writer.close()

        # Start collecting afresh for the next partition.

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        # Get the documents in order.

        docs = list(self.docs.items())
        docs.sort()

        field_dict_writer = self.get_field_writer()

        for docnum, fields in docs:
            field_dict_writer.write_fields(docnum, fields)

        field_dict_writer.close()

        # Start collecting afresh for the next partition.

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any pending terms and fields."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
paul@10	1668
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        """
        Open the term and field dictionaries of the "merged" partition found
        under the given 'pathname'.
        """

        partition = "merged"
        self.dict_reader = get_term_reader(pathname, partition)
        self.field_dict_reader = get_field_reader(pathname, partition)

    def find_terms(self, term):

        "Return all terms whose values start with the value of 'term'."

        terms = self.dict_reader.find_terms(term)
        return terms

    def find_positions(self, term):

        "Return the documents and positions at which the given 'term' is found."

        positions = self.dict_reader.find_positions(term)
        return positions

    def get_frequency(self, term):

        "Return the frequency of the given 'term'."

        frequency = self.dict_reader.get_frequency(term)
        return frequency

    def get_document_frequency(self, term):

        "Return the document frequency of the given 'term'."

        doc_frequency = self.dict_reader.get_document_frequency(term)
        return doc_frequency

    def get_fields(self, docnum):

        "Return the fields of the document with the given 'docnum'."

        fields = self.field_dict_reader.get_fields(docnum)
        return fields

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        self.dict_reader.close()
        self.field_dict_reader.close()
paul@10	1695
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index for the directory with the given 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'. The index directory is created
        if it does not already exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, merging any partitions first. The
        'partition' argument is currently ignored: the reader always opens
        the "merged" partition.
        """

        # Ensure that only one partition exists.

        self.merge()
        return self._get_reader(partition)

    def _get_reader(self, partition):

        """
        Return a reader for the index without merging. The 'partition'
        argument is currently ignored. An OSError is raised if the index
        directory does not exist.
        """

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname)
        return self.reader

    def merge(self):

        "Merge/optimise index partitions."

        self.merge_terms()
        self.merge_fields()

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval', leaving a single partition labelled "merged".
        """

        readers = []
        partitions = set()

        for filename in listdir(self.pathname):
            if filename.startswith("terms-"): # 6 character prefix
                partition = filename[6:]
                readers.append(get_term_reader(self.pathname, partition))
                partitions.add(partition)

        # Write directly to a dictionary.

        if len(readers) > 1:

            # Move any existing merged partition out of the way of the new
            # one, treating it as just another partition to be merged.

            if "merged" in partitions:
                rename_term_files(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                partitions.add("old-merged")

            writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
            merger = TermDictionaryMerger(writer, readers)

            # Close the readers and writer even if merging fails.

            try:
                merger.merge()
            finally:
                merger.close()

            # Remove old files.

            for partition in partitions:
                remove_term_files(self.pathname, partition)

        elif len(readers) == 1:

            # Nothing is merged, so release the reader's files before any
            # renaming is done.

            readers[0].close()

            partition = list(partitions)[0]
            if partition != "merged":
                rename_term_files(self.pathname, partition, "merged")

    def merge_fields(self, interval=FIELD_INTERVAL):

        """
        Merge field dictionaries using the given indexing 'interval', leaving
        a single partition labelled "merged".
        """

        readers = []
        partitions = set()

        for filename in listdir(self.pathname):
            if filename.startswith("fields-"): # 7 character prefix
                partition = filename[7:]
                readers.append(get_field_reader(self.pathname, partition))
                partitions.add(partition)

        # Write directly to a dictionary.

        if len(readers) > 1:

            # Move any existing merged partition out of the way of the new
            # one, treating it as just another partition to be merged.

            if "merged" in partitions:
                rename_field_files(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                partitions.add("old-merged")

            writer = get_field_writer(self.pathname, "merged", interval)
            merger = FieldDictionaryMerger(writer, readers)

            # Close the readers and writer even if merging fails.

            try:
                merger.merge()
            finally:
                merger.close()

            # Remove old files.

            for partition in partitions:
                remove_field_files(self.pathname, partition)

        elif len(readers) == 1:

            # Nothing is merged, so release the reader's files before any
            # renaming is done.

            readers[0].close()

            partition = list(partitions)[0]
            if partition != "merged":
                rename_field_files(self.pathname, partition, "merged")

    def close(self):

        "Close the reader and the writer, if either has been obtained."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	1826
paul@0	1827	# vim: tabstop=4 expandtab shiftwidth=4