iixr (annotate iixr.py in 4c3c6201310e)

iixr

Annotated iixr.py

22:4c3c6201310e

2009-09-03

Paul Boddie

Fixed position writing to restart document numbering for each section of the position file. Fixed position dictionary iteration to reset the position iterator so that the document numbering of a newly encountered section is properly interpreted. Fixed position dictionary iteration across index entries, ensuring that entries at the start of sections, recorded in index entries themselves, are handled correctly. Removed unnecessary reader reset operations where iterators will be created with reset state anyway. Added a document frequency method to IndexReader. Added result caching to the position dictionary iterator in order to preserve record data for documents which were visited unintentionally.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@19	21	from os import dup, fdopen # independent iterator access to files
paul@12	22	from os import listdir, mkdir # index and partition discovery
paul@14	23	from os import remove, rename # partition manipulation
paul@7	24	from os.path import exists, join
paul@2	25	from os.path import commonprefix # to find common string prefixes
paul@3	26	from bisect import bisect_right # to find terms in the dictionary index
paul@12	27	from bisect import insort_right # to maintain a sorted list of data for merging
paul@10	28	import bz2, zlib # for field compression
paul@2	29
# Provide a 'set' name where the interpreter has no built-in set type, using
# the Set class from the sets module instead.

try:
    set
except NameError:
    from sets import Set as set

# Constants.

# NOTE(review): the interval and flush values are not used in the visible part
# of this file; presumably the intervals give the number of entries between
# index records and the flush value the amount of data held before writing to
# disk - confirm against the index writer classes.

TERM_INTERVAL = 100
DOCUMENT_INTERVAL = 100
FIELD_INTERVAL = 100
FLUSH_INTERVAL = 1000000

# NOTE(review): presumably the names of the files making up a term dictionary
# and a field dictionary within an index partition - confirm.

TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
FIELD_FILENAMES = "fields", "fields_index"

# Compression functions paired with the single character flag stored before
# compressed data. FileWriter.write_string tries each compressor in the order
# given, using the first to produce a result shorter than the original data.
# FileReader.read_string uses the stored flag to select a decompressor.

compressors = [("b", bz2.compress), ("z", zlib.compress)]
decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}
paul@10	47
paul@0	48	# Foundation classes.
paul@0	49
class File:

    """
    A basic file abstraction wrapping an open file object and providing
    reset, rewind and close operations shared by readers and writers.
    """

    def __init__(self, f):

        "Initialise the object with the file object 'f' and reset its state."

        self.f = f
        self.reset()

    def reset(self):

        """
        Reset the state of the reader or writer between records. Nothing is
        done here: subclasses holding record state override this method.
        """

    def rewind(self):

        "Seek to the start of the file and reset any record state."

        self.f.seek(0)
        self.reset()

    def close(self):

        "Close the underlying file if it has not already been closed."

        if self.f is None:
            return

        self.f.close()
        self.f = None
paul@0	72
class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding: seven bits
        are stored in each byte, least significant group first, with the top
        bit of a byte set where more bytes follow.

        Raise ValueError if 'number' is negative.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Special case: one byte containing zero.

        if number == 0:
            self.f.write(chr(0))
            return

        # Write the number from least to most significant digits.

        encoded = []

        while number != 0:
            lsd = number & 127
            number = number >> 7

            # Set the top bit where more significant digits remain.

            if number != 0:
                lsd |= 128
            encoded.append(chr(lsd))

        self.f.write("".join(encoded))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file, recording its length and compressing the string
        if 'compress' is set to a true value.

        Where compression is requested, a single character flag is written
        before the length: the flag of the compressor used, or "-" where no
        compressor produced data shorter than the original.
        """

        # Convert Unicode objects to strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        # Compress the string if requested.

        if compress:
            for flag, fn in compressors:
                cs = fn(s)

                # Take the first string shorter than the original.

                if len(cs) < len(s):
                    s = cs
                    break

            # Without a shorter result, store the data as it is.

            else:
                flag = "-"

            # Record whether compression was used.

            self.f.write(flag)

        # Write the length of the data before the data itself.

        self.write_number(len(s))
        self.f.write(s)
paul@2	141
class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a variable length number from the file and return it. EOFError is
        raised if the file ends before the number has been read completely.
        """

        value = 0
        shift = 0

        while 1:
            byte = self.f.read(1)
            if not byte:
                raise EOFError

            digit = ord(byte)

            # The lower seven bits provide the next most significant digit.

            value += (digit & 127) << shift
            shift += 7

            # A clear top bit marks the final byte of the number.

            if not digit & 128:
                return value

    def read_string(self, decompress=0):

        """
        Read a string from the file, returning it as a Unicode object. If
        'decompress' is set to a true value, a compression flag is read before
        the string and any indicated decompression is applied to the data.
        """

        # Obtain the compression flag where one was stored.

        if not decompress:
            flag = "-"
        else:
            flag = self.f.read(1)

        # The length of the data precedes the data itself.

        data = self.f.read(self.read_number())

        # Perform decompression if applicable.

        if flag != "-":
            data = decompressors[flag](data)

        # Convert strings to Unicode objects.

        return unicode(data, "utf-8")
paul@2	196
paul@9	197	# Specific classes for storing term and position information.
paul@0	198
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number so that the next one is written in full."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write a record for the document 'docnum' containing the given
        'positions', which are sorted in place. Return the offset at which the
        record starts.

        Raise ValueError if 'docnum' is less than the last number written.
        """

        previous = self.last_docnum

        if docnum < previous:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, previous))

        # The record starts at the current file position.

        offset = self.f.tell()

        # Store the document number as a delta, followed by the number of
        # positions.

        self.write_number(docnum - previous)
        self.write_number(len(positions))

        # Positions must be in order to be stored as deltas.

        positions.sort()

        last = 0
        for position in positions:
            self.write_number(position - last)
            last = position

        self.last_docnum = docnum
        return offset
paul@0	243
class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Forget the last document number so that the next delta is taken in full."

        self.last_docnum = 0

    def read_positions(self):

        """
        Read a single record, returning a tuple containing a document number
        and a list of positions.
        """

        # Document numbers are stored as deltas from the previous number.

        docnum = self.last_docnum = self.last_docnum + self.read_number()

        # The number of positions precedes the position deltas, which are
        # accumulated to give absolute positions.

        remaining = self.read_number()
        positions = []
        position = 0

        while remaining > 0:
            position += self.read_number()
            positions.append(position)
            remaining -= 1

        return docnum, positions

    def read_term_positions(self, offset, count):

        """
        Return an iterator over the records found at 'offset' in the file,
        with the number of documents provided being limited to 'count'.
        """

        # Give the iterator its own file object based on a duplicated
        # descriptor.

        handle = fdopen(dup(self.f.fileno()), "rb")
        handle.seek(offset)
        return PositionIterator(handle, count)
paul@19	290
class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        "Forget the last document number and position file offset written."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write an index entry consisting of the given 'docnum', 'pos_offset' and
        document 'count' to the position index file. Return the offset at
        which the entry starts.
        """

        # The entry starts at the current file position.

        entry_offset = self.f.tell()

        # Store the document number as a delta.

        docnum_delta = docnum - self.last_docnum
        self.write_number(docnum_delta)
        self.last_docnum = docnum

        # Store the position file offset as a delta.

        offset_delta = pos_offset - self.last_pos_offset
        self.write_number(offset_delta)
        self.last_pos_offset = pos_offset

        # Store the document count as it is.

        self.write_number(count)

        return entry_offset
paul@19	325
class PositionIndexReader(FileReader):

    "Reading position index information from files."

    def reset(self):

        "Forget the last document number and position file offset read."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read an index entry, returning a tuple containing a document number, an
        offset into the position file, and the number of documents in the
        section of the position file found at that offset.
        """

        # Both the document number and the offset are stored as deltas.

        docnum = self.last_docnum = self.last_docnum + self.read_number()
        pos_offset = self.last_pos_offset = self.last_pos_offset + self.read_number()

        # The document count is stored as it is.

        count = self.read_number()

        return docnum, pos_offset, count

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator over the index entries found at 'offset' in the
        file, with the number of documents covered by the entries being limited
        to 'doc_frequency'.
        """

        # Give the iterator its own file object based on a duplicated
        # descriptor.

        handle = fdopen(dup(self.f.fileno()), "rb")
        handle.seek(offset)
        return PositionIndexIterator(handle, doc_frequency)
paul@19	368
paul@21	369	# Iterators for position-related files.
paul@21	370
class IteratorBase:

    """
    Common support for iterators over stored records, keeping track of the
    number of documents available and the number read so far.
    """

    def __init__(self, count):

        "Initialise the iterator to provide 'count' documents."

        self.replenish(count)

    def replenish(self, count):

        "Make 'count' documents available, starting the tally of reads afresh."

        self.count, self.read_documents = count, 0

    def __len__(self):

        "Return the number of documents made available."

        return self.count

    def sort(self):

        "Do nothing: stored document positions are already sorted."

        pass

    def __iter__(self):

        "Return this object as its own iterator."

        return self
paul@21	388
class PositionIterator(PositionReader, IteratorBase):

    "Iterating over document positions."

    def __init__(self, f, count):

        "Initialise the iterator to read 'count' documents from the file 'f'."

        PositionReader.__init__(self, f)
        IteratorBase.__init__(self, count)

    def next(self):

        """
        Return the document number and positions for the next document,
        raising StopIteration once the available documents have been read.
        """

        if self.read_documents >= self.count:
            raise StopIteration

        self.read_documents += 1
        return self.read_positions()
paul@21	406
class PositionIndexIterator(PositionIndexReader, IteratorBase):

    "Iterating over position index entries."

    def __init__(self, f, count):

        """
        Initialise the iterator to read index entries from the file 'f'
        covering 'count' documents in total.
        """

        PositionIndexReader.__init__(self, f)
        IteratorBase.__init__(self, count)
        self.section_count = 0

    def next(self):

        """
        Return the next index entry as a tuple containing a document number, a
        position file offset and a section document count, raising
        StopIteration once the entries read cover all available documents.
        """

        # Account for the documents covered by the previous entry.

        self.read_documents += self.section_count

        if self.read_documents >= self.count:
            raise StopIteration

        entry = self.read_positions()

        # Remember how many documents this entry covers.

        self.section_count = entry[2]
        return entry
paul@19	426
class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Initialise the writer with a 'position_writer', a
        'position_index_writer' and the 'interval' giving the number of
        documents in each indexed section of the position file.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the position file, sorting the collection
        beforehand. An index entry is written for each section of the position
        file.

        Return a tuple containing the offset of the first index entry written
        (or None if nothing was written), the frequency (number of positions),
        and document frequency (number of documents) for the term involved.
        """

        writer = self.position_writer
        index_writer = self.position_index_writer

        # Start with fresh state in both writers.

        writer.reset()
        index_writer.reset()

        index_offset = None
        frequency = 0

        # The document number and offset of the first record in the current
        # section, together with the number of records in the section so far.

        section_start = None
        count = 0

        doc_positions.sort()

        for docnum, positions in doc_positions:
            pos_offset = writer.write_positions(docnum, positions)

            # The first record of a section provides the index entry details.

            if section_start is None:
                section_start = docnum, pos_offset

            frequency += len(positions)
            count += 1

            # Complete a section once it holds enough documents.

            if count == self.interval:
                io = index_writer.write_positions(section_start[0], section_start[1], self.interval)

                # Only the first index entry offset is of interest.

                if index_offset is None:
                    index_offset = io

                section_start = None
                count = 0

                # Restart document numbering so that readers of the next
                # section start with the correct document number.

                writer.reset()

        # Write an index entry for any incomplete final section.

        if section_start is not None:
            io = index_writer.write_positions(section_start[0], section_start[1], count)

            if index_offset is None:
                index_offset = io

        return index_offset, frequency, len(doc_positions)

    def close(self):

        "Close the position writer and then the position index writer."

        for writer in (self.position_writer, self.position_index_writer):
            writer.close()
paul@19	512
class PositionDictionaryReader:

    "Reading position dictionaries."

    def __init__(self, position_reader, position_index_reader):

        """
        Initialise the reader with a 'position_reader' and a
        'position_index_reader'.
        """

        self.position_index_reader = position_index_reader
        self.position_reader = position_reader

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator over the document positions whose index entries
        start at 'offset', for a term having the given 'doc_frequency'.
        """

        reader, index_reader = self.position_reader, self.position_index_reader
        return PositionDictionaryIterator(reader, index_reader, offset, doc_frequency)

    def close(self):

        "Close the position reader and then the position index reader."

        for reader in (self.position_reader, self.position_index_reader):
            reader.close()
paul@19	534
class PositionDictionaryIterator:

    "Iteration over position dictionary entries."

    def __init__(self, position_reader, position_index_reader, offset, doc_frequency):

        """
        Initialise the iterator with the given 'position_reader' and
        'position_index_reader', obtaining index entries from 'offset' in the
        index file for a term having the given 'doc_frequency'.

        The first index entry is read immediately, and StopIteration raised by
        the index iterator at this point is not handled here.
        """

        self.position_reader = position_reader
        self.doc_frequency = doc_frequency
        self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)

        # Remember the last values.

        self.found_docnum, self.found_positions = None, None

        # Maintain state for the next index entry, if read.

        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

        # Initialise the current index entry and current position file iterator.

        self._next_section()
        self._init_section()

    def __len__(self):

        "Return the document frequency given upon initialisation."

        return self.doc_frequency

    def sort(self):

        "Do nothing: this is provided for compatibility with lists."

        pass

    def __iter__(self):

        "Return this object as its own iterator."

        return self

    def next(self):

        """
        Attempt to get the next document record from the section in the
        positions file, returning a tuple containing a document number and a
        list of positions. StopIteration is raised when no more index entries
        are available.
        """

        # Return any visited but unrequested record.

        if self.found_docnum is not None:
            t = self.found_docnum, self.found_positions
            self.found_docnum, self.found_positions = None, None
            return t

        # Or search for the next record.

        while 1:

            # Either return the next record.

            try:
                return self.iterator.next()

            # Or, where a section is finished, get the next section and try again.

            except StopIteration:

                # Where a section follows, update the index iterator, but keep
                # reading using the same file iterator (since the data should
                # just follow on from the last section).

                self._next_section()
                self.iterator.replenish(self.section_count)

                # Reset the state of the iterator to make sure that document
                # numbers are correct.

                self.iterator.reset()

    def from_document(self, docnum):

        """
        Attempt to navigate to a positions entry for the given 'docnum',
        returning the positions for 'docnum', or None otherwise.

        Where a record for a later document is encountered instead, that record
        is retained so that it is provided by the next call to the 'next'
        method.
        """

        # Return any unrequested document positions.

        if docnum == self.found_docnum:
            return self.found_positions

        # Read ahead in the index until the next entry refers to a document
        # later than the desired document.

        try:
            if self.next_docnum is None:
                self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()

            # Read until the next entry is after the desired document number,
            # or until the end of the results.

            while self.next_docnum <= docnum:
                self._next_read_section()
                if self.docnum < docnum:
                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
                else:
                    break

        except StopIteration:
            pass

        # Navigate in the position file to the document.

        self._init_section()

        try:
            while 1:
                found_docnum, found_positions = self.iterator.next()

                # Return the desired document positions or those immediately
                # after.

                if docnum == found_docnum:
                    return found_positions

                # Test the record just read, not the cached record: the cache
                # is normally empty here and would never be filled otherwise.

                elif docnum < found_docnum:
                    self.found_docnum, self.found_positions = found_docnum, found_positions
                    return None

        except StopIteration:
            return None

    # Internal methods.

    def _next_section(self):

        """
        Attempt to get the next section in the index, using any entry already
        read ahead in preference to reading from the index.
        """

        if self.next_docnum is None:
            self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
        else:
            self._next_read_section()

    def _next_read_section(self):

        """
        Make the next index entry the current one without reading from the
        index.
        """

        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def _init_section(self):

        "Initialise the iterator for the section in the position file."

        self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
paul@0	683
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Forget the last term and position file offset written."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. Return the offset after the term information was
        written to the file.
        """

        # Store only the part of the term not shared with the previous term,
        # along with the length of the shared part.

        common = len(commonprefix([self.last_term, term]))

        self.write_number(common)
        self.write_string(term[common:])

        # Store the offset as a delta, followed by the frequency and the
        # document frequency.

        for number in (offset - self.last_offset, frequency, doc_frequency):
            self.write_number(number)

        self.last_term, self.last_offset = term, offset

        return self.f.tell()
paul@3	725
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Forget the last term and position file offset read."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency and its document
        frequency from the term information file, returning them as a tuple.
        """

        # Rebuild the term from the part shared with the previous term and the
        # stored suffix.

        common = self.read_number()
        suffix = self.read_string()

        term = self.last_term = self.last_term[:common] + suffix

        # The offset is stored as a delta.

        offset = self.last_offset = self.last_offset + self.read_number()

        # The frequency and document frequency are stored as they are.

        frequency = self.read_number()
        doc_frequency = self.read_number()

        return term, offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
        permits the scanning for later terms from the specified term.
        """

        self.f.seek(info_offset)
        self.last_term, self.last_offset = term, offset
paul@3	772
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Forget the last term details and information file offset written."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' to the term dictionary index file, along with the
        'info_offset' in the term information file.
        """

        TermWriter.write_term(self, term, offset, frequency, doc_frequency)

        # Store the information file offset as a delta.

        delta = info_offset - self.last_info_offset
        self.write_number(delta)
        self.last_info_offset = info_offset
paul@3	795
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Forget the last term details and information file offset read."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency, its document
        frequency and a term information file offset from the term dictionary
        index file, returning them as a tuple.
        """

        details = TermReader.read_term(self)

        # The information file offset is stored as a delta.

        info_offset = self.last_info_offset = self.last_info_offset + self.read_number()

        term, offset, frequency, doc_frequency = details
        return term, offset, frequency, doc_frequency, info_offset
paul@3	819
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Initialise the writer with an 'info_writer', an 'index_writer', a
        'position_dict_writer' and the 'interval' giving the number of terms
        between entries in the index.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval

        # The number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file, adding an entry to the index for the first term
        and for every term at the configured interval thereafter.
        """

        details = term, offset, frequency, doc_frequency
        info_offset = self.info_writer.write_term(*details)

        if not self.entry % self.interval:
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        written = self.position_dict_writer.write_term_positions(doc_positions)
        offset, frequency, doc_frequency = written
        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        "Close the information, index and position dictionary writers."

        for writer in (self.info_writer, self.index_writer, self.position_dict_writer):
            writer.close()
paul@3	861
paul@3	862	class TermDictionaryReader:
paul@3	863
paul@3	864	"Reading term dictionaries."
paul@3	865
paul@22	866	def __init__(self, info_reader, index_reader, position_dict_reader):
paul@3	867	self.info_reader = info_reader
paul@3	868	self.index_reader = index_reader
paul@22	869	self.position_dict_reader = position_dict_reader
paul@3	870
paul@3	871	self.terms = []
paul@3	872	try:
paul@3	873	while 1:
paul@3	874	self.terms.append(self.index_reader.read_term())
paul@3	875	except EOFError:
paul@3	876	pass
paul@3	877
paul@3	878	# Large numbers for ordering purposes.
paul@3	879
paul@14	880	self.max_offset = self.terms[-1][1] + 1
paul@3	881
paul@9	882	def _find_term(self, term):
paul@3	883
paul@11	884	"""
paul@11	885	Find the position file offset and frequency of 'term' from the term
paul@11	886	dictionary.
paul@11	887	"""
paul@3	888
paul@14	889	i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
paul@3	890
paul@3	891	# Get the entry position providing the term or one preceding it.
paul@3	892
paul@3	893	if i == -1:
paul@3	894	return None
paul@3	895
paul@19	896	found_term, offset, frequency, doc_frequency, info_offset = self.terms[i]
paul@3	897
paul@19	898	# Where the term is found immediately, return the offset and
paul@19	899	# frequencies.
paul@3	900
paul@3	901	if term == found_term:
paul@19	902	return offset, frequency, doc_frequency
paul@3	903
paul@3	904	# Otherwise, seek past the index term's entry in the information file
paul@3	905	# and scan for the desired term.
paul@3	906
paul@3	907	else:
paul@3	908	self.info_reader.go_to_term(found_term, offset, info_offset)
paul@3	909	try:
paul@3	910	while term > found_term:
paul@19	911	found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
paul@3	912	except EOFError:
paul@3	913	pass
paul@3	914
paul@19	915	# If the term is found, return the offset and frequencies.
paul@3	916
paul@3	917	if term == found_term:
paul@19	918	return offset, frequency, doc_frequency
paul@3	919	else:
paul@3	920	return None
paul@3	921
paul@12	922	def rewind(self):
paul@12	923	self.info_reader.rewind()
paul@12	924
paul@19	925	def _get_positions(self, offset, doc_frequency):
paul@22	926	return self.position_dict_reader.read_term_positions(offset, doc_frequency)
paul@12	927
def read_term(self):

    """
    Read the next record from the term information reader and return a tuple
    containing the term, its frequency, its document frequency, and the
    documents and positions at which the term is found. Any EOFError raised
    by the information reader is not caught here.
    """

    record = self.info_reader.read_term()
    term, offset, frequency, doc_frequency = record

    # The offset only serves to locate the positions; it is not returned.

    return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency)
paul@12	938
def find_positions(self, term):

    """
    Return the documents and positions at which the given 'term' is found, or
    None if the term is not in the dictionary.
    """

    found = self._find_term(term)
    if found is None:
        return None

    offset, frequency, doc_frequency = found
    return self._get_positions(offset, doc_frequency)
paul@5	949
def get_frequency(self, term):

    """
    Return the frequency of the given 'term', or None if the term is not in
    the dictionary.
    """

    found = self._find_term(term)
    if found is None:
        return None

    offset, frequency, doc_frequency = found
    return frequency
paul@11	960
def get_document_frequency(self, term):

    """
    Return the document frequency of the given 'term', or None if the term is
    not in the dictionary.
    """

    found = self._find_term(term)
    if found is None:
        return None

    offset, frequency, doc_frequency = found
    return doc_frequency
paul@19	971
def close(self):

    """
    Close the term information reader, the term index reader and the position
    dictionary reader, in that order.
    """

    for attribute in ("info_reader", "index_reader", "position_dict_reader"):
        getattr(self, attribute).close()
paul@3	976
paul@9	977	# Specific classes for storing document information.
paul@9	978
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Make the next document number delta relative to zero."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        Return the offset at which the fields are stored.
        """

        # Remember where this record starts before anything is written.

        start = self.f.tell()

        # A record consists of the document number delta, the number of
        # fields, and then each field as an identifier and a value.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(fields))

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        # Later records are written relative to this document number.

        self.last_docnum = docnum
        return start
paul@8	1012
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Make the next document number delta relative to zero."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # The document number is stored as a delta from the previous record.

        self.last_docnum += self.read_number()

        # The number of fields precedes the fields themselves, each of which
        # is an identifier followed by a value.

        count = self.read_number()
        fields = [(self.read_number(), self.read_string(1)) # decompress
                  for _ in range(count)]

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.
        """

        self.f.seek(offset)

        # The document number produced by this read is discarded: it is
        # derived from whatever delta state preceded the seek. The caller's
        # 'docnum' is recorded instead so that later reads continue from it.

        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
paul@12	1060
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Make the next document number and offset deltas relative to zero."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # Both values are written as deltas from the previous entry.

        docnum_delta = docnum - self.last_docnum
        self.write_number(docnum_delta)

        offset_delta = offset - self.last_offset
        self.write_number(offset_delta)

        self.last_docnum, self.last_offset = docnum, offset
paul@9	1083
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Make the next document number and offset deltas relative to zero."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        "Read a document number and field file offset, returning them as a tuple."

        # Each value is stored as a delta and accumulated as it is read, the
        # document number first and then the offset.

        self.last_docnum = self.last_docnum + self.read_number()
        self.last_offset = self.last_offset + self.read_number()

        return self.last_docnum, self.last_offset
paul@9	1102
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with a 'field_writer', a 'field_index_writer'
        and the 'interval' at which documents are recorded in the index.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # The number of documents written so far.

        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # The first document, and every 'interval' documents after it, are
        # also recorded in the index.

        if not self.entry % self.interval:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close the field writer and then the field index writer."

        for writer in (self.field_writer, self.field_index_writer):
            writer.close()
paul@9	1127
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with a 'field_reader' and a 'field_index_reader',
        reading all entries from the index immediately.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Collect (document number, offset) entries until the index ends.

        self.docs = []
        try:
            while 1:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes. An empty index provides no
        # entries, so fall back to zero instead of failing with IndexError.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = 0

    def rewind(self):

        "Rewind the field reader used by 'read_fields'."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning None
        if no such document is found.
        """

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # Get the entry position providing the document or one preceding it.
        # Where there is no such entry, the document cannot be present.

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary, stopping at the end of the file.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields
        else:
            return None

    def close(self):

        "Close the field reader and the field index reader."

        self.field_reader.close()
        self.field_index_reader.close()
paul@8	1191
paul@12	1192	# Dictionary merging classes.
paul@12	1193
class Merger:

    "Merge files."

    def __init__(self, writer, readers):

        """
        Initialise the merger with the 'writer' receiving the merged data and
        the sequence of 'readers' providing the data to be merged.
        """

        self.writer = writer
        self.readers = readers

    def close(self):

        "Close each of the readers and then the writer."

        for source in self.readers:
            source.close()

        self.writer.close()
paul@13	1206
class TermDictionaryMerger(Merger):

    "Merge term and position files."

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the writer.
        """

        # Pending entries, at most one per reader, are kept in order as
        # (term, partition, positions) tuples. Having the partition precede
        # the positions means that entries for the same term are ordered by
        # partition, and that position collections are never compared with
        # each other (which is arbitrary at best, and an error for objects
        # not supporting ordering).

        entries = []

        # Get the first entries from the readers.

        for partition, reader in enumerate(self.readers):
            reader.rewind()
            self._insert_next(entries, partition)

        # While entries are available, write them out in order, merging where
        # appropriate.

        while entries:
            term, partition, doc_positions = entries[0]
            to_update = [partition]

            nentries = len(entries)
            i = 1

            # Find other entries for the term. Being ordered by term, such
            # entries immediately follow the first one.

            while i < nentries:
                other_term, other_partition, other_doc_positions = entries[i]

                if other_term != term:
                    break

                # For such entries, merge the positions.

                doc_positions = self.merge_positions(doc_positions, other_doc_positions)
                to_update.append(other_partition)
                i += 1

            # Write the combined term details.

            self.writer.write_term_positions(term, doc_positions)

            # Update the entries from the affected readers.

            del entries[:i]

            for partition in to_update:
                self._insert_next(entries, partition)

    def _insert_next(self, entries, partition):

        """
        Read the next term from the reader for 'partition' and insert it into
        the ordered 'entries'. Nothing is inserted for an exhausted reader.
        """

        try:
            term, frequency, doc_frequency, positions = self.readers[partition].read_term()
        except EOFError:
            return

        insort_right(entries, (term, partition, positions))

    def merge_positions(self, doc_positions, other_doc_positions):

        """
        Merge 'doc_positions' with 'other_doc_positions' so that common document
        records contain positions from both collections. A list of
        (document number, positions) pairs is returned in no particular order.
        """

        doc_position_dict = dict(doc_positions)

        for docnum, positions in other_doc_positions:

            # NOTE: the augmented assignment extends a list originating from
            # 'doc_positions' in place, as the original code did.

            if docnum in doc_position_dict:
                doc_position_dict[docnum] += positions
            else:
                doc_position_dict[docnum] = positions

        return list(doc_position_dict.items())
paul@13	1285
class FieldDictionaryMerger(Merger):

    "Merge field files."

    def merge(self):

        """
        Merge fields from the readers, sending them to the writer.
        """

        # Pending entries, at most one per reader, are kept in order as
        # (docnum, partition, fields) tuples. Having the partition precede the
        # fields means that entries for the same document are ordered by
        # partition, and that field lists are never compared with each other.

        entries = []

        # Get the first entries from the readers.

        for partition, reader in enumerate(self.readers):
            reader.rewind()
            self._insert_next(entries, partition)

        # While entries are available, write them out in order, merging where
        # appropriate.

        while entries:
            docnum, partition, fields = entries[0]
            to_update = [partition]

            nentries = len(entries)
            i = 1

            # Find other entries for the document. Being ordered by document
            # number, such entries immediately follow the first one.

            while i < nentries:
                other_docnum, other_partition, other_fields = entries[i]

                if other_docnum != docnum:
                    break

                # For such entries, combine the fields.

                fields += other_fields
                to_update.append(other_partition)
                i += 1

            # Write the combined document details.

            self.writer.write_fields(docnum, fields)

            # Update the entries from the affected readers.

            del entries[:i]

            for partition in to_update:
                self._insert_next(entries, partition)

    def _insert_next(self, entries, partition):

        """
        Read the next document from the reader for 'partition' and insert it
        into the ordered 'entries'. Nothing is inserted for an exhausted reader.
        """

        try:
            docnum, fields = self.readers[partition].read_fields()
        except EOFError:
            return

        insort_right(entries, (docnum, partition, fields))
paul@13	1347
paul@13	1348	# Utility functions.
paul@13	1349
def get_term_writer(pathname, partition, interval, doc_interval):

    """
    Return a term dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval' for terms and 'doc_interval' for document position records.
    """

    # Four files are opened for writing: terms, terms index, positions and
    # positions index, in that order.

    info_writer = TermWriter(open(join(pathname, "terms-%s" % partition), "wb"))
    index_writer = TermIndexWriter(open(join(pathname, "terms_index-%s" % partition), "wb"))

    # The position files are handled together by a position dictionary writer.

    positions_dict_writer = PositionDictionaryWriter(
        PositionWriter(open(join(pathname, "positions-%s" % partition), "wb")),
        PositionIndexWriter(open(join(pathname, "positions_index-%s" % partition), "wb")),
        doc_interval)

    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
paul@13	1373
def get_field_writer(pathname, partition, interval):

    """
    Return a field dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.
    """

    # The fields file is opened before the fields index file.

    field_writer = FieldWriter(open(join(pathname, "fields-%s" % partition), "wb"))
    field_index_writer = FieldIndexWriter(open(join(pathname, "fields_index-%s" % partition), "wb"))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
paul@13	1389
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.
    """

    # Four files are opened for reading: terms, terms index, positions and
    # positions index, in that order.

    info_reader = TermReader(open(join(pathname, "terms-%s" % partition), "rb"))
    index_reader = TermIndexReader(open(join(pathname, "terms_index-%s" % partition), "rb"))

    # The position files are handled together by a position dictionary reader.

    positions_dict_reader = PositionDictionaryReader(
        PositionReader(open(join(pathname, "positions-%s" % partition), "rb")),
        PositionIndexReader(open(join(pathname, "positions_index-%s" % partition), "rb")))

    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
paul@14	1412
def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.
    """

    # The fields file is opened before the fields index file.

    field_reader = FieldReader(open(join(pathname, "fields-%s" % partition), "rb"))
    field_index_reader = FieldIndexReader(open(join(pathname, "fields_index-%s" % partition), "rb"))

    return FieldDictionaryReader(field_reader, field_index_reader)
paul@14	1427
def rename_files(pathname, names, from_partition, to_partition):

    """
    Rename each file under 'pathname' whose name is formed from one of the
    given 'names' and 'from_partition', so that it is labelled with
    'to_partition' instead.
    """

    for name in names:
        source = join(pathname, "%s-%s" % (name, from_partition))
        target = join(pathname, "%s-%s" % (name, to_partition))
        rename(source, target)
paul@14	1431
def rename_term_files(pathname, from_partition, to_partition):

    """
    Rename the files named by TERM_FILENAMES under 'pathname' from
    'from_partition' to 'to_partition'.
    """

    rename_files(pathname, TERM_FILENAMES,
                 from_partition, to_partition)
paul@14	1434
def rename_field_files(pathname, from_partition, to_partition):

    """
    Rename the files named by FIELD_FILENAMES under 'pathname' from
    'from_partition' to 'to_partition'.
    """

    rename_files(pathname, FIELD_FILENAMES,
                 from_partition, to_partition)
paul@14	1437
def remove_files(pathname, names, partition):

    """
    Remove each file under 'pathname' whose name is formed from one of the
    given 'names' and the given 'partition'.
    """

    for name in names:
        target = join(pathname, "%s-%s" % (name, partition))
        remove(target)
paul@14	1441
def remove_term_files(pathname, partition):

    """
    Remove the files named by TERM_FILENAMES under 'pathname' for the given
    'partition'.
    """

    remove_files(pathname, TERM_FILENAMES,
                 partition)
paul@14	1444
paul@14	1445	def remove_field_files(pathname, partition):
paul@20	1446	remove_files(pathname, FIELD_FILENAMES, partition)
paul@14	1447
paul@8	1448	# High-level classes.
paul@8	1449
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Initialise the writer for the index at 'pathname' using the given
        indexing 'interval' and 'doc_interval', flushing accumulated data to a
        new partition whenever 'flush_interval' positions or fields have been
        added (or only upon closing if 'flush_interval' is zero or None).
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # The numbers of the next term and field partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Accumulated data: term -> docnum -> positions, and docnum -> fields.

        self.terms = {}
        self.docs = {}

        # The numbers of positions and fields added since the last flush.

        self.position_counter = 0
        self.field_counter = 0

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.
        """

        self.terms.setdefault(term, {}).setdefault(docnum, []).append(position)

        self.position_counter += 1
        if self.flush_interval and self.position_counter >= self.flush_interval:
            self.flush_terms()
            self.position_counter = 0

    def add_field(self, docnum, identifier, value):

        """
        Add for the document with the given 'docnum' a field having the given
        'identifier' and 'value'.
        """

        self.docs.setdefault(docnum, []).append((identifier, value))

        self.field_counter += 1
        if self.flush_interval and self.field_counter >= self.flush_interval:
            self.flush_fields()
            self.field_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        # Get the terms in order. Terms are unique, so the position
        # dictionaries are never compared when sorting.

        terms = sorted(self.terms.items())

        dict_writer = self.get_term_writer()

        # Close the partition files even if writing fails.

        try:
            for term, doc_positions in terms:
                dict_writer.write_term_positions(term, list(doc_positions.items()))
        finally:
            dict_writer.close()

        # Start a new, empty partition.

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        # Get the documents in order.

        docs = sorted(self.docs.items())

        field_dict_writer = self.get_field_writer()

        # Close the partition files even if writing fails.

        try:
            for docnum, fields in docs:
                field_dict_writer.write_fields(docnum, fields)
        finally:
            field_dict_writer.close()

        # Start a new, empty partition.

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any remaining terms and fields."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
paul@10	1570
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        """
        Open the term and field dictionaries of the "merged" partition found
        under the given 'pathname'.
        """

        partition = "merged"
        self.dict_reader = get_term_reader(pathname, partition)
        self.field_dict_reader = get_field_reader(pathname, partition)

    def find_positions(self, term):

        "Return the term dictionary's positions result for the given 'term'."

        positions = self.dict_reader.find_positions(term)
        return positions

    def get_frequency(self, term):

        "Return the term dictionary's frequency result for the given 'term'."

        frequency = self.dict_reader.get_frequency(term)
        return frequency

    def get_document_frequency(self, term):

        "Return the term dictionary's document frequency result for the given 'term'."

        doc_frequency = self.dict_reader.get_document_frequency(term)
        return doc_frequency

    def get_fields(self, docnum):

        "Return the field dictionary's result for the document with the given 'docnum'."

        fields = self.field_dict_reader.get_fields(docnum)
        return fields

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        for reader in (self.dict_reader, self.field_dict_reader):
            reader.close()
paul@10	1594
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index residing in the directory at 'pathname'."

        self.pathname = pathname

        # The most recently obtained reader and writer, if any.

        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'. The index directory is created if
        it does not already exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, merging any partitions first. OSError
        is raised if the index directory does not exist.

        NOTE: 'partition' is accepted for compatibility but the reader always
        uses the "merged" partition.
        """

        # Check for the directory before merging, since merging lists it.

        self._check_pathname()

        # Ensure that only one partition exists.

        self.merge_terms()
        self.merge_fields()

        return self._get_reader(partition)

    def _get_reader(self, partition):

        "Return a reader for the index without merging partitions."

        self._check_pathname()

        self.reader = IndexReader(self.pathname)
        return self.reader

    def _check_pathname(self):

        "Raise OSError if the index directory does not exist."

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval'.
        """

        def open_writer(partition):
            return get_term_writer(self.pathname, partition, interval, doc_interval)

        self._merge_partitions("terms-", get_term_reader, open_writer,
            TermDictionaryMerger, rename_term_files, remove_term_files)

    def merge_fields(self, interval=FIELD_INTERVAL):

        "Merge field dictionaries using the given indexing 'interval'."

        def open_writer(partition):
            return get_field_writer(self.pathname, partition, interval)

        self._merge_partitions("fields-", get_field_reader, open_writer,
            FieldDictionaryMerger, rename_field_files, remove_field_files)

    def _merge_partitions(self, prefix, open_reader, open_writer, merger_class,
        rename_partition, remove_partition):

        """
        Combine all partitions whose principal file has a name starting with
        'prefix' into a single partition labelled "merged". Readers are obtained
        using 'open_reader' (called with the pathname and partition), the
        writer using 'open_writer' (called with the partition), the merging is
        done by an instance of 'merger_class', and partition files are renamed
        and removed using 'rename_partition' and 'remove_partition'.
        """

        readers = []
        partitions = set()

        # Discover partitions from the names of their principal files.

        for filename in listdir(self.pathname):
            if filename.startswith(prefix):
                partition = filename[len(prefix):]
                readers.append(open_reader(self.pathname, partition))
                partitions.add(partition)

        # Write directly to a dictionary.

        if len(readers) > 1:

            # Move any existing merged partition out of the way of the writer.

            if "merged" in partitions:
                rename_partition(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                partitions.add("old-merged")

            writer = open_writer("merged")
            merger = merger_class(writer, readers)
            merger.merge()
            merger.close()

            # Remove old files.

            for partition in partitions:
                remove_partition(self.pathname, partition)

        elif len(readers) == 1:

            # Nothing is read from a lone partition: release its files before
            # possibly renaming them.

            readers[0].close()

            partition = list(partitions)[0]
            if partition != "merged":
                rename_partition(self.pathname, partition, "merged")

    def close(self):

        "Close and discard the reader and writer, if present."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	1720
paul@0	1721	# vim: tabstop=4 expandtab shiftwidth=4