iixr (annotate iixr.py in dafcd23d5fa9)

iixr

Annotated iixr.py

33:dafcd23d5fa9

2009-09-09

Paul Boddie

Attempted to fix document position merging.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@19	21	from os import dup, fdopen # independent iterator access to files
paul@12	22	from os import listdir, mkdir # index and partition discovery
paul@14	23	from os import remove, rename # partition manipulation
paul@7	24	from os.path import exists, join
paul@2	25	from os.path import commonprefix # to find common string prefixes
paul@3	26	from bisect import bisect_right # to find terms in the dictionary index
paul@12	27	from bisect import insort_right # to maintain a sorted list of data for merging
paul@10	28	import bz2, zlib # for field compression
paul@33	29	from itermerge import itermerge
paul@2	30
# Make the name 'set' available on Python versions without the built-in type
# by falling back to the Set class of the 'sets' module.

try:
    set
except NameError:
    from sets import Set as set

# Constants.

# Interval and threshold settings. Their consumers are not all visible in this
# part of the file; presumably the *_INTERVAL values are passed as the
# 'interval' argument of the dictionary writers - TODO confirm against callers.

TERM_INTERVAL = 100
DOCUMENT_INTERVAL = 100
FIELD_INTERVAL = 100
FLUSH_INTERVAL = 10000

# Names of the files making up the term and field data (presumably within an
# index partition directory - verify against the code that opens them).

TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
FIELD_FILENAMES = "fields", "fields_index"

# Compression functions, tried in this order by FileWriter.write_string, each
# paired with the one-character flag written before the data. The flag is used
# by FileReader.read_string to look up the matching decompression function.

compressors = [("b", bz2.compress), ("z", zlib.compress)]
decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}
paul@10	48
paul@0	49	# Foundation classes.
paul@0	50
class File:

    "A basic file abstraction wrapping an already opened file object."

    def __init__(self, f):

        "Initialise the abstraction with the file object 'f'."

        self.f = f
        self.reset()

    def reset(self):

        """
        Reset the state of the reader or writer between records. This base
        implementation keeps no state; subclasses override it.
        """

    def rewind(self):

        "Seek to the start of the file and reset the record state."

        self.f.seek(0)
        self.reset()

    def close(self):

        "Close the file object, doing nothing if it was already closed."

        if self.f is None:
            return

        self.f.close()
        self.f = None
paul@0	73
class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding: groups of
        seven bits are written from least to most significant, and the top bit
        of each byte is set when a further byte follows.

        Raise ValueError if 'number' is negative.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Special case: one byte containing a 7-bit number (including zero,
        # which the loop below would otherwise encode as no bytes at all).

        if number < 128:
            self.f.write(chr(number))
            return

        # Write the number from least to most significant digits.

        encoded = []

        while number != 0:
            lsd = number & 127
            number = number >> 7

            # Flag the presence of a following byte.

            if number != 0:
                lsd |= 128
            encoded.append(chr(lsd))

        self.f.write("".join(encoded))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file, recording its length and compressing the string
        if 'compress' is set to a true value.

        When 'compress' is true, a one-character flag precedes the length: the
        flag of the compressor used, or "-" if no compressor made the data
        shorter.
        """

        # Convert Unicode objects to strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        # Compress the string if requested.

        if compress:
            for flag, fn in compressors:
                cs = fn(s)

                # Take the first string shorter than the original.

                if len(cs) < len(s):
                    s = cs
                    break
            else:
                flag = "-"

            # Record whether compression was used.

            self.f.write(flag)

        # Write the length of the data before the data itself.

        self.write_number(len(s))
        self.f.write(s)
paul@2	142
class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a variable length number from the file, where each byte provides
        seven bits (least significant group first) and a set top bit indicates
        that another byte follows.

        Raise EOFError if the file ends before the number is complete.
        """

        number = 0
        shift = 0

        while 1:
            byte = self.f.read(1)
            if not byte:
                raise EOFError

            value = ord(byte)

            # Add the seven data bits at the current position.

            number += (value & 127) << shift
            shift += 7

            # A clear top bit marks the final byte.

            if not value & 128:
                return number

    def read_string(self, decompress=0):

        """
        Read a string from the file, decompressing the stored data if
        'decompress' is set to a true value. The result is a Unicode object
        decoded from UTF-8.
        """

        # A compression flag is only present where compression was requested.

        flag = "-"
        if decompress:
            flag = self.f.read(1)

        # The length precedes the data itself.

        data = self.f.read(self.read_number())

        # Perform decompression if applicable.

        if flag != "-":
            data = decompressors[flag](data)

        # Convert strings to Unicode objects.

        return unicode(data, "utf-8")
paul@2	197
paul@9	198	# Specific classes for storing term and position information.
paul@0	199
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number so that the next delta is absolute."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.
        Return the offset of the written record.

        The document number is stored as a delta from the previously written
        one, and the positions are stored as deltas in ascending order. The
        'positions' collection itself is left unmodified.

        Raise ValueError if 'docnum' is less than the previous document number.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Record the offset of this record.

        offset = self.f.tell()

        # Write the document number delta.

        self.write_number(docnum - self.last_docnum)

        # Write the number of positions.

        self.write_number(len(positions))

        # Make sure that the positions are sorted, working on a copy so that
        # the caller's collection is not reordered (and need not be a list).

        ordered = list(positions)
        ordered.sort()

        # Write the position deltas.

        last = 0
        for position in ordered:
            self.write_number(position - last)
            last = position

        self.last_docnum = docnum

        return offset
paul@0	244
class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Forget the last document number so that the next delta is absolute."

        self.last_docnum = 0

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        # Read the document number delta and add it to the last number.

        self.last_docnum += self.read_number()

        # Read the number of positions.

        npositions = self.read_number()

        # Read the position deltas, adding each previous position to get the
        # appropriate collection of absolute positions.

        i = 0
        last = 0
        positions = []

        while i < npositions:
            last += self.read_number()
            positions.append(last)
            i += 1

        return self.last_docnum, positions

    def read_term_positions(self, offset, count):

        """
        Read all positions from 'offset', seeking to that position in the file
        before reading. The number of documents available for reading is limited
        to 'count'.

        Return a PositionIterator using its own file object.
        """

        # Open the file again by name where possible. A descriptor obtained
        # using dup shares its file offset with the original descriptor, so
        # iterators made that way disturb each other when used alternately.
        # NOTE(review): this assumes the file has not been renamed or replaced
        # since it was opened - confirm against the partition handling code.

        name = getattr(self.f, "name", None)

        if name and exists(name):
            f = open(name, "rb")

        # Otherwise, fall back to duplicating the file handle.

        else:
            f = fdopen(dup(self.f.fileno()), "rb")

        f.seek(offset)
        return PositionIterator(f, count)
paul@19	291
class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        "Make the next document number and position offset deltas absolute."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write an index entry consisting of the delta-encoded 'docnum', the
        delta-encoded 'pos_offset' and the document 'count' to the position
        index file. Return the offset at which the entry starts.
        """

        entry_offset = self.f.tell()

        # Document number delta.

        docnum_delta = docnum - self.last_docnum
        self.write_number(docnum_delta)
        self.last_docnum = docnum

        # Position file offset delta.

        pos_offset_delta = pos_offset - self.last_pos_offset
        self.write_number(pos_offset_delta)
        self.last_pos_offset = pos_offset

        # Document count for the section.

        self.write_number(count)

        return entry_offset
paul@19	326
class PositionIndexReader(FileReader):

    "Reading position index information from files."

    def reset(self):

        "Make the next document number and position offset deltas absolute."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read a document number, a position file offset for the position index
        file, and the number of documents in a section of that file.
        """

        # Read the document number delta.

        self.last_docnum += self.read_number()

        # Read the offset delta.

        self.last_pos_offset += self.read_number()

        # Read the document count.

        count = self.read_number()

        return self.last_docnum, self.last_pos_offset, count

    def read_term_positions(self, offset, doc_frequency):

        """
        Read all positions from 'offset', seeking to that position in the file
        before reading. The number of documents available for reading is limited
        to 'doc_frequency'.

        Return a PositionIndexIterator using its own file object.
        """

        # Open the file again by name where possible. A descriptor obtained
        # using dup shares its file offset with the original descriptor, so
        # iterators made that way disturb each other when used alternately.
        # NOTE(review): this assumes the file has not been renamed or replaced
        # since it was opened - confirm against the partition handling code.

        name = getattr(self.f, "name", None)

        if name and exists(name):
            f = open(name, "rb")

        # Otherwise, fall back to duplicating the file handle.

        else:
            f = fdopen(dup(self.f.fileno()), "rb")

        f.seek(offset)
        return PositionIndexIterator(f, doc_frequency)
paul@19	369
paul@21	370	# Iterators for position-related files.
paul@21	371
class IteratorBase:

    "Common support for iterators limited to a number of documents."

    def __init__(self, count):

        "Initialise the iterator with a limit of 'count' documents."

        self.replenish(count)

    def replenish(self, count):

        "Set a new limit of 'count' documents and restart the tally."

        self.read_documents = 0
        self.count = count

    def __len__(self):

        "Return the current document limit."

        return self.count

    def sort(self):

        "Do nothing: stored document positions are already sorted."

    def __iter__(self):

        "Return this object as its own iterator."

        return self
paul@21	389
class PositionIterator(PositionReader, IteratorBase):

    "Iterating over document positions."

    def __init__(self, f, count):

        "Read from the file object 'f', providing at most 'count' documents."

        PositionReader.__init__(self, f)
        IteratorBase.__init__(self, count)

    def next(self):

        """
        Return the document number and positions for the next document, raising
        StopIteration once the document limit has been reached.
        """

        if self.read_documents >= self.count:
            raise StopIteration

        self.read_documents += 1
        return self.read_positions()
paul@21	407
class PositionIndexIterator(PositionIndexReader, IteratorBase):

    "Iterating over position index entries."

    def __init__(self, f, count):

        "Read from the file object 'f', covering at most 'count' documents."

        PositionIndexReader.__init__(self, f)
        IteratorBase.__init__(self, count)
        self.section_count = 0

    def next(self):

        """
        Return the next index entry as a tuple of document number, position
        file offset and section document count. StopIteration is raised once
        the sections already returned account for the document limit.
        """

        # Account for the documents in the previously returned section.

        self.read_documents += self.section_count

        if self.read_documents >= self.count:
            raise StopIteration

        entry = self.read_positions()
        self.section_count = entry[2]
        return entry
paul@19	427
class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Use 'position_writer' for position records and 'position_index_writer'
        for index entries, with one index entry per 'interval' documents.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def _write_index_entry(self, docnum, pos_offset, count, index_offset):

        """
        Write an index entry for the section starting with 'docnum' at
        'pos_offset' and holding 'count' documents. Return 'index_offset' if
        already set, or else the offset of the entry just written.
        """

        entry_offset = self.position_index_writer.write_positions(docnum, pos_offset, count)

        if index_offset is None:
            return entry_offset
        return index_offset

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file, sorting the collection first.

        An index entry is written for each section of up to 'interval'
        documents.

        Return a tuple containing the offset of the first index entry written
        (None if there were no documents), the frequency (number of positions),
        and the document frequency (number of documents) for the term involved.
        """

        # Start both writers from a clean state.

        self.position_writer.reset()
        self.position_index_writer.reset()

        index_offset = None
        frequency = 0

        # Details of the section currently being written.

        section_docnum = None
        section_offset = None
        pending = 0

        doc_positions.sort()

        for docnum, positions in doc_positions:
            pos_offset = self.position_writer.write_positions(docnum, positions)

            # The first record of a section provides the index entry details.

            if section_offset is None:
                section_offset = pos_offset
                section_docnum = docnum

            frequency += len(positions)
            pending += 1

            if pending != self.interval:
                continue

            # A complete section: write its index entry and start afresh.

            index_offset = self._write_index_entry(section_docnum, section_offset, self.interval, index_offset)

            section_offset = None
            section_docnum = None
            pending = 0

            # Reset the position writer so that position readers accessing
            # a section start with the correct document number.

            self.position_writer.reset()

        # Write an index entry for any remaining, incomplete section.

        if section_offset is not None:
            index_offset = self._write_index_entry(section_docnum, section_offset, pending, index_offset)

        return index_offset, frequency, len(doc_positions)

    def close(self):

        "Close the position writer and then the position index writer."

        self.position_writer.close()
        self.position_index_writer.close()
paul@19	513
class PositionDictionaryReader:

    "Reading position dictionaries."

    def __init__(self, position_reader, position_index_reader):

        "Use 'position_reader' and 'position_index_reader' to access the data."

        self.position_reader = position_reader
        self.position_index_reader = position_index_reader

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator over the position dictionary entries found via the
        index entry at 'offset', limited to 'doc_frequency' documents.
        """

        readers = self.position_reader, self.position_index_reader
        return PositionDictionaryIterator(readers[0], readers[1], offset, doc_frequency)

    def close(self):

        "Close the position reader and then the position index reader."

        for reader in (self.position_reader, self.position_index_reader):
            reader.close()
paul@19	535
class PositionDictionaryIterator:

    """
    Iteration over position dictionary entries.

    An index iterator supplies (document number, position file offset, section
    document count) entries, and a position file iterator supplies the actual
    (document number, positions) records for the current section.
    """

    def __init__(self, position_reader, position_index_reader, offset, doc_frequency):

        """
        Initialise the iterator using 'position_reader' for position records
        and 'position_index_reader' for index entries, starting from the index
        file 'offset' and limited to 'doc_frequency' documents.

        The first index entry is read immediately, so StopIteration from the
        index iterator is not handled here.
        """

        self.position_reader = position_reader
        self.doc_frequency = doc_frequency
        self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)

        # Remember the last values: a record read by from_document for a
        # document beyond the requested one, to be returned by next.

        self.found_docnum, self.found_positions = None, None

        # Maintain state for the next index entry, if read.

        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

        # Initialise the current index entry and current position file iterator.

        self._next_section()
        self._init_section()

    def __len__(self):

        "Return the document frequency given on initialisation."

        return self.doc_frequency

    def sort(self):

        "Do nothing (compare IteratorBase.sort, where stored data is already sorted)."

        pass

    def __iter__(self):

        "Return this object as its own iterator."

        return self

    def next(self):

        """
        Attempt to get the next document record from the section in the
        positions file.

        StopIteration propagates from the index iterator when no further
        section is available.
        """

        # Return any visited but unrequested record.

        if self.found_docnum is not None:
            t = self.found_docnum, self.found_positions
            self.found_docnum, self.found_positions = None, None
            return t

        # Or search for the next record.

        while 1:

            # Either return the next record.

            try:
                return self.iterator.next()

            # Or, where a section is finished, get the next section and try again.

            except StopIteration:

                # Where a section follows, update the index iterator, but keep
                # reading using the same file iterator (since the data should
                # just follow on from the last section).

                self._next_section()
                self.iterator.replenish(self.section_count)

                # Reset the state of the iterator to make sure that document
                # numbers are correct.

                self.iterator.reset()

    def from_document(self, docnum):

        """
        Attempt to navigate to a positions entry for the given 'docnum',
        returning the positions for 'docnum', or None otherwise.
        """

        # Return any unrequested document positions.
        # NOTE(review): the remembered record is not cleared here, so a later
        # call to next will return the same document again - confirm that this
        # is intended.

        if docnum == self.found_docnum:
            return self.found_positions

        # Read ahead in the index until the next entry refers to a document
        # later than the desired document.

        try:
            if self.next_docnum is None:
                self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()

            # Read until the next entry is after the desired document number,
            # or until the end of the results.

            while self.next_docnum <= docnum:

                # Make the read-ahead entry current, reading a further entry
                # only if the section starts before the desired document.

                self._next_read_section()
                if self.docnum < docnum:
                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
                else:
                    break

        # An exhausted index leaves the current section as the last one.

        except StopIteration:
            pass

        # Navigate in the position file to the document. This creates a new
        # position file iterator at the start of the current section.

        self._init_section()

        try:
            while 1:
                found_docnum, found_positions = self.iterator.next()

                # Return the desired document positions or None (retaining the
                # positions for the document immediately after).

                if docnum == found_docnum:
                    return found_positions
                elif docnum < found_docnum:
                    self.found_docnum, self.found_positions = found_docnum, found_positions
                    return None

        # The section ended without reaching the desired document.

        except StopIteration:
            return None

    # Internal methods.

    def _next_section(self):

        """
        Attempt to get the next section in the index, using any index entry
        already read ahead in preference to reading from the index iterator.
        """

        if self.next_docnum is None:
            self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
        else:
            self._next_read_section()

    def _next_read_section(self):

        """
        Make the next index entry the current one without reading from the
        index.
        """

        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def _init_section(self):

        "Initialise the iterator for the section in the position file."

        self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
paul@0	684
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Forget the last term and offset used for prefix and delta encoding."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. Return the offset after the term information was
        written to the file.
        """

        # The term is stored as the length of the prefix shared with the
        # previous term followed by the remaining suffix.

        shared = len(commonprefix([self.last_term, term]))

        self.write_number(shared)
        self.write_string(term[shared:])

        # The offset delta, the frequency and the document frequency follow.

        for value in (offset - self.last_offset, frequency, doc_frequency):
            self.write_number(value)

        self.last_term = term
        self.last_offset = offset

        return self.f.tell()
paul@3	726
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Forget the last term and offset used for prefix and delta decoding."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency and its document
        frequency from the term information file, returning them as a tuple.
        """

        # Rebuild the term from the prefix shared with the previous term and
        # the stored suffix.

        prefix = self.last_term[:self.read_number()]
        self.last_term = prefix + self.read_string()

        # Apply the offset delta.

        self.last_offset += self.read_number()

        # The frequency precedes the document frequency.

        frequency = self.read_number()
        doc_frequency = self.read_number()

        return self.last_term, self.last_offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
        permits the scanning for later terms from the specified term.
        """

        self.f.seek(info_offset)

        # Later entries are encoded relative to the given term and offset.

        self.last_term, self.last_offset = term, offset
paul@3	773
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Reset the term state along with the last information file offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' to the term dictionary index file, followed by the
        delta-encoded 'info_offset' in the term information file.
        """

        TermWriter.write_term(self, term, offset, frequency, doc_frequency)

        # The information file offset is stored relative to the previous one.

        info_delta = info_offset - self.last_info_offset
        self.write_number(info_delta)
        self.last_info_offset = info_offset
paul@3	796
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Reset the term state along with the last information file offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency, its document
        frequency and a term information file offset from the term dictionary
        index file, returning them as a tuple.
        """

        details = TermReader.read_term(self)
        term, offset, frequency, doc_frequency = details

        # Apply the information file offset delta.

        self.last_info_offset += self.read_number()

        return term, offset, frequency, doc_frequency, self.last_info_offset
paul@3	820
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Use 'info_writer' for term information, 'index_writer' for the term
        index and 'position_dict_writer' for positions, with one index entry
        written per 'interval' terms.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval

        # The number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file, also writing an index entry for the first term
        and for every subsequent term at a multiple of the interval.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)

        needs_index_entry = self.entry % self.interval == 0

        if needs_index_entry:
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        details = self.position_dict_writer.write_term_positions(doc_positions)
        offset, frequency, doc_frequency = details
        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        "Close the information, index and position dictionary writers in turn."

        for writer in (self.info_writer, self.index_writer, self.position_dict_writer):
            writer.close()
paul@3	862
paul@3	863	class TermDictionaryReader:
paul@3	864
paul@3	865	"Reading term dictionaries."
paul@3	866
paul@22	867	def __init__(self, info_reader, index_reader, position_dict_reader):
paul@3	868	self.info_reader = info_reader
paul@3	869	self.index_reader = index_reader
paul@22	870	self.position_dict_reader = position_dict_reader
paul@3	871
paul@3	872	self.terms = []
paul@3	873	try:
paul@3	874	while 1:
paul@3	875	self.terms.append(self.index_reader.read_term())
paul@3	876	except EOFError:
paul@3	877	pass
paul@3	878
paul@3	879	# Large numbers for ordering purposes.
paul@3	880
paul@28	881	if self.terms:
paul@28	882	self.max_offset = self.terms[-1][1] + 1
paul@28	883	else:
paul@28	884	self.max_offset = None
paul@3	885
paul@25	886	def _find_closest_entry(self, term):
paul@3	887
paul@11	888	"""
paul@25	889	Find the offsets and frequencies of 'term' from the term dictionary or
paul@25	890	the closest term starting with the value of 'term'.
paul@25	891
paul@25	892	Return the closest index entry consisting of a term, the position file
paul@25	893	offset, the term frequency, the document frequency, and the term details
paul@25	894	file offset.
paul@11	895	"""
paul@3	896
paul@14	897	i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
paul@3	898
paul@3	899	# Get the entry position providing the term or one preceding it.
paul@25	900	# If no entry precedes the requested term, return the very first entry
paul@25	901	# as the closest.
paul@3	902
paul@3	903	if i == -1:
paul@25	904	return self.terms[0]
paul@25	905	else:
paul@25	906	return self.terms[i]
paul@25	907
paul@25	908	def _find_closest_term(self, term):
paul@25	909
paul@25	910	"""
paul@25	911	Find the offsets and frequencies of 'term' from the term dictionary or
paul@25	912	the closest term starting with the value of 'term'.
paul@25	913
paul@25	914	Return the closest term (or the term itself), the position file offset,
paul@25	915	the term frequency, the document frequency, and the term details file
paul@25	916	offset (or None if the reader is already positioned).
paul@25	917	"""
paul@25	918
paul@25	919	found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
paul@3	920
paul@19	921	# Where the term is found immediately, return the offset and
paul@25	922	# frequencies. If the term does not appear, return the details of the
paul@25	923	# closest entry.
paul@25	924
paul@25	925	if term <= found_term:
paul@25	926	return found_term, offset, frequency, doc_frequency, info_offset
paul@3	927
paul@3	928	# Otherwise, seek past the index term's entry in the information file
paul@3	929	# and scan for the desired term.
paul@3	930
paul@3	931	else:
paul@3	932	self.info_reader.go_to_term(found_term, offset, info_offset)
paul@3	933	try:
paul@3	934	while term > found_term:
paul@19	935	found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
paul@3	936	except EOFError:
paul@3	937	pass
paul@3	938
paul@25	939	return found_term, offset, frequency, doc_frequency, None
paul@25	940
def _find_term(self, term):

    """
    Return a tuple of (position file offset, term frequency, document
    frequency) for 'term', or None if the closest term found in the term
    dictionary is not 'term' itself.
    """

    closest, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)

    # Only an exact match counts: a merely close term is not a result.

    if term != closest:
        return None

    return offset, frequency, doc_frequency
paul@25	956
def _get_positions(self, offset, doc_frequency):

    """
    Return whatever the position dictionary reader provides for the given
    position file 'offset' and 'doc_frequency'.
    """

    reader = self.position_dict_reader
    return reader.read_term_positions(offset, doc_frequency)
paul@25	959
paul@25	960	# Sequential access methods.
paul@3	961
def rewind(self):

    "Rewind the term information reader ready for sequential reading."

    reader = self.info_reader
    reader.rewind()
paul@12	964
def read_term(self):

    """
    Read the next record from the term information reader and return a
    tuple of (term, frequency, document frequency, positions), where the
    positions are obtained using the record's position file offset.
    """

    record = self.info_reader.read_term()
    term, offset, frequency, doc_frequency = record

    # The offset is only needed to obtain the positions; it is not returned.

    return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency)
paul@12	975
paul@25	976	# Query methods.
paul@25	977
def find_terms(self, term):

    "Return a list of all terms whose values start with the value of 'term'."

    current, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
    reader = self.info_reader

    # An information file offset means that the reader still has to be moved
    # to the closest term; None means that it has already been positioned.

    if info_offset is not None:
        reader.go_to_term(current, offset, info_offset)

    # Collect terms for as long as they carry the requested prefix, stopping
    # at the first term without it or at the end of the information file.

    matches = []

    try:
        while 1:
            if not current.startswith(term):
                break
            matches.append(current)
            current, offset, frequency, doc_frequency = reader.read_term()

    except EOFError:
        pass

    return matches
paul@25	1004
def find_positions(self, term):

    """
    Return the documents and positions at which the given 'term' is found,
    or None if the term is not present in the term dictionary.
    """

    details = self._find_term(term)

    if details is None:
        return None

    offset, frequency, doc_frequency = details
    return self._get_positions(offset, doc_frequency)
paul@5	1015
def get_frequency(self, term):

    """
    Return the frequency of the given 'term', or None if the term is not
    present in the term dictionary.
    """

    details = self._find_term(term)

    if details is None:
        return None

    offset, frequency, doc_frequency = details
    return frequency
paul@11	1026
def get_document_frequency(self, term):

    """
    Return the document frequency of the given 'term', or None if the term
    is not present in the term dictionary.
    """

    details = self._find_term(term)

    if details is None:
        return None

    offset, frequency, doc_frequency = details
    return doc_frequency
paul@19	1037
def close(self):

    """
    Close the term information reader, the term index reader and the
    position dictionary reader, in that order.
    """

    for reader in (self.info_reader, self.index_reader, self.position_dict_reader):
        reader.close()
paul@3	1042
paul@9	1043	# Specific classes for storing document information.
paul@9	1044
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Forget the last document number so that the next delta starts from zero."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write a record for 'docnum' consisting of the document number delta,
        the number of 'fields', and then each field as an integer identifier
        followed by its string value.

        Return the file offset at which the record starts.
        """

        start = self.f.tell()

        # Document numbers are stored relative to the previous record.

        delta = docnum - self.last_docnum
        self.write_number(delta)

        # The field count lets a reader know how many pairs follow.

        self.write_number(len(fields))

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
        return start
paul@8	1078
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Forget the last document number so that the next delta starts from zero."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read the next record from the file, returning a tuple containing the
        document number and a list of field (identifier, value) pairs.
        """

        # The stored number is a delta relative to the previous record.

        self.last_docnum += self.read_number()

        # The field count precedes the fields themselves.

        remaining = self.read_number()
        pairs = []

        while remaining > 0:
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            pairs.append((identifier, value))
            remaining -= 1

        return self.last_docnum, pairs

    def read_document_fields(self, docnum, offset):

        """
        Read the fields stored at the given 'offset', treating the record
        found there as belonging to 'docnum'. Return a tuple containing
        'docnum' and the list of fields.

        Since the reader remembers 'docnum' afterwards, later documents can
        be scanned for using read_fields.
        """

        self.f.seek(offset)

        # After a seek, the delta read from the file is relative to an unknown
        # previous record, so the computed document number is discarded in
        # favour of the one supplied by the caller.

        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
paul@12	1126
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Forget the last document number and offset used for delta encoding."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write an index entry for 'docnum' whose fields are stored at the given
        'offset' in the fields file. Both values are written as deltas
        relative to the previous entry.
        """

        docnum_delta = docnum - self.last_docnum
        offset_delta = offset - self.last_offset

        self.write_number(docnum_delta)
        self.write_number(offset_delta)

        # Remember the absolute values for the next entry's deltas.

        self.last_docnum, self.last_offset = docnum, offset
paul@9	1149
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Forget the last document number and offset used for delta decoding."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read the next index entry, returning a tuple containing the document
        number and the fields file offset.
        """

        # Both stored values are deltas: accumulate them onto the running
        # totals, document number first and offset second.

        docnum_delta = self.read_number()
        self.last_docnum += docnum_delta

        offset_delta = self.read_number()
        self.last_offset += offset_delta

        return self.last_docnum, self.last_offset
paul@9	1168
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with a 'field_writer' for document fields, a
        'field_index_writer' for index entries, and the 'interval' (in
        documents) at which index entries are written.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # The number of documents written so far.

        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Only every 'interval'th document, starting with the first, gets an
        # entry in the index.

        needs_index_entry = (self.entry % self.interval == 0)

        if needs_index_entry:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close the field writer and then the field index writer."

        for writer in (self.field_writer, self.field_index_writer):
            writer.close()
paul@9	1193
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with a 'field_reader' for document fields and a
        'field_index_reader' whose entries are all read into memory
        immediately.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Load every (document number, offset) index entry until the index
        # reader signals the end of its file.

        entries = []
        self.docs = entries

        try:
            while 1:
                entries.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # The offset of the last entry is used in search keys so that a key
        # for a document number orders after any entry for that number.

        if entries:
            self.max_offset = entries[-1][1]
        else:
            self.max_offset = None

    def rewind(self):

        "Rewind the field reader ready for sequential reading."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        reader = self.field_reader
        return reader.read_fields()

    def get_fields(self, docnum):

        """
        Return the fields of the document with the given 'docnum', or None if
        no such document is found.
        """

        # Count the index entries at or before the requested document.

        preceding = bisect_right(self.docs, (docnum, self.max_offset))

        # No entry provides the document or one preceding it.

        if preceding == 0:
            return None

        start_docnum, offset = self.docs[preceding - 1]

        # Start reading from the indexed document in the fields file.

        reader = self.field_reader
        current, fields = reader.read_document_fields(start_docnum, offset)

        # Scan forward for the document, if necessary, keeping the last record
        # read should the file end first.

        try:
            while current < docnum:
                current, fields = reader.read_fields()
        except EOFError:
            pass

        # Only the fields of the requested document are of interest.

        if current != docnum:
            return None

        return fields

    def close(self):

        "Close the field reader and then the field index reader."

        for reader in (self.field_reader, self.field_index_reader):
            reader.close()
paul@8	1260
paul@12	1261	# Dictionary merging classes.
paul@12	1262
class Merger:

    "Merge files."

    def __init__(self, writer, readers):

        """
        Initialise the merger with the 'writer' receiving the merged data and
        the sequence of 'readers' providing the data to be merged.
        """

        self.writer = writer
        self.readers = readers

    def close(self):

        """
        Close the readers and then the writer.

        The writer is closed even if closing a reader raises an exception, so
        that the output files are not left open; the exception still
        propagates to the caller.
        """

        try:
            for reader in self.readers:
                reader.close()
        finally:
            self.writer.close()
paul@13	1275
class TermDictionaryMerger(Merger):

    "Merge term and position files."

    def _read_next(self, entries, partition):

        """
        Read the next term from the reader for the given 'partition' and
        insert it into the sorted 'entries' list as a (term, positions,
        partition) tuple. An exhausted reader contributes nothing.
        """

        try:
            term, frequency, doc_frequency, positions = self.readers[partition].read_term()
            insort_right(entries, (term, positions, partition))
        except EOFError:
            pass

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the writer.
        """

        entries = []

        # Prime the sorted list with the first term of each reader.

        for partition, reader in enumerate(self.readers):
            reader.rewind()
            self._read_next(entries, partition)

        # Repeatedly take the lowest term, combine the positions of every
        # reader currently offering that term, and write the result.

        while entries:
            term, doc_positions, first_partition = entries[0]
            consumed = [first_partition]

            # Equal terms are adjacent in the sorted list: gather them up to
            # the first entry having a different term.

            for other_term, other_doc_positions, other_partition in entries[1:]:
                if other_term != term:
                    break
                doc_positions = itermerge(doc_positions, other_doc_positions)
                consumed.append(other_partition)

            # Write the combined term details.

            self.writer.write_term_positions(term, doc_positions)

            # Drop the written entries and replace them with the next terms
            # from the readers that supplied them.

            del entries[:len(consumed)]

            for partition in consumed:
                self._read_next(entries, partition)
paul@12	1337
class FieldDictionaryMerger(Merger):

    "Merge field files."

    def _read_next(self, entries, partition):

        """
        Read the next document from the reader for the given 'partition' and
        insert it into the sorted 'entries' list as a (docnum, fields,
        partition) tuple. An exhausted reader contributes nothing.
        """

        try:
            docnum, fields = self.readers[partition].read_fields()
            insort_right(entries, (docnum, fields, partition))
        except EOFError:
            pass

    def merge(self):

        """
        Merge fields from the readers, sending them to the writer.
        """

        entries = []

        # Prime the sorted list with the first document of each reader.

        for partition, reader in enumerate(self.readers):
            reader.rewind()
            self._read_next(entries, partition)

        # Write the documents out in order. Since fields from one document
        # should only appear in a single partition, only the partition that
        # supplied the written document needs to be read again.

        while entries:
            docnum, fields, partition = entries[0]

            # Write the document's field details.

            self.writer.write_fields(docnum, fields)

            # Replace the written entry with its reader's next document.

            del entries[0]
            self._read_next(entries, partition)
paul@13	1381
paul@13	1382	# Utility functions.
paul@13	1383
def get_term_writer(pathname, partition, interval, doc_interval):

    """
    Return a term dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval' for terms and 'doc_interval' for document position records.

    Four files are opened for binary writing: terms, terms_index, positions
    and positions_index, each suffixed with the partition label.
    """

    # Term information and its index.

    info_writer = TermWriter(open(join(pathname, "terms-%s" % partition), "wb"))
    index_writer = TermIndexWriter(open(join(pathname, "terms_index-%s" % partition), "wb"))

    # Positions and their index, combined into a position dictionary.

    positions_writer = PositionWriter(open(join(pathname, "positions-%s" % partition), "wb"))
    positions_index_writer = PositionIndexWriter(open(join(pathname, "positions_index-%s" % partition), "wb"))
    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)

    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
paul@13	1407
def get_field_writer(pathname, partition, interval):

    """
    Return a field dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.

    Two files are opened for binary writing: fields and fields_index, each
    suffixed with the partition label.
    """

    field_writer = FieldWriter(open(join(pathname, "fields-%s" % partition), "wb"))
    field_index_writer = FieldIndexWriter(open(join(pathname, "fields_index-%s" % partition), "wb"))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
paul@13	1423
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.

    Four files are opened for binary reading: terms, terms_index, positions
    and positions_index, each suffixed with the partition label.
    """

    # Term information and its index.

    info_reader = TermReader(open(join(pathname, "terms-%s" % partition), "rb"))
    index_reader = TermIndexReader(open(join(pathname, "terms_index-%s" % partition), "rb"))

    # Positions and their index, combined into a position dictionary.

    positions_reader = PositionReader(open(join(pathname, "positions-%s" % partition), "rb"))
    positions_index_reader = PositionIndexReader(open(join(pathname, "positions_index-%s" % partition), "rb"))
    positions_dict_reader = PositionDictionaryReader(positions_reader, positions_index_reader)

    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
paul@14	1446
def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.

    Two files are opened for binary reading: fields and fields_index, each
    suffixed with the partition label.
    """

    field_reader = FieldReader(open(join(pathname, "fields-%s" % partition), "rb"))
    field_index_reader = FieldIndexReader(open(join(pathname, "fields_index-%s" % partition), "rb"))

    return FieldDictionaryReader(field_reader, field_index_reader)
paul@14	1461
def rename_files(pathname, names, from_partition, to_partition):

    """
    Rename each file under 'pathname' whose base name is given in 'names' and
    whose label is 'from_partition' so that it is labelled 'to_partition'
    instead.
    """

    for name in names:
        source = join(pathname, "%s-%s" % (name, from_partition))
        target = join(pathname, "%s-%s" % (name, to_partition))
        rename(source, target)
paul@14	1465
def rename_term_files(pathname, from_partition, to_partition):

    """
    Rename the term dictionary files under 'pathname' from the label
    'from_partition' to the label 'to_partition'.
    """

    names = TERM_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
paul@14	1468
def rename_field_files(pathname, from_partition, to_partition):

    """
    Rename the field dictionary files under 'pathname' from the label
    'from_partition' to the label 'to_partition'.
    """

    names = FIELD_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
paul@14	1471
def remove_files(pathname, names, partition):

    """
    Remove each file under 'pathname' whose base name is given in 'names' and
    whose label is the given 'partition'.
    """

    for name in names:
        filename = "%s-%s" % (name, partition)
        remove(join(pathname, filename))
paul@14	1475
def remove_term_files(pathname, partition):

    "Remove the term dictionary files under 'pathname' labelled 'partition'."

    names = TERM_FILENAMES
    remove_files(pathname, names, partition)
paul@14	1478
def remove_field_files(pathname, partition):

    "Remove the field dictionary files under 'pathname' labelled 'partition'."

    names = FIELD_FILENAMES
    remove_files(pathname, names, partition)
paul@14	1481
paul@8	1482	# High-level classes.
paul@8	1483
class Document:

    "A container of document information."

    def __init__(self, docnum):

        "Initialise an empty document having the given 'docnum'."

        self.docnum = docnum

        # A list of (identifier, value) pairs.

        self.fields = []

        # A mapping from terms to lists of positions.

        self.terms = {}

    def add_position(self, term, position):

        """
        Record the given 'position' as an occurrence of the given 'term',
        appending it to any positions already recorded for the term.
        """

        try:
            positions = self.terms[term]
        except KeyError:
            positions = self.terms[term] = []

        positions.append(position)

    def add_field(self, identifier, value):

        "Add a field having the given 'identifier' and 'value'."

        text = unicode(value) # convert to string
        self.fields.append((identifier, text))

    def set_fields(self, fields):

        """
        Replace the document's fields with the given 'fields': a list of
        tuples each containing an integer identifier and a string value.
        The list is stored as given, without copying or conversion.
        """

        self.fields = fields
paul@28	1516
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Initialise the writer for the index directory 'pathname', using the
        given indexing 'interval' and 'doc_interval' when creating dictionary
        writers, and flushing to a new partition after every 'flush_interval'
        documents (a false value disables automatic flushing).
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # The labels of the next term and field partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Pending data: term -> {docnum: positions} and docnum -> fields.

        self.terms = {}
        self.docs = {}

        # The number of documents added since the last automatic flush.

        self.doc_counter = 0

    def add_document(self, doc):

        """
        Add the given document 'doc', updating the document counter and flushing
        terms and fields if appropriate.
        """

        for term, positions in doc.terms.items():
            self.terms.setdefault(term, {})[doc.docnum] = positions

        self.docs[doc.docnum] = doc.fields

        self.doc_counter += 1
        if self.flush_interval and self.doc_counter >= self.flush_interval:
            self.flush_terms()
            self.flush_fields()
            self.doc_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        # NOTE(review): the term indexing interval is also used for fields
        # here, whereas Index.merge_fields defaults to FIELD_INTERVAL.

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        # Get the terms in order.

        all_terms = self.terms
        terms = list(all_terms.keys())
        terms.sort()

        dict_writer = self.get_term_writer()

        try:
            for term in terms:

                # Dictionary ordering is arbitrary, so put the (docnum,
                # positions) pairs into document number order explicitly.
                # Presumably the position writer relies on ascending document
                # numbers, as the field writers do for their deltas - the
                # sort is harmless if it does not.

                doc_positions = list(all_terms[term].items())
                doc_positions.sort()
                dict_writer.write_term_positions(term, doc_positions)

        # Release the partition's files even if writing fails.

        finally:
            dict_writer.close()

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        # Get the documents in order.

        docs = list(self.docs.items())
        docs.sort()

        field_dict_writer = self.get_field_writer()

        try:
            for docnum, fields in docs:
                field_dict_writer.write_fields(docnum, fields)

        # Release the partition's files even if writing fails.

        finally:
            field_dict_writer.close()

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any pending terms and fields into new partitions."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
paul@10	1612
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        """
        Open the term and field dictionaries labelled "merged" under the
        given 'pathname'.
        """

        partition = "merged"
        self.dict_reader = get_term_reader(pathname, partition)
        self.field_dict_reader = get_field_reader(pathname, partition)

    def find_terms(self, term):

        "Return the result of the term dictionary's find_terms for 'term'."

        reader = self.dict_reader
        return reader.find_terms(term)

    def find_positions(self, term):

        "Return the result of the term dictionary's find_positions for 'term'."

        reader = self.dict_reader
        return reader.find_positions(term)

    def get_frequency(self, term):

        "Return the result of the term dictionary's get_frequency for 'term'."

        reader = self.dict_reader
        return reader.get_frequency(term)

    def get_document_frequency(self, term):

        "Return the result of the term dictionary's get_document_frequency for 'term'."

        reader = self.dict_reader
        return reader.get_document_frequency(term)

    def get_fields(self, docnum):

        "Return the result of the field dictionary's get_fields for 'docnum'."

        reader = self.field_dict_reader
        return reader.get_fields(docnum)

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        for reader in (self.dict_reader, self.field_dict_reader):
            reader.close()
paul@10	1639
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index for the directory with the given 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'. The index directory is created
        if it does not already exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, merging any partitions first. Raise
        OSError if the index directory does not exist.

        The 'partition' argument is accepted but not used: the reader always
        accesses the merged partition.
        """

        # Check for the directory before merging, since merging would
        # otherwise fail with a less informative error when listing it.

        self._check_pathname()

        # Ensure that only one partition exists.

        self.merge()
        return self._get_reader(partition)

    def _check_pathname(self):

        "Raise OSError if the index directory does not exist."

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

    def _get_reader(self, partition):

        "Return a reader for the index."

        self._check_pathname()

        self.reader = IndexReader(self.pathname)
        return self.reader

    def merge(self):

        "Merge/optimise index partitions."

        self.merge_terms()
        self.merge_fields()

    def _merge_partitions(self, prefix, get_reader, get_writer, merger_class, rename_partition, remove_partition):

        """
        Merge all partitions whose files in the index directory start with
        'prefix' into a single partition labelled "merged".

        Readers are obtained using 'get_reader' (called with the pathname and
        a partition label), the writer using 'get_writer' (called with a
        partition label), and the merging is done by an instance of
        'merger_class'. Files are renamed and removed using 'rename_partition'
        and 'remove_partition'.
        """

        readers = []
        partitions = set()

        for filename in listdir(self.pathname):
            if filename.startswith(prefix):
                partition = filename[len(prefix):]
                readers.append(get_reader(self.pathname, partition))
                partitions.add(partition)

        # With many partitions, write them all into a new merged partition.

        if len(readers) > 1:

            # Move any existing merged partition out of the way of the new
            # one; its reader, opened above, is still used in the merge.

            if "merged" in partitions:
                rename_partition(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                partitions.add("old-merged")

            merger = merger_class(get_writer("merged"), readers)

            # Close the readers and writer even if merging fails.

            try:
                merger.merge()
            finally:
                merger.close()

            # Remove old files.

            for partition in partitions:
                remove_partition(self.pathname, partition)

        # With a single partition, no data needs to be read: release the
        # reader opened during discovery before renaming the files.

        elif len(readers) == 1:
            readers[0].close()

            partition = list(partitions)[0]
            if partition != "merged":
                rename_partition(self.pathname, partition, "merged")

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval'.
        """

        def get_writer(partition):
            return get_term_writer(self.pathname, partition, interval, doc_interval)

        self._merge_partitions("terms-", get_term_reader, get_writer,
            TermDictionaryMerger, rename_term_files, remove_term_files)

    def merge_fields(self, interval=FIELD_INTERVAL):

        "Merge field dictionaries using the given indexing 'interval'."

        def get_writer(partition):
            return get_field_writer(self.pathname, partition, interval)

        self._merge_partitions("fields-", get_field_reader, get_writer,
            FieldDictionaryMerger, rename_field_files, remove_field_files)

    def close(self):

        "Close any reader and writer obtained from this index."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	1770
paul@0	1771	# vim: tabstop=4 expandtab shiftwidth=4