iixr (annotate iixr.py in e8e80bfe5b5d)

iixr

Annotated iixr.py

20:e8e80bfe5b5d

2009-09-02

Paul Boddie

Fixed position dictionary writing to write the correct document number in the index for the specified position file offset. Fixed index iterator usage in the position dictionary iterator. Introduced document indexing interval parameters into the API. Fixed the test program to remove old test files, to close the correct files, and to use the appropriate names (fixing copying errors).

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@19	21	from os import dup, fdopen # independent iterator access to files
paul@12	22	from os import listdir, mkdir # index and partition discovery
paul@14	23	from os import remove, rename # partition manipulation
paul@7	24	from os.path import exists, join
paul@2	25	from os.path import commonprefix # to find common string prefixes
paul@3	26	from bisect import bisect_right # to find terms in the dictionary index
paul@12	27	from bisect import insort_right # to maintain a sorted list of data for merging
paul@10	28	import bz2, zlib # for field compression
paul@2	29
paul@7	30	# Constants.
paul@7	31
# Interval and flush settings (named for terms, documents and fields).

TERM_INTERVAL = 100
DOCUMENT_INTERVAL = 100
FIELD_INTERVAL = 100
FLUSH_INTERVAL = 1000000

# Names of the files associated with terms and with fields respectively.

TERM_FILENAMES = ("terms", "terms_index", "positions", "positions_index")
FIELD_FILENAMES = ("fields", "fields_index")

# Compression functions, tried in this order when writing compressed strings,
# each paired with the one-character flag recorded in front of the data. The
# decompression table is keyed by that same flag.

compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress),
    ]
decompressors = {
    "b" : bz2.decompress,
    "z" : zlib.decompress,
    }
paul@10	42
paul@0	43	# Foundation classes.
paul@0	44
class File:

    "A basic file abstraction."

    def __init__(self, f):

        "Wrap the file object 'f' and initialise any per-record state."

        self.f = f
        self.reset()

    def reset(self):

        """
        Reset the state of the reader or writer between records. This base
        implementation keeps no state and therefore does nothing.
        """

        pass

    def rewind(self):

        "Return to the start of the file and reset the record state."

        self.f.seek(0)
        self.reset()

    def close(self):

        "Close the underlying file, doing nothing if it is already closed."

        if self.f is None:
            return

        self.f.close()
        self.f = None
paul@0	67
class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding: seven bits
        are stored per byte, least significant group first, and the top bit of
        a byte is set when further bytes follow.

        A ValueError is raised for negative numbers, which are not supported.
        """

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Zero has no significant digits and is written as a single zero byte.

        if number == 0:
            self.f.write(chr(0))
            return

        # Emit the groups of seven bits from least to most significant.

        encoded = []

        while number != 0:
            digit = number & 127
            number >>= 7

            # Mark every byte except the last as being continued.

            if number != 0:
                digit |= 128

            encoded.append(chr(digit))

        self.f.write("".join(encoded))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file preceded by its length. Unicode objects are
        encoded as UTF-8 first. If 'compress' is set to a true value, a
        one-character flag is written before the length, indicating which
        compressor (if any) was applied to the data.
        """

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        if compress:

            # Use the first compressor producing something shorter than the
            # original, falling back to storing the data uncompressed ("-").

            flag = "-"

            for candidate, fn in compressors:
                compressed = fn(s)
                if len(compressed) < len(s):
                    flag, s = candidate, compressed
                    break

            self.f.write(flag)

        # The length written is that of the data actually stored.

        self.write_number(len(s))
        self.f.write(s)
paul@2	136
class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a number stored using the variable length encoding: seven bits
        per byte, least significant group first, with the top bit of a byte
        indicating that more bytes follow.

        An EOFError is raised if the file ends before the number is complete.
        """

        number = 0
        shift = 0

        while 1:
            byte = self.f.read(1)
            if not byte:
                raise EOFError

            digit = ord(byte)
            number += (digit & 127) << shift
            shift += 7

            # A clear top bit marks the final byte of the number.

            if not digit & 128:
                return number

    def read_string(self, decompress=0):

        """
        Read a string from the file, returning it as a Unicode object decoded
        from UTF-8. If 'decompress' is set to a true value, a one-character
        compression flag is expected before the length and the stored data is
        decompressed accordingly.

        An EOFError is raised if the file ends before the complete record has
        been read.
        """

        # The flag is only present in records written with compression enabled.

        if decompress:
            flag = self.f.read(1)
        else:
            flag = "-"

        length = self.read_number()
        s = self.f.read(length)

        # A short read means that the file is truncated: report this in the
        # same way as read_number instead of returning partial data.

        if len(s) < length:
            raise EOFError

        # Perform decompression if applicable.

        if flag != "-":
            s = decompressors[flag](s)

        return unicode(s, "utf-8")
paul@2	191
paul@9	192	# Specific classes for storing term and position information.
paul@0	193
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions', a list which is
        sorted in place. Return the offset of the written record.

        A ValueError is raised if 'docnum' is less than the document number of
        the previously written record.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # The record starts wherever the file is currently positioned.

        offset = self.f.tell()

        # Store the document number relative to the previous one, followed by
        # the number of positions.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(positions))

        # Positions must be ascending for the deltas to be non-negative.

        positions.sort()

        previous = 0
        for position in positions:
            self.write_number(position - previous)
            previous = position

        self.last_docnum = docnum

        return offset
paul@0	238
class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        # The document number is stored as a delta from the previous record.

        self.last_docnum += self.read_number()

        npositions = self.read_number()

        # Positions are stored as deltas: accumulate them to recover the
        # absolute positions.

        positions = []
        current = 0

        while len(positions) < npositions:
            current += self.read_number()
            positions.append(current)

        return self.last_docnum, positions

    def read_term_positions(self, offset, count):

        """
        Return an iterator reading document positions starting at 'offset' in
        the file. The number of documents available for reading is limited to
        'count'.
        """

        self.reset()

        # Read through a duplicated file handle positioned at the offset.
        # NOTE(review): descriptors produced by dup share a single file offset
        # NOTE(review): with the original, so iterators obtained from the same
        # NOTE(review): reader may disturb each other if consumed in an
        # NOTE(review): interleaved fashion - confirm against the callers.

        duplicate = fdopen(dup(self.f.fileno()), "rb")
        duplicate.seek(offset)
        return PositionIterator(duplicate, count)
paul@19	287
paul@19	288	class IteratorBase:
paul@19	289
paul@19	290	def __init__(self, count):
paul@19	291	self.replenish(count)
paul@0	292
paul@19	293	def replenish(self, count):
paul@19	294	self.count = count
paul@19	295	self.read_documents = 0
paul@19	296
paul@19	297	def __len__(self):
paul@19	298	return self.count
paul@18	299
paul@19	300	def sort(self):
paul@19	301	pass # Stored document positions are already sorted.
paul@18	302
paul@19	303	def __iter__(self):
paul@19	304	return self
paul@19	305
class PositionIterator(PositionReader, IteratorBase):

    "Iterating over document positions."

    def __init__(self, f, count):

        "Read from the file 'f', yielding at most 'count' document records."

        PositionReader.__init__(self, f)
        IteratorBase.__init__(self, count)

    def next(self):

        """
        Read positions for a single document, raising StopIteration once the
        available number of documents has been read.
        """

        if self.read_documents >= self.count:
            raise StopIteration

        self.read_documents += 1
        return self.read_positions()
paul@19	323
class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        "Restart the document number and position offset deltas from zero."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum', 'pos_offset' and document 'count' to the
        position index file. Return the offset of the written record.
        """

        # The record starts wherever the file is currently positioned.

        offset = self.f.tell()

        # The document number and position file offset are stored as deltas
        # from the previous record; the document count is stored as given.

        docnum_delta = docnum - self.last_docnum
        self.write_number(docnum_delta)
        self.last_docnum = docnum

        pos_offset_delta = pos_offset - self.last_pos_offset
        self.write_number(pos_offset_delta)
        self.last_pos_offset = pos_offset

        self.write_number(count)

        return offset
paul@19	358
class PositionIndexReader(FileReader):

    "Reading position index information from files."

    def reset(self):

        "Restart the document number and position offset deltas from zero."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read a document number, a position file offset for the position index
        file, and the number of documents in a section of that file.
        """

        # The first two values are stored as deltas and are accumulated.

        self.last_docnum += self.read_number()
        self.last_pos_offset += self.read_number()

        # The document count is stored as an absolute value.

        section_count = self.read_number()

        return self.last_docnum, self.last_pos_offset, section_count

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator reading index records starting at 'offset' in the
        file. The number of documents available for reading is limited to
        'doc_frequency'.
        """

        # NOTE: This is almost a duplication of PositionReader.read_term_positions.

        self.reset()

        # Read through a duplicated file handle positioned at the offset.

        duplicate = fdopen(dup(self.f.fileno()), "rb")
        duplicate.seek(offset)
        return PositionIndexIterator(duplicate, doc_frequency)
paul@19	405
class PositionIndexIterator(PositionIndexReader, IteratorBase):

    "Iterating over position index records."

    def __init__(self, f, count):

        "Read from the file 'f' index records covering 'count' documents."

        PositionIndexReader.__init__(self, f)
        IteratorBase.__init__(self, count)

        # Size of the most recently returned section (none as yet).

        self.section_count = 0

    def next(self):

        """
        Return the next (document number, position file offset, document count)
        record, raising StopIteration once the sections returned account for
        all available documents.
        """

        # Count the documents of the previously returned section as read.

        self.read_documents += self.section_count

        if self.read_documents >= self.count:
            raise StopIteration

        record = self.read_positions()
        self.section_count = record[2]
        return record
paul@19	425
class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Write records using 'position_writer', adding an index entry through
        'position_index_writer' for each section of 'interval' documents.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a list of tuples of the form (document
        number, position list), sorted in place - to the file.

        Add an index entry for each section of documents written.

        Return a tuple containing the offset of the first index entry written
        (None if nothing was written), the frequency (number of positions),
        and document frequency (number of documents) for the term involved.
        """

        # Each term starts afresh as far as deltas are concerned.

        self.position_writer.reset()
        self.position_index_writer.reset()

        index_offset = None
        frequency = 0

        # Details of the section currently being accumulated.

        section_docnum = None
        section_offset = None
        section_size = 0

        doc_positions.sort()

        for docnum, positions in doc_positions:
            pos_offset = self.position_writer.write_positions(docnum, positions)

            # The first record of a section provides the index entry details.

            if section_offset is None:
                section_offset = pos_offset
                section_docnum = docnum

            frequency += len(positions)
            section_size += 1

            # A full section gets an index entry and a new section is begun.

            if section_size == self.interval:
                entry_offset = self.position_index_writer.write_positions(section_docnum, section_offset, self.interval)

                if index_offset is None:
                    index_offset = entry_offset

                section_offset = None
                section_docnum = None
                section_size = 0

        # Any incomplete final section still needs an index entry.

        if section_offset is not None:
            entry_offset = self.position_index_writer.write_positions(section_docnum, section_offset, section_size)

            if index_offset is None:
                index_offset = entry_offset

        return index_offset, frequency, len(doc_positions)

    def close(self):

        "Close both underlying writers."

        self.position_writer.close()
        self.position_index_writer.close()
paul@19	506
class PositionDictionaryReader:

    "Reading position dictionaries."

    def __init__(self, position_reader, position_index_reader):

        "Read using the given 'position_reader' and 'position_index_reader'."

        self.position_reader = position_reader
        self.position_index_reader = position_index_reader

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator for dictionary entries starting at 'offset' with the
        given 'doc_frequency'.
        """

        iterator = PositionDictionaryIterator(
            self.position_reader, self.position_index_reader,
            offset, doc_frequency)
        return iterator

    def close(self):

        "Close both underlying readers."

        self.position_reader.close()
        self.position_index_reader.close()
paul@19	528
paul@19	529	class PositionDictionaryIterator:
paul@19	530
paul@19	531	"Iteration over position dictionary entries."
paul@19	532
paul@19	533	def __init__(self, position_reader, position_index_reader, offset, doc_frequency):
paul@19	534	self.position_reader = position_reader
paul@20	535	self.doc_frequency = doc_frequency
paul@19	536
paul@19	537	self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
paul@19	538	self.next_section()
paul@19	539	self.init_section()
paul@0	540
paul@20	541	def __len__(self):
paul@20	542	return self.doc_frequency
paul@20	543
paul@20	544	def sort(self):
paul@20	545	pass
paul@20	546
paul@18	547	def __iter__(self):
paul@18	548	return self
paul@18	549
paul@18	550	def next(self):
paul@0	551
paul@19	552	# Attempt to get the next document record from the section in the positions file.
paul@19	553
paul@19	554	while 1:
paul@19	555
paul@19	556	# Either return the next record.
paul@19	557
paul@19	558	try:
paul@19	559	return self.iterator.next()
paul@0	560
paul@19	561	# Or, where a section is finished, get the next section and try again.
paul@19	562
paul@19	563	except StopIteration:
paul@20	564
paul@20	565	# Where a section follows, update the index iterator, but keep
paul@20	566	# reading using the same file iterator (since the data should
paul@20	567	# just follow on from the last section).
paul@20	568
paul@19	569	self.next_section()
paul@19	570	self.iterator.replenish(self.section_count)
paul@19	571
paul@19	572	def next_section(self):
paul@20	573	self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
paul@19	574
paul@19	575	def init_section(self):
paul@19	576	self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
paul@0	577
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Forget the previous term and offset used for relative encoding."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. Return the offset after the term information was
        written to the file.
        """

        # Only the part of the term differing from the previous term is
        # stored, preceded by the length of the prefix which they share.

        shared = len(commonprefix([self.last_term, term]))

        self.write_number(shared)
        self.write_string(term[shared:])

        # The offset is stored as a delta; the frequencies are stored as given.

        for value in (offset - self.last_offset, frequency, doc_frequency):
            self.write_number(value)

        self.last_term = term
        self.last_offset = offset

        return self.f.tell()
paul@3	619
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Forget the previous term and offset used for relative decoding."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency and its document
        frequency from the term information file.
        """

        # The term is rebuilt from a prefix of the previous term and the
        # stored suffix.

        shared = self.read_number()
        suffix = self.read_string()

        term = self.last_term[:shared] + suffix
        self.last_term = term

        # The offset is stored as a delta from the previous offset.

        self.last_offset += self.read_number()

        # Both frequencies are stored as absolute values.

        frequency = self.read_number()
        doc_frequency = self.read_number()

        return term, self.last_offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
        permits the scanning for later terms from the specified term.
        """

        self.f.seek(info_offset)

        # Subsequent entries are decoded relative to the given term details.

        self.last_term = term
        self.last_offset = offset
paul@3	666
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Reset the term state and the information file offset delta."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' to the term dictionary index file, along with the
        'info_offset' in the term information file.
        """

        TermWriter.write_term(self, term, offset, frequency, doc_frequency)

        # The information file offset follows, stored as a delta.

        info_offset_delta = info_offset - self.last_info_offset
        self.write_number(info_offset_delta)
        self.last_info_offset = info_offset
paul@3	689
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Reset the term state and the information file offset delta."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency, its document
        frequency and a term information file offset from the term dictionary
        index file.
        """

        details = TermReader.read_term(self)
        term, offset, frequency, doc_frequency = details

        # The information file offset follows, stored as a delta.

        self.last_info_offset += self.read_number()

        return term, offset, frequency, doc_frequency, self.last_info_offset
paul@3	713
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Write terms using 'info_writer', adding every term whose entry number
        is a multiple of 'interval' to the index using 'index_writer', and
        write positions using 'position_dict_writer'.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval

        # Number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file and, for entries at the indexing interval, to
        the term dictionary index. Nothing is returned.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)

        # Index entries also record the offset reported by the information
        # writer for the term just written.

        indexed = self.entry % self.interval == 0
        if indexed:
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        details = self.position_dict_writer.write_term_positions(doc_positions)
        offset, frequency, doc_frequency = details
        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        "Close all of the underlying writers."

        self.info_writer.close()
        self.index_writer.close()
        self.position_dict_writer.close()
paul@3	755
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader, position_reader):

        """
        Read terms using 'info_reader', loading the complete term index from
        'index_reader' into memory, and read positions using 'position_reader'.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_reader = position_reader

        # Load every index entry: the end of the index file is signalled by
        # an EOFError from the reader.

        self.terms = []
        try:
            while 1:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # A large number for ordering purposes: it makes a search key for a
        # term sort after the index entry for that same term. An empty index
        # has no last entry to consult, and since every search of an empty
        # index finds nothing, any value will do.

        if self.terms:
            self.max_offset = self.terms[-1][1] + 1
        else:
            self.max_offset = 0

    def _find_term(self, term):

        """
        Find the position file offset, frequency and document frequency of
        'term' from the term dictionary, returning them as a tuple, or return
        None if the term is not present.
        """

        # Get the index entry providing the term or one preceding it.

        key = (term, self.max_offset, 0, 0)
        i = bisect_right(self.terms, key) - 1

        # The term sorts before every index entry and cannot be present.

        if i == -1:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = self.terms[i]

        # Where the term is found immediately, return the offset and
        # frequencies.

        if term == found_term:
            return offset, frequency, doc_frequency

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term, stopping at the first term which is
        # not less than it or at the end of the file.

        self.info_reader.go_to_term(found_term, offset, info_offset)

        try:
            while term > found_term:
                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
        except EOFError:
            pass

        # If the term is found, return the offset and frequencies.

        if term == found_term:
            return offset, frequency, doc_frequency

        return None

    def rewind(self):

        "Return to the start of the term information file."

        self.info_reader.rewind()

    def _get_positions(self, offset, doc_frequency):

        """
        Return the positions found at 'offset' for a term occurring in
        'doc_frequency' documents, as provided by the position reader.
        """

        return self.position_reader.read_term_positions(offset, doc_frequency)

    def read_term(self):

        """
        Return the next term, its frequency, its document frequency, and the
        documents and positions at which the term is found.
        """

        term, offset, frequency, doc_frequency = self.info_reader.read_term()
        positions = self._get_positions(offset, doc_frequency)
        return term, frequency, doc_frequency, positions

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or None if the term is not present.
        """

        t = self._find_term(term)
        if t is None:
            return None

        offset, frequency, doc_frequency = t
        return self._get_positions(offset, doc_frequency)

    def get_frequency(self, term):

        "Return the frequency of the given 'term', or None if it is not present."

        t = self._find_term(term)
        if t is None:
            return None

        offset, frequency, doc_frequency = t
        return frequency

    def get_document_frequency(self, term):

        """
        Return the document frequency of the given 'term', or None if it is not
        present.
        """

        t = self._find_term(term)
        if t is None:
            return None

        offset, frequency, doc_frequency = t
        return doc_frequency

    def close(self):

        "Close all of the underlying readers."

        self.info_reader.close()
        self.index_reader.close()
        self.position_reader.close()
paul@3	870
paul@9	871	# Specific classes for storing document information.
paul@9	872
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        Return the offset at which the fields are stored.
        """

        # The record starts wherever the file is currently positioned.

        offset = self.f.tell()

        # Store the document number as a delta, then the number of fields.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(fields))

        # Each field is its identifier followed by its value, with
        # compression requested for the value.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1)

        self.last_docnum = docnum

        return offset
paul@8	906
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # The document number is stored as a delta from the previous record.

        self.last_docnum += self.read_number()

        nfields = self.read_number()

        # Each field is an identifier followed by a value stored with a
        # compression flag.

        fields = []

        while len(fields) < nfields:
            identifier = self.read_number()
            value = self.read_string(1)
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.
        """

        self.f.seek(offset)

        # The document number computed while reading is discarded in favour
        # of the number supplied by the caller, which then becomes the basis
        # for reading later records.

        fields = self.read_fields()[1]
        self.last_docnum = docnum

        return docnum, fields
paul@12	954
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Restart the document number and offset deltas from zero."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # Both values are stored as deltas from the previous record.

        docnum_delta = docnum - self.last_docnum
        offset_delta = offset - self.last_offset

        self.write_number(docnum_delta)
        self.write_number(offset_delta)

        self.last_docnum = docnum
        self.last_offset = offset
paul@9	977
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Restart the running document number and offset from zero."

        self.last_offset = 0
        self.last_docnum = 0

    def read_document(self):

        "Read a document number and field file offset."

        # Each stored number is added to the corresponding running total: the
        # document number first, then the offset.

        docnum_delta = self.read_number()
        self.last_docnum += docnum_delta

        offset_delta = self.read_number()
        self.last_offset += offset_delta

        return self.last_docnum, self.last_offset
paul@9	996
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' and
        'field_index_writer', recording an index entry for every 'interval'
        documents written.
        """

        self.interval = interval
        self.entry = 0
        self.field_writer = field_writer
        self.field_index_writer = field_index_writer

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Only the first document of each interval gets an index entry.

        if not self.entry % self.interval:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close the field writer and then the field index writer."

        self.field_writer.close()
        self.field_index_writer.close()
paul@9	1021
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader', loading all (document number, offset) entries
        from the index reader into memory.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Read index entries until the index reader signals the end of data.

        self.docs = []
        try:
            while 1:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes. An empty index has no last
        # entry, so fall back to zero instead of failing on docs[-1].

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = 0

    def rewind(self):

        "Rewind the field reader."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning None
        if no such document is found.
        """

        # Pairing 'docnum' with the largest offset places the search key after
        # any index entry having the same document number.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # Get the entry position providing the document or one preceding it.

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary. Running out of data simply ends
        # the scan with the last document read.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields
        else:
            return None

    def close(self):

        "Close the field reader and then the field index reader."

        self.field_reader.close()
        self.field_index_reader.close()
paul@8	1085
paul@12	1086	# Dictionary merging classes.
paul@12	1087
class Merger:

    "Merge files."

    def __init__(self, writer, readers):

        "Initialise the merger with the given 'writer' and 'readers'."

        self.readers = readers
        self.writer = writer

    def close(self):

        "Close each of the readers and then the writer."

        for source in self.readers:
            source.close()

        self.writer.close()
paul@13	1100
class TermDictionaryMerger(Merger):

    "Merge term and position files."

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the writer.
        """

        # Pending entries, kept sorted so that the smallest term comes first.
        # Each entry is (term, positions, partition) where 'partition' is the
        # index of the reader which supplied it.

        entries = []

        # Get the first entries from the readers.

        for partition, reader in enumerate(self.readers):
            reader.rewind()

            try:
                term, frequency, doc_frequency, positions = reader.read_term()
                insort_right(entries, (term, positions, partition))
            except EOFError:
                pass

        # While entries are available, write them out in order, merging where
        # appropriate.

        while entries:
            term, doc_positions, partition = entries[0]
            to_update = [partition]

            nentries = len(entries)
            i = 1

            # Find other entries for the term. Since the entries are sorted,
            # these immediately follow the first entry.

            while i < nentries:
                other_term, other_doc_positions, other_partition = entries[i]

                # For such entries, merge the positions.

                if other_term == term:
                    doc_positions = self.merge_positions(doc_positions, other_doc_positions)
                    to_update.append(other_partition)
                    i += 1
                else:
                    break

            # Write the combined term details.

            self.writer.write_term_positions(term, doc_positions)

            # Update the entries from the affected readers: drop the entries
            # just written and fetch the next term from each such reader.

            del entries[:i]

            for partition in to_update:
                try:
                    term, frequency, doc_frequency, positions = self.readers[partition].read_term()
                    insort_right(entries, (term, positions, partition))
                except EOFError:
                    pass

    def merge_positions(self, doc_positions, other_doc_positions):

        """
        Merge 'doc_positions' with 'other_doc_positions' so that common document
        records contain positions from both collections. Return a list of
        (document number, positions) pairs ordered by document number. The
        position lists supplied by the caller are not modified.
        """

        # Copy the position lists so that extending them below cannot alter
        # the lists held by the caller.

        merged = {}

        for docnum, positions in doc_positions:
            merged[docnum] = list(positions)

        for docnum, positions in other_doc_positions:
            if docnum in merged:
                merged[docnum].extend(positions)
            else:
                merged[docnum] = list(positions)

        # Sort so that the result does not depend on dictionary ordering.

        return sorted(merged.items())
paul@13	1179
class FieldDictionaryMerger(Merger):

    "Merge field files."

    def merge(self):

        """
        Merge fields from the readers, sending them to the writer.
        """

        # Pending records, kept sorted so that the lowest document number comes
        # first. Each record is (docnum, fields, partition) where 'partition'
        # is the index of the reader which supplied it.

        pending = []

        # Prime the list with the first record from each reader.

        for partition, reader in enumerate(self.readers):
            reader.rewind()

            try:
                docnum, fields = reader.read_fields()
            except EOFError:
                continue

            insort_right(pending, (docnum, fields, partition))

        # Write records out in document order, combining the fields of any
        # records sharing a document number.

        while pending:
            docnum, fields, partition = pending[0]
            consumed = [partition]

            # Records for the same document immediately follow the first one.

            for other_docnum, other_fields, other_partition in pending[1:]:
                if other_docnum != docnum:
                    break

                fields += other_fields
                consumed.append(other_partition)

            # Write the combined document details.

            self.writer.write_fields(docnum, fields)

            # Discard the records just written and fetch the next record from
            # each reader which supplied one of them.

            del pending[:len(consumed)]

            for partition in consumed:
                try:
                    docnum, fields = self.readers[partition].read_fields()
                except EOFError:
                    continue

                insort_right(pending, (docnum, fields, partition))
paul@13	1241
paul@13	1242	# Utility functions.
paul@13	1243
def get_term_writer(pathname, partition, interval, doc_interval):

    """
    Return a term dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval' for terms and 'doc_interval' for document position records.
    """

    def open_output(filename):
        return open(join(pathname, filename), "wb")

    # Each low-level writer gets its own file, opened for binary writing.

    info_writer = TermWriter(open_output("terms-%s" % partition))
    index_writer = TermIndexWriter(open_output("terms_index-%s" % partition))
    positions_writer = PositionWriter(open_output("positions-%s" % partition))
    positions_index_writer = PositionIndexWriter(open_output("positions_index-%s" % partition))

    # The position writers are combined before being given to the term
    # dictionary writer.

    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)

    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
paul@13	1267
def get_field_writer(pathname, partition, interval):

    """
    Return a field dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.
    """

    def open_output(filename):
        return open(join(pathname, filename), "wb")

    # The fields file and its index are each opened for binary writing.

    field_writer = FieldWriter(open_output("fields-%s" % partition))
    field_index_writer = FieldIndexWriter(open_output("fields_index-%s" % partition))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
paul@13	1283
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.
    """

    def open_input(filename):
        return open(join(pathname, filename), "rb")

    # Each low-level reader gets its own file, opened for binary reading.

    info_reader = TermReader(open_input("terms-%s" % partition))
    index_reader = TermIndexReader(open_input("terms_index-%s" % partition))
    positions_reader = PositionReader(open_input("positions-%s" % partition))
    positions_index_reader = PositionIndexReader(open_input("positions_index-%s" % partition))

    # The position readers are combined before being given to the term
    # dictionary reader.

    positions_dict_reader = PositionDictionaryReader(positions_reader, positions_index_reader)

    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
paul@14	1306
def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.
    """

    def open_input(filename):
        return open(join(pathname, filename), "rb")

    # The fields file and its index are each opened for binary reading.

    field_reader = FieldReader(open_input("fields-%s" % partition))
    field_index_reader = FieldIndexReader(open_input("fields_index-%s" % partition))

    return FieldDictionaryReader(field_reader, field_index_reader)
paul@14	1321
def rename_files(pathname, names, from_partition, to_partition):

    """
    Rename, under the given 'pathname', the file for each of the given 'names'
    from its 'from_partition' label to its 'to_partition' label.
    """

    for name in names:
        source = join(pathname, "%s-%s" % (name, from_partition))
        target = join(pathname, "%s-%s" % (name, to_partition))
        rename(source, target)
paul@14	1325
def rename_term_files(pathname, from_partition, to_partition):

    """
    Rename the files named by TERM_FILENAMES under 'pathname' from
    'from_partition' to 'to_partition'.
    """

    rename_files(pathname, names=TERM_FILENAMES, from_partition=from_partition, to_partition=to_partition)
paul@14	1328
def rename_field_files(pathname, from_partition, to_partition):

    """
    Rename the files named by FIELD_FILENAMES under 'pathname' from
    'from_partition' to 'to_partition'.
    """

    rename_files(pathname, names=FIELD_FILENAMES, from_partition=from_partition, to_partition=to_partition)
paul@14	1331
def remove_files(pathname, names, partition):

    """
    Remove, under the given 'pathname', the file for each of the given 'names'
    labelled with the given 'partition'.
    """

    for name in names:
        filename = "%s-%s" % (name, partition)
        remove(join(pathname, filename))
paul@14	1335
def remove_term_files(pathname, partition):

    "Remove the files named by TERM_FILENAMES for 'partition' under 'pathname'."

    remove_files(pathname, names=TERM_FILENAMES, partition=partition)
paul@14	1338
def remove_field_files(pathname, partition):

    "Remove the files named by FIELD_FILENAMES for 'partition' under 'pathname'."

    remove_files(pathname, names=FIELD_FILENAMES, partition=partition)
paul@14	1341
paul@8	1342	# High-level classes.
paul@8	1343
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Initialise the writer for the index at 'pathname' with the given
        indexing 'interval' and 'doc_interval'. Terms and fields are each
        flushed to a new partition once 'flush_interval' additions have been
        made; a false 'flush_interval' disables such automatic flushing.
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # Labels of the next partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Unflushed data: term -> {docnum -> [position, ...]} and
        # docnum -> [(identifier, value), ...].

        self.terms = {}
        self.docs = {}

        # Numbers of additions made since the last automatic flush.

        self.position_counter = 0
        self.field_counter = 0

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.
        """

        self.terms.setdefault(term, {}).setdefault(docnum, []).append(position)

        self.position_counter += 1
        if self.flush_interval and self.position_counter >= self.flush_interval:
            self.flush_terms()
            self.position_counter = 0

    def add_field(self, docnum, identifier, value):

        """
        Add for the document with the given 'docnum' a field having the given
        'identifier' and 'value'.
        """

        self.docs.setdefault(docnum, []).append((identifier, value))

        self.field_counter += 1
        if self.flush_interval and self.field_counter >= self.flush_interval:
            self.flush_fields()
            self.field_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        dict_writer = self.get_term_writer()

        # Write the terms in order. The documents for each term are supplied
        # as a list ordered by document number so that the output does not
        # depend on dictionary ordering.

        for term, doc_positions in sorted(self.terms.items()):
            dict_writer.write_term_positions(term, sorted(doc_positions.items()))

        dict_writer.close()

        # Start collecting afresh for the next partition.

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        field_dict_writer = self.get_field_writer()

        # Write the documents in order.

        for docnum, fields in sorted(self.docs.items()):
            field_dict_writer.write_fields(docnum, fields)

        field_dict_writer.close()

        # Start collecting afresh for the next partition.

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any terms and fields which have not yet been written."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
paul@10	1464
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        "Open the term and field dictionaries of the merged partition."

        merged = "merged"
        self.dict_reader = get_term_reader(pathname, merged)
        self.field_dict_reader = get_field_reader(pathname, merged)

    def find_positions(self, term):

        "Return the term dictionary reader's positions for the given 'term'."

        term_reader = self.dict_reader
        return term_reader.find_positions(term)

    def get_frequency(self, term):

        "Return the term dictionary reader's frequency for the given 'term'."

        term_reader = self.dict_reader
        return term_reader.get_frequency(term)

    def get_fields(self, docnum):

        "Return the field dictionary reader's fields for the given 'docnum'."

        field_reader = self.field_dict_reader
        return field_reader.get_fields(docnum)

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        for reader in (self.dict_reader, self.field_dict_reader):
            reader.close()
paul@10	1485
class Index:

    "An inverted index solution encapsulating the various components."

    # Label given to merge output while it is being written. Writing straight
    # to "merged" would truncate an existing "merged" partition which is being
    # read as one of the merge inputs.

    _merging_partition = "merging"

    def __init__(self, pathname):

        "Initialise the index for the directory with the given 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'. The index directory is created if
        it does not already exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, first merging the term and field
        partitions so that only the "merged" partition remains. Raise OSError
        if the index path does not exist.
        """

        # Check before merging so that a missing index is reported with the
        # intended message and not as a directory listing failure.

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        # Ensure that only one partition exists.

        self.merge_terms()
        self.merge_fields()

        return self._get_reader(partition)

    def _get_reader(self, partition):

        """
        Return a reader for the index. The reader always uses the "merged"
        partition; 'partition' is accepted but not used.
        """

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname)
        return self.reader

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval'.
        """

        def get_writer(partition):
            return get_term_writer(self.pathname, partition, interval, doc_interval)

        self._merge("terms-", get_term_reader, get_writer, TermDictionaryMerger,
            remove_term_files, rename_term_files)

    def merge_fields(self, interval=FIELD_INTERVAL):

        "Merge field dictionaries using the given indexing 'interval'."

        def get_writer(partition):
            return get_field_writer(self.pathname, partition, interval)

        self._merge("fields-", get_field_reader, get_writer, FieldDictionaryMerger,
            remove_field_files, rename_field_files)

    def _merge(self, prefix, get_reader, get_writer, merger_class, remove_partition, rename_partition):

        """
        Combine all partitions whose principal file begins with 'prefix' into
        a single partition labelled "merged". Readers are obtained by calling
        'get_reader' with the index pathname and a partition label, the output
        writer by calling 'get_writer' with a partition label, and
        'merger_class' is instantiated with the writer and readers. Partition
        files are removed with 'remove_partition' and renamed with
        'rename_partition'.
        """

        # Discover the partitions, ignoring any merge output left behind by an
        # earlier merge which did not complete.

        partitions = []

        for filename in listdir(self.pathname):
            if filename.startswith(prefix):
                partition = filename[len(prefix):]
                if partition != self._merging_partition:
                    partitions.append(partition)

        if len(partitions) > 1:

            # Merge into temporary files so that no input is overwritten while
            # it is still being read.

            readers = [get_reader(self.pathname, partition) for partition in partitions]
            writer = get_writer(self._merging_partition)
            merger = merger_class(writer, readers)

            try:
                merger.merge()
            finally:
                merger.close()

            # Remove old files and only then give the output its final label.

            for partition in partitions:
                remove_partition(self.pathname, partition)

            rename_partition(self.pathname, self._merging_partition, "merged")

        # A lone partition only needs relabelling; no files need to be opened.

        elif len(partitions) == 1 and partitions[0] != "merged":
            rename_partition(self.pathname, partitions[0], "merged")

    def close(self):

        "Close and forget any reader and writer obtained from this index."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	1597
paul@0	1598	# vim: tabstop=4 expandtab shiftwidth=4