iixr (annotate iixr.py in 1cf3b82959f3)

iixr

Annotated iixr.py

19:1cf3b82959f3

2009-09-02

Paul Boddie

Attempted to introduce position dictionaries with extra term record details providing document frequency information. Attempted to introduce file descriptor duplication in order to support concurrent iterators.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@19	21	from os import dup, fdopen # independent iterator access to files
paul@12	22	from os import listdir, mkdir # index and partition discovery
paul@14	23	from os import remove, rename # partition manipulation
paul@7	24	from os.path import exists, join
paul@2	25	from os.path import commonprefix # to find common string prefixes
paul@3	26	from bisect import bisect_right # to find terms in the dictionary index
paul@12	27	from bisect import insort_right # to maintain a sorted list of data for merging
paul@10	28	import bz2, zlib # for field compression
paul@2	29
paul@7	30	# Constants.
paul@7	31
# Tuning constants.
# NOTE(review): the users of these two values are outside this part of the
# file; presumably INTERVAL is the default index entry spacing and
# FLUSH_INTERVAL a flushing threshold - confirm against the callers.

INTERVAL = 100
FLUSH_INTERVAL = 1000 * 1000

# Compression functions in the order in which they are tried, each paired with
# the one-character flag stored in front of the compressed data.

compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress),
]

# Decompression functions keyed by the stored one-character flag.

decompressors = {
    "b": bz2.decompress,
    "z": zlib.decompress,
}
paul@10	37
paul@0	38	# Foundation classes.
paul@0	39
class File:

    """
    A basic file abstraction wrapping an already opened file object, which is
    kept in the 'f' attribute.
    """

    def __init__(self, f):

        "Initialise the abstraction with the file object 'f'."

        self.f = f
        self.reset()

    def reset(self):

        """
        Reset the state of the reader or writer between records. This base
        implementation holds no state and does nothing.
        """

        pass

    def rewind(self):

        "Seek to the start of the file and reset any per-record state."

        self.f.seek(0)
        self.reset()

    def close(self):

        """
        Close the underlying file object and forget it. Calling this method
        again afterwards has no effect.
        """

        if self.f is None:
            return

        self.f.close()
        self.f = None
paul@0	62
class FileWriter(File):

    """
    Writing basic data types (non-negative numbers and strings) to files.
    """

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding: seven
        bits per byte, least significant group first, with the top bit set on
        every byte except the last.

        Raise ValueError if 'number' is negative.
        """

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Zero is stored as a single zero byte.

        if number == 0:
            self.f.write(chr(0))
            return

        # Emit seven-bit groups from least to most significant.

        encoded = []
        remaining = number

        while remaining != 0:
            group = remaining & 127
            remaining = remaining >> 7

            # Mark the byte as being followed by more bytes.

            if remaining != 0:
                group |= 128

            encoded.append(chr(group))

        self.f.write("".join(encoded))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file preceded by its length. Unicode objects are
        encoded as UTF-8 first.

        If 'compress' is set to a true value, a one-character flag is written
        before the length: the flag of the first compressor producing output
        shorter than the original, or "-" if none does and the data is stored
        as it is.
        """

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        if compress:
            flag = "-"

            for candidate, compress_fn in compressors:
                compressed = compress_fn(s)

                # Take the first result shorter than the original.

                if len(compressed) < len(s):
                    flag = candidate
                    s = compressed
                    break

            self.f.write(flag)

        # The length always describes the data as actually stored.

        self.write_number(len(s))
        self.f.write(s)
paul@2	131
class FileReader(File):

    """
    Reading basic data types (non-negative numbers and strings) from files.
    """

    def read_number(self):

        """
        Read a number stored in the variable length encoding: seven bits per
        byte, least significant group first, with the top bit of a byte
        indicating that another byte follows.

        Raise EOFError if the file ends before the number is complete.
        """

        value = 0
        shift = 0

        while 1:
            byte = self.f.read(1)
            if not byte:
                raise EOFError

            code = ord(byte)
            value += (code & 127) << shift
            shift += 7

            # A clear top bit marks the final byte of the number.

            if not code & 128:
                return value

    def read_string(self, decompress=0):

        """
        Read a length-prefixed string from the file and return it as a Unicode
        object decoded from UTF-8.

        If 'decompress' is set to a true value, a one-character flag is read
        before the length and, unless it is "-", selects the function used to
        decompress the stored data.
        """

        flag = "-"
        if decompress:
            flag = self.f.read(1)

        length = self.read_number()
        data = self.f.read(length)

        if flag != "-":
            data = decompressors[flag](data)

        return unicode(data, "utf-8")
paul@2	186
paul@9	187	# Specific classes for storing term and position information.
paul@0	188
class PositionWriter(FileWriter):

    """
    Writing position information to files as records of a document number
    delta, a position count and a sequence of position deltas.
    """

    def reset(self):

        "Forget the last document number so that the next delta is absolute."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions', which are
        sorted in place before being written.

        Return the file offset at which the record starts. Raise ValueError if
        'docnum' is less than the previously written document number.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        record_offset = self.f.tell()

        # The document number is stored relative to the previous record.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(positions))

        # Deltas can only be written for positions in ascending order.

        positions.sort()

        previous = 0
        for position in positions:
            self.write_number(position - previous)
            previous = position

        self.last_docnum = docnum

        return record_offset
paul@0	233
class PositionReader(FileReader):

    """
    Reading position information from files: records of a document number
    delta, a position count and a sequence of position deltas.
    """

    def reset(self):

        "Forget the last document number so that the next delta is absolute."

        self.last_docnum = 0

    def read_positions(self):

        """
        Read one record, returning a tuple of the absolute document number and
        the list of absolute positions.
        """

        # Document numbers are stored relative to the previous record.

        self.last_docnum += self.read_number()

        npositions = self.read_number()

        # Accumulate the deltas to recover the absolute positions.

        positions = []
        current = 0

        while len(positions) < npositions:
            current += self.read_number()
            positions.append(current)

        return self.last_docnum, positions

    def read_term_positions(self, offset, count):

        """
        Return a PositionIterator reading from 'offset' in the file and
        yielding at most 'count' document records.

        The iterator works on a new file object made from a duplicate of this
        reader's file descriptor.
        """

        self.reset()

        # NOTE(review): os.dup gives a descriptor for the same open file, so
        # NOTE(review): the file position is presumably shared with self.f and
        # NOTE(review): with other iterators made this way - confirm that
        # NOTE(review): concurrent iterators really are independent. Nothing
        # NOTE(review): visible here closes the duplicated file either.

        duplicate = fdopen(dup(self.f.fileno()), "rb")
        duplicate.seek(offset)
        return PositionIterator(duplicate, count)
paul@19	282
paul@19	283	class IteratorBase:
paul@19	284
paul@19	285	def __init__(self, count):
paul@19	286	self.replenish(count)
paul@0	287
paul@19	288	def replenish(self, count):
paul@19	289	self.count = count
paul@19	290	self.read_documents = 0
paul@19	291
paul@19	292	def __len__(self):
paul@19	293	return self.count
paul@18	294
paul@19	295	def sort(self):
paul@19	296	pass # Stored document positions are already sorted.
paul@18	297
paul@19	298	def __iter__(self):
paul@19	299	return self
paul@19	300
class PositionIterator(PositionReader, IteratorBase):

    """
    Iterating over document positions, reading at most a given number of
    document records from a positions file.
    """

    def __init__(self, f, count):

        "Initialise the iterator to read 'count' records from the file 'f'."

        PositionReader.__init__(self, f)
        IteratorBase.__init__(self, count)

    def next(self):

        """
        Read and return the positions for a single document as a tuple of the
        document number and position list. Raise StopIteration once the
        permitted number of documents has been read.
        """

        if self.read_documents >= self.count:
            raise StopIteration

        self.read_documents += 1
        return self.read_positions()
paul@19	318
class PositionIndexWriter(FileWriter):

    """
    Writing position index information to files as records of a document
    number delta, a position file offset delta and a document count.
    """

    def reset(self):

        "Forget the last document number and position file offset."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum', 'pos_offset' and document 'count' to the
        position index file. The document number and the offset are stored
        relative to the values of the previous record.

        Return the file offset at which the record starts.
        """

        record_offset = self.f.tell()

        docnum_delta = docnum - self.last_docnum
        self.write_number(docnum_delta)
        self.last_docnum = docnum

        offset_delta = pos_offset - self.last_pos_offset
        self.write_number(offset_delta)
        self.last_pos_offset = pos_offset

        self.write_number(count)

        return record_offset
paul@19	353
class PositionIndexReader(FileReader):

    """
    Reading position index information from files: records of a document
    number delta, a position file offset delta and a document count.
    """

    def reset(self):

        "Forget the last document number and position file offset."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read one record, returning a tuple of the absolute document number,
        the absolute position file offset and the number of documents in the
        section of the position file described by the record.
        """

        # Both leading values are stored relative to the previous record.

        self.last_docnum += self.read_number()
        self.last_pos_offset += self.read_number()

        section_count = self.read_number()

        return self.last_docnum, self.last_pos_offset, section_count

    def read_term_positions(self, offset, doc_frequency):

        """
        Return a PositionIndexIterator reading from 'offset' in the file and
        limited to sections covering 'doc_frequency' documents.

        The iterator works on a new file object made from a duplicate of this
        reader's file descriptor.
        """

        # NOTE: This is almost a duplication of PositionReader.read_term_positions.

        self.reset()

        # NOTE(review): os.dup gives a descriptor for the same open file, so
        # NOTE(review): the file position is presumably shared with self.f and
        # NOTE(review): with other iterators made this way - confirm that
        # NOTE(review): concurrent iterators really are independent.

        duplicate = fdopen(dup(self.f.fileno()), "rb")
        duplicate.seek(offset)
        return PositionIndexIterator(duplicate, doc_frequency)
paul@19	400
class PositionIndexIterator(PositionIndexReader, IteratorBase):

    """
    Iterating over position index records until the sections read account for
    a given number of documents.
    """

    def __init__(self, f, count):

        """
        Initialise the iterator to read records from the file 'f' describing
        sections that cover 'count' documents in total.
        """

        PositionIndexReader.__init__(self, f)
        IteratorBase.__init__(self, count)
        self.section_count = 0

    def next(self):

        """
        Read and return the next index record as a tuple of the document
        number, position file offset and section document count. Raise
        StopIteration once the sections already returned account for all of
        the documents.
        """

        # Account for the documents in the previously returned section.

        self.read_documents += self.section_count

        if self.read_documents >= self.count:
            raise StopIteration

        record = self.read_positions()
        self.section_count = record[2]
        return record
paul@19	420
class PositionDictionaryWriter:

    """
    Writing position dictionaries: document positions go to a positions file,
    and one index record is written to a position index file for every section
    of at most 'interval' documents.
    """

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Initialise the writer with the given 'position_writer' and
        'position_index_writer', writing an index record for every 'interval'
        documents.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form
        (document number, position list) - to the positions file. The
        collection must support 'sort', iteration and 'len'.

        An index record giving the last document number of a section, the
        offset of the section's first position record and the number of
        documents in the section is written for every 'interval' documents,
        plus one for any remaining documents.

        Return a tuple containing the offset of the first index record written
        (None if there were no documents), the frequency (number of positions)
        and the document frequency (number of documents) for the term
        involved.
        """

        # Both writers store deltas, and the readers decode the first record
        # of a term starting from zero, so both must start each term afresh.

        self.position_writer.reset()
        self.position_index_writer.reset()

        index_offset = None     # offset of the first index record
        section_offset = None   # offset of the current section's first record
        section_size = 0        # documents written in the current section
        frequency = 0
        docnum = None

        doc_positions.sort()

        for docnum, positions in doc_positions:
            pos_offset = self.position_writer.write_positions(docnum, positions)

            # Retain the first record offset for a subsequent index entry.

            if section_offset is None:
                section_offset = pos_offset

            frequency += len(positions)
            section_size += 1

            # Count the document before testing, so that a section holds
            # exactly as many documents as its index record states.

            if section_size == self.interval:
                entry_offset = self.position_index_writer.write_positions(
                    docnum, section_offset, section_size)

                if index_offset is None:
                    index_offset = entry_offset

                section_offset = None
                section_size = 0

        # Finish with an index entry for any remaining documents.

        if section_offset is not None:
            entry_offset = self.position_index_writer.write_positions(
                docnum, section_offset, section_size)

            if index_offset is None:
                index_offset = entry_offset

        return index_offset, frequency, len(doc_positions)

    def close(self):

        "Close both the position writer and the position index writer."

        self.position_writer.close()
        self.position_index_writer.close()
paul@19	497
class PositionDictionaryReader:

    """
    Reading position dictionaries through a position reader and a position
    index reader.
    """

    def __init__(self, position_reader, position_index_reader):

        """
        Initialise the reader with the given 'position_reader' and
        'position_index_reader'.
        """

        self.position_reader = position_reader
        self.position_index_reader = position_index_reader

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator over the document positions whose index records
        start at 'offset' in the position index file and which cover
        'doc_frequency' documents.
        """

        return PositionDictionaryIterator(
            self.position_reader,
            self.position_index_reader,
            offset,
            doc_frequency)

    def close(self):

        "Close both the position reader and the position index reader."

        self.position_reader.close()
        self.position_index_reader.close()
paul@19	519
paul@19	520	class PositionDictionaryIterator:
paul@19	521
paul@19	522	"Iteration over position dictionary entries."
paul@19	523
paul@19	524	def __init__(self, position_reader, position_index_reader, offset, doc_frequency):
paul@19	525	self.position_reader = position_reader
paul@19	526
paul@19	527	self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
paul@19	528	self.next_section()
paul@19	529	self.init_section()
paul@0	530
paul@18	531	def __iter__(self):
paul@18	532	return self
paul@18	533
paul@18	534	def next(self):
paul@0	535
paul@19	536	# Attempt to get the next document record from the section in the positions file.
paul@19	537
paul@19	538	while 1:
paul@19	539
paul@19	540	# Either return the next record.
paul@19	541
paul@19	542	try:
paul@19	543	return self.iterator.next()
paul@0	544
paul@19	545	# Or, where a section is finished, get the next section and try again.
paul@19	546
paul@19	547	except StopIteration:
paul@19	548	self.next_section()
paul@19	549	self.iterator.replenish(self.section_count)
paul@19	550
paul@19	551	def next_section(self):
paul@19	552	self.docnum, self.pos_offset, self.section_count = self.index_iterator.read_positions()
paul@19	553
paul@19	554	def init_section(self):
paul@19	555	self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
paul@0	556
class TermWriter(FileWriter):

    """
    Writing term information to files, with each term stored as the part
    differing from the previous term.
    """

    def reset(self):

        "Forget the last term and offset written."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file.

        Return the file offset immediately after the written information.
        """

        # Store only the part of the term not shared with the previous term,
        # preceded by the length of the shared prefix.

        shared = len(commonprefix([self.last_term, term]))

        self.write_number(shared)
        self.write_string(term[shared:])

        # The offset is stored relative to that of the previous term.

        self.write_number(offset - self.last_offset)

        self.write_number(frequency)
        self.write_number(doc_frequency)

        self.last_term, self.last_offset = term, offset

        return self.f.tell()
paul@3	598
class TermReader(FileReader):

    """
    Reading term information from files in which each term is stored as the
    part differing from the previous term.
    """

    def reset(self):

        "Forget the last term and offset read."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency and its document
        frequency from the term information file, returning them as a tuple.
        """

        # Rebuild the term from the prefix shared with the previous term and
        # the stored suffix.

        shared = self.read_number()
        suffix = self.read_string()

        self.last_term = self.last_term[:shared] + suffix

        # The offset is stored relative to that of the previous term.

        self.last_offset += self.read_number()

        frequency = self.read_number()
        doc_frequency = self.read_number()

        return self.last_term, self.last_offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
        permits the scanning for later terms from the specified term, since
        the given details become the basis for decoding the following entry.
        """

        self.f.seek(info_offset)
        self.last_term, self.last_offset = term, offset
paul@3	645
class TermIndexWriter(TermWriter):

    """
    Writing term dictionary index details to files: term records followed by
    an offset into the term information file.
    """

    def reset(self):

        "Forget the last term, offset and information file offset written."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' to the term dictionary index file, along with the
        'info_offset' in the term information file.
        """

        TermWriter.write_term(self, term, offset, frequency, doc_frequency)

        # The information file offset is stored relative to that of the
        # previous index entry.

        info_delta = info_offset - self.last_info_offset
        self.write_number(info_delta)
        self.last_info_offset = info_offset
paul@3	668
class TermIndexReader(TermReader):

    """
    Reading term dictionary index details from files: term records followed by
    an offset into the term information file.
    """

    def reset(self):

        "Forget the last term, offset and information file offset read."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency, its document
        frequency and a term information file offset from the term dictionary
        index file, returning them as a tuple.
        """

        details = TermReader.read_term(self)

        # The information file offset is stored relative to that of the
        # previous index entry.

        self.last_info_offset += self.read_number()

        term, offset, frequency, doc_frequency = details
        return term, offset, frequency, doc_frequency, self.last_info_offset
paul@3	692
class TermDictionaryWriter:

    """
    Writing term dictionaries: every term goes to the term information file,
    and every 'interval'th term, starting with the first, also goes to the
    index file.
    """

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Initialise the writer with the given 'info_writer', 'index_writer' and
        'position_dict_writer', indexing one term in every 'interval'.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval
        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file, adding an index entry where the term falls on
        the indexing interval. Nothing is returned.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)

        # The index entry records where the term's information ends, so that
        # scanning for later terms can start from there.

        indexed = self.entry % self.interval == 0
        if indexed:
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        details = self.position_dict_writer.write_term_positions(doc_positions)
        offset, frequency, doc_frequency = details
        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        "Close the information, index and position dictionary writers."

        self.info_writer.close()
        self.index_writer.close()
        self.position_dict_writer.close()
paul@3	734
class TermDictionaryReader:

    """
    Reading term dictionaries: the index is loaded into memory and used to
    find a starting point from which the term information file is scanned.
    """

    def __init__(self, info_reader, index_reader, position_reader):

        """
        Initialise the reader with the given 'info_reader', 'index_reader' and
        'position_reader', reading all index entries into memory.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_reader = position_reader

        # Load index entries until the index file is exhausted.

        self.terms = []
        try:
            while 1:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # A large number for ordering purposes: an offset greater than that of
        # any index entry. An empty index is permitted and simply yields no
        # terms.

        if self.terms:
            self.max_offset = self.terms[-1][1] + 1
        else:
            self.max_offset = 0

    def _find_term(self, term):

        """
        Find the position file offset, frequency and document frequency of
        'term' from the term dictionary, returning them as a tuple, or None if
        the term is not present.
        """

        # Get the entry position providing the term or one preceding it. The
        # probe carries the largest offset so that it sorts after an index
        # entry for the same term.

        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1

        if i == -1:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = self.terms[i]

        # Where the term is found immediately, return the offset and
        # frequencies.

        if term == found_term:
            return offset, frequency, doc_frequency

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
        except EOFError:
            pass

        # If the term is found, return the offset and frequencies.

        if term == found_term:
            return offset, frequency, doc_frequency

        return None

    def rewind(self):

        "Rewind the term information reader to the start of its file."

        self.info_reader.rewind()

    def _get_positions(self, offset, doc_frequency):

        """
        Return the positions found at 'offset' for the given 'doc_frequency',
        as provided by the position reader.
        """

        return self.position_reader.read_term_positions(offset, doc_frequency)

    def read_term(self):

        """
        Return the next term, its frequency, its document frequency, and the
        documents and positions at which the term is found.
        """

        term, offset, frequency, doc_frequency = self.info_reader.read_term()
        positions = self._get_positions(offset, doc_frequency)
        return term, frequency, doc_frequency, positions

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or None if the term is not present.
        """

        found = self._find_term(term)
        if found is None:
            return None

        offset, frequency, doc_frequency = found
        return self._get_positions(offset, doc_frequency)

    def get_frequency(self, term):

        "Return the frequency of the given 'term', or None if it is not present."

        found = self._find_term(term)
        if found is None:
            return None

        return found[1]

    def get_document_frequency(self, term):

        """
        Return the document frequency of the given 'term', or None if it is
        not present.
        """

        found = self._find_term(term)
        if found is None:
            return None

        return found[2]

    def close(self):

        "Close the information, index and position readers."

        self.info_reader.close()
        self.index_reader.close()
        self.position_reader.close()
paul@3	849
paul@9	850	# Specific classes for storing document information.
paul@9	851
class FieldWriter(FileWriter):

    """
    Writing field data to files as records of a document number delta, a field
    count and the fields themselves.
    """

    def reset(self):

        "Forget the last document number so that the next delta is absolute."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        Return the offset at which the fields are stored.
        """

        record_offset = self.f.tell()

        # The document number is stored relative to the previous record.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(fields))

        # Each field is its identifier followed by its value, with compression
        # requested for the value.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1)

        self.last_docnum = docnum
        return record_offset
paul@8	885
class FieldReader(FileReader):

    """
    Reading field data from files: records of a document number delta, a field
    count and the fields themselves.
    """

    def reset(self):

        "Forget the last document number so that the next delta is absolute."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # The document number is stored relative to the previous record.

        self.last_docnum += self.read_number()

        nfields = self.read_number()

        # Each field is its identifier followed by its value, which is
        # stored with a compression flag.

        fields = []

        while len(fields) < nfields:
            identifier = self.read_number()
            value = self.read_string(1)
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.
        """

        self.f.seek(offset)

        # The document number decoded here is built on whatever record was
        # read before the seek, so it is discarded in favour of the given one.

        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
paul@12	933
class FieldIndexWriter(FileWriter):

    """
    Writing field index details to files as pairs of a document number delta
    and a fields file offset delta.
    """

    def reset(self):

        "Forget the last document number and offset written."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # Both values are stored relative to those of the previous entry.

        docnum_delta = docnum - self.last_docnum
        offset_delta = offset - self.last_offset

        self.write_number(docnum_delta)
        self.write_number(offset_delta)

        self.last_docnum, self.last_offset = docnum, offset
paul@9	956
class FieldIndexReader(FileReader):

    """
    Reading field index details from files: pairs of a document number delta
    and a fields file offset delta.
    """

    def reset(self):

        "Forget the last document number and offset read."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read an entry, returning a tuple of the absolute document number and
        the absolute fields file offset.
        """

        # Both values are stored relative to those of the previous entry.

        docnum_delta = self.read_number()
        self.last_docnum += docnum_delta

        offset_delta = self.read_number()
        self.last_offset += offset_delta

        return self.last_docnum, self.last_offset
paul@9	975
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with a 'field_writer' receiving every document,
        a 'field_index_writer' receiving an entry for one document in every
        'interval' documents, and the 'interval' itself.
        """

        self.entry = 0
        self.interval = interval
        self.field_writer = field_writer
        self.field_index_writer = field_index_writer

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Only every 'interval'-th document, starting with the first, is
        # recorded in the index.

        if not self.entry % self.interval:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close the field writer and then the field index writer."

        for writer in (self.field_writer, self.field_index_writer):
            writer.close()
paul@9	1000
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with a 'field_reader' providing document fields
        and a 'field_index_reader' whose entries are all read immediately and
        retained as a list of (document number, offset) tuples.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Load every index entry, stopping at the end of the index file.

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes. An empty index has no entries
        # to order, so any value will do there.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = 0

    def rewind(self):

        "Rewind the underlying field reader."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning
        None if the document is not present.
        """

        # Get the position of the index entry for the document or of the
        # closest entry preceding it.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # No entry exists at or before the document (or the index is empty).

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary. Reaching the end of the file
        # leaves the last document read as the candidate.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields

        return None

    def close(self):

        "Close the field reader and then the field index reader."

        self.field_reader.close()
        self.field_index_reader.close()
paul@8	1064
paul@12	1065	# Dictionary merging classes.
paul@12	1066
class Merger:

    "Merge files."

    def __init__(self, writer, readers):

        """
        Initialise the merger with the 'writer' receiving merged output and
        the sequence of 'readers' providing the input.
        """

        self.readers = readers
        self.writer = writer

    def close(self):

        "Close each of the readers in turn and then the writer."

        closing = list(self.readers) + [self.writer]

        for participant in closing:
            participant.close()
paul@13	1079
class TermDictionaryMerger(Merger):

    "Merge term and position files."

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the writer.
        Each reader is rewound and then consumed in step with the others, with
        the lowest pending term being written at each stage. Where several
        readers provide the same term, their positions are combined into a
        single record.
        """

        # Pending (term, positions, partition) entries, kept in sorted order.

        entries = []

        # Get the first entries from the readers.

        for partition, reader in enumerate(self.readers):
            reader.rewind()
            self._queue_next_term(entries, partition)

        # While entries are available, write them out in order, merging where
        # appropriate.

        while entries:
            term, doc_positions, partition = entries[0]
            to_update = [partition]

            nentries = len(entries)
            i = 1

            # Find other entries for the term and merge their positions. Since
            # the entries are sorted, such entries are adjacent.

            while i < nentries and entries[i][0] == term:
                other_term, other_doc_positions, other_partition = entries[i]
                doc_positions = self.merge_positions(doc_positions, other_doc_positions)
                to_update.append(other_partition)
                i += 1

            # Write the combined term details.

            self.writer.write_term_positions(term, doc_positions)

            # Update the entries from the affected readers.

            del entries[:i]

            for partition in to_update:
                self._queue_next_term(entries, partition)

    def _queue_next_term(self, entries, partition):

        """
        Read the next term from the reader for the given 'partition' and
        insert it into the sorted 'entries' list. Nothing is inserted if the
        reader is exhausted.
        """

        try:
            term, frequency, doc_frequency, positions = self.readers[partition].read_term()
        except EOFError:
            return

        insort_right(entries, (term, positions, partition))

    def merge_positions(self, doc_positions, other_doc_positions):

        """
        Merge 'doc_positions' with 'other_doc_positions' so that common document
        records contain positions from both collections. A new list of
        (document number, positions) tuples is returned in ascending document
        number order, and the position lists supplied are not modified.
        """

        doc_position_dict = dict(doc_positions)

        for docnum, positions in other_doc_positions:
            if docnum in doc_position_dict:

                # Build a new list rather than extending the caller's list.

                doc_position_dict[docnum] = doc_position_dict[docnum] + positions
            else:
                doc_position_dict[docnum] = positions

        # Sort to make the result independent of dictionary ordering.

        return sorted(doc_position_dict.items())
paul@13	1158
class FieldDictionaryMerger(Merger):

    "Merge field files."

    def merge(self):

        """
        Merge fields from the readers, sending them to the writer. Each reader
        is rewound and then consumed in step with the others, with the lowest
        pending document number being written at each stage. Where several
        readers provide the same document, their fields are concatenated.
        """

        # Pending (docnum, fields, partition) entries, kept in sorted order.

        pending = []

        # Prime the list with the first document from each reader, ignoring
        # readers which have nothing to offer.

        for partition, reader in enumerate(self.readers):
            reader.rewind()

            try:
                first_docnum, first_fields = reader.read_fields()
            except EOFError:
                continue

            insort_right(pending, (first_docnum, first_fields, partition))

        # Write out documents in order until every reader is exhausted.

        while pending:
            docnum, fields, partition = pending[0]
            to_update = [partition]

            # Gather the fields from adjacent entries for the same document.

            for other_docnum, other_fields, other_partition in pending[1:]:
                if other_docnum != docnum:
                    break

                fields += other_fields
                to_update.append(other_partition)

            # Write the combined document details.

            self.writer.write_fields(docnum, fields)

            # Discard the entries just written: one for each partition noted.

            del pending[:len(to_update)]

            # Replace them with the next document from each affected reader.

            for source in to_update:
                try:
                    next_docnum, next_fields = self.readers[source].read_fields()
                except EOFError:
                    continue

                insort_right(pending, (next_docnum, next_fields, source))
paul@13	1220
paul@13	1221	# Utility functions.
paul@13	1222
def get_term_writer(pathname, partition, interval, doc_interval=None):

    """
    Return a term dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval' for terms and 'doc_interval' for document position records.

    If 'doc_interval' is omitted or None, 'interval' is used for document
    position records as well, so that callers supplying only three arguments
    continue to work.
    """

    if doc_interval is None:
        doc_interval = interval

    tdf = open(join(pathname, "terms-%s" % partition), "wb")
    info_writer = TermWriter(tdf)

    tdif = open(join(pathname, "terms_index-%s" % partition), "wb")
    index_writer = TermIndexWriter(tdif)

    tpf = open(join(pathname, "positions-%s" % partition), "wb")
    positions_writer = PositionWriter(tpf)

    tpif = open(join(pathname, "positions_index-%s" % partition), "wb")
    positions_index_writer = PositionIndexWriter(tpif)

    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)

    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
paul@13	1246
def get_field_writer(pathname, partition, interval):

    """
    Open the fields and fields index files for the given 'partition' under
    the given 'pathname' for writing, returning a field dictionary writer
    which employs the given indexing 'interval'.
    """

    fields_path = join(pathname, "fields-%s" % partition)
    fields_index_path = join(pathname, "fields_index-%s" % partition)

    return FieldDictionaryWriter(
        FieldWriter(open(fields_path, "wb")),
        FieldIndexWriter(open(fields_index_path, "wb")),
        interval
        )
paul@13	1262
def get_term_reader(pathname, partition):

    """
    Open the terms, terms index, positions and positions index files for the
    given 'partition' under the given 'pathname' for reading, returning a term
    dictionary reader built upon them.
    """

    terms_path = join(pathname, "terms-%s" % partition)
    terms_index_path = join(pathname, "terms_index-%s" % partition)
    positions_path = join(pathname, "positions-%s" % partition)
    positions_index_path = join(pathname, "positions_index-%s" % partition)

    # Files are opened in the order in which they are listed above.

    return TermDictionaryReader(
        TermReader(open(terms_path, "rb")),
        TermIndexReader(open(terms_index_path, "rb")),
        PositionDictionaryReader(
            PositionReader(open(positions_path, "rb")),
            PositionIndexReader(open(positions_index_path, "rb"))
            )
        )
paul@14	1285
def get_field_reader(pathname, partition):

    """
    Open the fields and fields index files for the given 'partition' under
    the given 'pathname' for reading, returning a field dictionary reader
    built upon them.
    """

    fields_path = join(pathname, "fields-%s" % partition)
    fields_index_path = join(pathname, "fields_index-%s" % partition)

    return FieldDictionaryReader(
        FieldReader(open(fields_path, "rb")),
        FieldIndexReader(open(fields_index_path, "rb"))
        )
paul@14	1300
def rename_files(pathname, names, from_partition, to_partition):

    """
    Under the given 'pathname', rename the file for each of the given 'names'
    labelled with 'from_partition' so that it is labelled with 'to_partition'
    instead.
    """

    for name in names:
        source = join(pathname, "%s-%s" % (name, from_partition))
        target = join(pathname, "%s-%s" % (name, to_partition))
        rename(source, target)
paul@14	1304
def rename_term_files(pathname, from_partition, to_partition):

    """
    Under the given 'pathname', relabel every file belonging to the term
    dictionary partition 'from_partition' as belonging to 'to_partition'. The
    positions index file opened by get_term_writer and get_term_reader is
    included so that the renamed partition remains readable.
    """

    rename_files(pathname, ("terms", "terms_index", "positions", "positions_index"), from_partition, to_partition)
paul@14	1307
def rename_field_files(pathname, from_partition, to_partition):

    """
    Under the given 'pathname', relabel the fields and fields index files of
    the partition 'from_partition' as belonging to 'to_partition'.
    """

    field_file_names = ("fields", "fields_index")
    rename_files(pathname, field_file_names, from_partition, to_partition)
paul@14	1310
def remove_files(pathname, names, partition):

    """
    Under the given 'pathname', delete the file for each of the given 'names'
    labelled with the given 'partition'.
    """

    for name in names:
        doomed = join(pathname, "%s-%s" % (name, partition))
        remove(doomed)
paul@14	1314
def remove_term_files(pathname, partition):

    """
    Under the given 'pathname', delete every file belonging to the term
    dictionary with the given 'partition' label. The positions index file
    created by get_term_writer is included so that nothing is left behind.
    """

    remove_files(pathname, ("terms", "terms_index", "positions", "positions_index"), partition)
paul@14	1317
def remove_field_files(pathname, partition):

    """
    Under the given 'pathname', delete the fields and fields index files
    labelled with the given 'partition'.
    """

    field_file_names = ("fields", "fields_index")
    remove_files(pathname, field_file_names, partition)
paul@14	1320
paul@8	1321	# High-level classes.
paul@8	1322
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, pathname, interval, flush_interval, doc_interval=None):

        """
        Initialise the writer to produce files under the given 'pathname',
        using the given indexing 'interval', and flushing accumulated data to
        a new partition whenever 'flush_interval' positions or fields have
        been added (a false value disables such automatic flushing).

        The optional 'doc_interval' is the indexing interval for document
        position records, defaulting to 'interval'.
        """

        self.pathname = pathname
        self.interval = interval
        self.flush_interval = flush_interval

        if doc_interval is None:
            doc_interval = interval
        self.doc_interval = doc_interval

        # Labels for the next term and field partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Accumulated data: term -> docnum -> positions, and docnum -> fields.

        self.terms = {}
        self.docs = {}

        # Numbers of positions and fields added since the last automatic flush.

        self.position_counter = 0
        self.field_counter = 0

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.
        """

        self.terms.setdefault(term, {}).setdefault(docnum, []).append(position)

        self.position_counter += 1
        if self.flush_interval and self.position_counter >= self.flush_interval:
            self.flush_terms()
            self.position_counter = 0

    def add_field(self, docnum, identifier, value):

        """
        Add for the document with the given 'docnum' a field having the given
        'identifier' and 'value'.
        """

        self.docs.setdefault(docnum, []).append((identifier, value))

        self.field_counter += 1
        if self.flush_interval and self.field_counter >= self.flush_interval:
            self.flush_fields()
            self.field_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        dict_writer = self.get_term_writer()

        # Write the terms in order, with each term's documents also in order.
        # The writer is closed even if writing fails.

        try:
            for term, doc_positions in sorted(self.terms.items()):
                dict_writer.write_term_positions(term, sorted(doc_positions.items()))
        finally:
            dict_writer.close()

        # Start a new partition with no accumulated terms.

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        field_dict_writer = self.get_field_writer()

        # Write the documents in order. The writer is closed even if writing
        # fails.

        try:
            for docnum, fields in sorted(self.docs.items()):
                field_dict_writer.write_fields(docnum, fields)
        finally:
            field_dict_writer.close()

        # Start a new partition with no accumulated documents.

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any terms and fields which have not yet been written."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
paul@10	1442
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        """
        Open the term and field dictionaries labelled "merged" found under the
        given 'pathname'.
        """

        partition = "merged"
        self.dict_reader = get_term_reader(pathname, partition)
        self.field_dict_reader = get_field_reader(pathname, partition)

    def find_positions(self, term):

        "Return the term dictionary's answer for the positions of 'term'."

        positions = self.dict_reader.find_positions(term)
        return positions

    def get_frequency(self, term):

        "Return the term dictionary's answer for the frequency of 'term'."

        frequency = self.dict_reader.get_frequency(term)
        return frequency

    def get_fields(self, docnum):

        "Return the field dictionary's answer for the fields of 'docnum'."

        fields = self.field_dict_reader.get_fields(docnum)
        return fields

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        for reader in (self.dict_reader, self.field_dict_reader):
            reader.close()
paul@10	1463
class Index:

    "An inverted index solution encapsulating the various components."

    # Names of the files making up each kind of dictionary partition. The
    # complete sets are given here so that the position index files written by
    # get_term_writer are removed and renamed along with the others.

    _term_files = ("terms", "terms_index", "positions", "positions_index")
    _field_files = ("fields", "fields_index")

    # Partition label used for merge output while the inputs are being read.

    _merging = "merging"

    def __init__(self, pathname):

        "Initialise the index to use files under the given 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval' and
        'flush_interval'. The index directory is created if it does not exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, first merging all partitions into one.
        The 'partition' argument is accepted but has no effect, since the
        reader always opens the merged partition. An OSError is raised if the
        index directory does not exist.
        """

        self._require_path()

        # Ensure that only one partition exists.

        self.merge_terms()
        self.merge_fields()

        return self._get_reader(partition)

    def _get_reader(self, partition):

        "Return a reader for the index without merging partitions."

        self._require_path()

        self.reader = IndexReader(self.pathname)
        return self.reader

    def _require_path(self):

        "Raise an OSError if the index directory does not exist."

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

    def merge_terms(self, interval=INTERVAL, doc_interval=None):

        """
        Merge term dictionaries using the given indexing 'interval' for terms
        and 'doc_interval' for document position records, the latter defaulting
        to 'interval'.
        """

        if doc_interval is None:
            doc_interval = interval

        def open_writer(partition):
            return get_term_writer(self.pathname, partition, interval, doc_interval)

        self._merge_partitions("terms-", self._term_files, get_term_reader, open_writer, TermDictionaryMerger)

    def merge_fields(self, interval=INTERVAL):

        "Merge field dictionaries using the given indexing 'interval'."

        def open_writer(partition):
            return get_field_writer(self.pathname, partition, interval)

        self._merge_partitions("fields-", self._field_files, get_field_reader, open_writer, FieldDictionaryMerger)

    def _merge_partitions(self, prefix, names, open_reader, open_writer, merger_class):

        """
        Combine all partitions whose principal file has the given filename
        'prefix' into a single partition labelled "merged". The 'names'
        indicate the files making up each partition; 'open_reader' is called
        with the index pathname and a partition label to obtain a reader;
        'open_writer' is called with a partition label to obtain a writer;
        'merger_class' is instantiated with the writer and readers to perform
        the merge.
        """

        # Discover the partitions. Output left behind by an interrupted merge
        # is ignored and will be overwritten by the next merge.

        partitions = []

        for filename in listdir(self.pathname):
            if filename.startswith(prefix):
                partition = filename[len(prefix):]
                if partition != self._merging:
                    partitions.append(partition)

        if len(partitions) > 1:

            # Write to a temporary partition, since an existing "merged"
            # partition may be amongst those being read.

            readers = [open_reader(self.pathname, partition) for partition in partitions]
            merger = merger_class(open_writer(self._merging), readers)

            try:
                merger.merge()
            finally:
                merger.close()

            # Remove old files and put the merged files in their place.

            for partition in partitions:
                remove_files(self.pathname, names, partition)

            rename_files(self.pathname, names, self._merging, "merged")

        # A lone partition need only be relabelled; no files are opened.

        elif len(partitions) == 1 and partitions[0] != "merged":
            rename_files(self.pathname, names, partitions[0], "merged")

    def close(self):

        "Close any reader and writer obtained from the index."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	1572
paul@0	1573	# vim: tabstop=4 expandtab shiftwidth=4