iixr (annotate iixr.py in 0ba1bf2fa563)

iixr

Annotated iixr.py

12:0ba1bf2fa563

2009-08-30

Paul Boddie

Introduced index "partitions", sequential access to term dictionaries, and some support for merging partitions.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@12	21	from os import listdir, mkdir # index and partition discovery
paul@7	22	from os.path import exists, join
paul@2	23	from os.path import commonprefix # to find common string prefixes
paul@3	24	from bisect import bisect_right # to find terms in the dictionary index
paul@12	25	from bisect import insort_right # to maintain a sorted list of data for merging
paul@10	26	import bz2, zlib # for field compression
paul@2	27
# Constants.

# Default number of dictionary entries between successive index entries.
INTERVAL = 100

# Default 'flush_interval' handed to index writers.
FLUSH_INTERVAL = 1000000

# Compression functions in the order in which they are tried, each paired with
# the one-character flag written in front of data compressed with it.
compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress),
    ]

# Decompression functions keyed by the flag read back from the file.
decompressors = {
    "b" : bz2.decompress,
    "z" : zlib.decompress,
    }
paul@10	35
paul@0	36	# Foundation classes.
paul@0	37
class File:

    "A thin wrapper around an already opened file object."

    def __init__(self, f):

        "Remember the file object 'f' and establish the initial state."

        self.f = f
        self.reset()

    def reset(self):

        """
        Hook for subclasses: reset the state of the reader or writer between
        records. The base implementation does nothing.
        """

    def rewind(self):

        "Seek to the start of the underlying file."

        self.f.seek(0)

    def close(self):

        "Close the underlying file; further calls have no effect."

        stream = self.f
        if stream is None:
            return

        stream.close()
        self.f = None
paul@0	59
class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding: seven bits
        per byte, least significant group first, with the top bit of a byte set
        whenever more bytes follow.

        Negative numbers are not supported and cause a ValueError.
        """

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Zero is stored as a single zero byte.

        if number == 0:
            self.f.write(chr(0))
            return

        encoded = []

        while number != 0:
            group = number & 127
            number = number >> 7

            # Flag every byte except the final, most significant one.

            if number != 0:
                group |= 128

            encoded.append(chr(group))

        self.f.write("".join(encoded))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file preceded by its length. If 'compress' is set to a
        true value, a one-character flag is written first, and the string is
        stored compressed when a compressor manages to shorten it.
        """

        # Unicode objects are stored as UTF-8 encoded strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        if compress:

            # "-" records that the data is stored uncompressed.

            flag = "-"

            # Use the first compressor producing something shorter than the
            # original.

            for candidate, fn in compressors:
                packed = fn(s)
                if len(packed) < len(s):
                    flag, s = candidate, packed
                    break

            self.f.write(flag)

        # The length goes before the data itself.

        self.write_number(len(s))
        self.f.write(s)
paul@2	128
class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a variable length number from the file and return it. EOFError is
        raised if the file ends before the number is complete.
        """

        number = 0
        shift = 0

        while 1:
            byte = self.f.read(1)
            if not byte:
                raise EOFError

            value = ord(byte)

            # The low seven bits carry data; the top bit announces more bytes.

            number += (value & 127) << shift
            if not value & 128:
                return number

            shift += 7

    def read_string(self, decompress=0):

        """
        Read a length-prefixed string from the file and return it as a Unicode
        object. If 'decompress' is set to a true value, a one-character flag is
        read first and the stored data is decompressed accordingly.
        """

        # Without a stored flag, the data is treated as uncompressed.

        if decompress:
            flag = self.f.read(1)
        else:
            flag = "-"

        length = self.read_number()
        data = self.f.read(length)

        # Any flag other than "-" selects a decompression function.

        if flag != "-":
            data = decompressors[flag](data)

        return unicode(data, "utf-8")
paul@2	183
paul@9	184	# Specific classes for storing term and position information.
paul@0	185
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Start document number deltas from zero again."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'. The list of
        positions is sorted in place. A ValueError is raised if 'docnum' is
        less than the previously written document number.
        """

        previous_docnum = self.last_docnum

        if docnum < previous_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Document number delta, then the number of positions.

        self.write_number(docnum - previous_docnum)
        self.write_number(len(positions))

        # Positions are stored as deltas, so they have to be in order.

        positions.sort()

        previous = 0
        for position in positions:
            self.write_number(position - previous)
            previous = position

        self.last_docnum = docnum

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file. Return a tuple containing the
        offset at which they were stored and the frequency (total number of
        positions) for the term involved.
        """

        # Each term's record starts with fresh state at the current offset.

        self.reset()
        start = self.f.tell()

        # The number of documents precedes the per-document records.

        self.write_number(len(doc_positions))

        total = 0

        for docnum, positions in doc_positions:
            self.write_positions(docnum, positions)
            total += len(positions)

        return start, total
paul@0	249
class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Start document number deltas from zero again."

        self.last_docnum = 0

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        # The stored document number is a delta from the previous one.

        self.last_docnum += self.read_number()

        count = self.read_number()

        # Positions are stored as deltas; accumulate them into absolute values.

        positions = []
        current = 0

        for _ in range(count):
            current += self.read_number()
            positions.append(current)

        return self.last_docnum, positions

    def read_term_positions(self, offset):

        """
        Seek to 'offset' in the file and read all position records stored
        there, returning a list of (document number, position list) tuples.
        """

        self.reset()
        self.f.seek(offset)

        # The record starts with the number of documents it contains.

        ndocuments = self.read_number()

        return [self.read_positions() for _ in range(ndocuments)]
paul@0	307
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Forget the previously written term and offset."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency):

        """
        Write the given 'term', its position file 'offset', and its 'frequency'
        to the term information file. Return the file offset reached after the
        term information was written.

        Terms longer than 255 characters cause a ValueError.
        """

        if len(term) > 255:
            raise ValueError("Term %r is too long." % term)

        # Only the part of the term differing from the previous term is
        # stored, preceded by the length of the shared prefix.

        shared = len(commonprefix([self.last_term, term]))

        self.write_number(shared)
        self.write_string(term[shared:])

        # The offset is stored as a delta, followed by the frequency.

        self.write_number(offset - self.last_offset)
        self.write_number(frequency)

        self.last_term, self.last_offset = term, offset

        return self.f.tell()
paul@3	349
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Forget the previously read term and offset."

        self.last_term = ""
        self.last_offset = 0

    def rewind(self):

        "Reset the reader state and return to the start of the file."

        self.reset()
        FileReader.rewind(self)

    def read_term(self):

        """
        Read the next entry from the term information file, returning a tuple
        of the term, its position file offset, and its frequency.
        """

        # The term is rebuilt from the prefix shared with the previous term
        # plus the stored suffix.

        shared = self.read_number()
        term = self.last_term[:shared] + self.read_string()
        self.last_term = term

        # The offset is stored as a delta from the previous offset.

        offset = self.last_offset + self.read_number()
        self.last_offset = offset

        # The frequency is stored last.

        return term, offset, self.read_number()

    def go_to_term(self, term, offset, info_offset):

        """
        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
        permits the scanning for later terms from the specified term.
        """

        self.f.seek(info_offset)

        # Later entries are decoded relative to this term and offset.

        self.last_term, self.last_offset = term, offset
paul@3	396
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Forget the previous term, offset and information file offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, info_offset):

        """
        Write the given 'term', its position file 'offset', and its 'frequency'
        to the term dictionary index file, followed by the 'info_offset' in the
        term information file, stored as a delta from the previous one.
        """

        TermWriter.write_term(self, term, offset, frequency)

        delta = info_offset - self.last_info_offset
        self.write_number(delta)
        self.last_info_offset = info_offset
paul@3	419
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Forget the previous term, offset and information file offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read the next entry from the term dictionary index file, returning a
        tuple of the term, its position file offset, its frequency, and its
        term information file offset.
        """

        entry = TermReader.read_term(self)

        # The information file offset is stored as a delta.

        self.last_info_offset += self.read_number()

        return entry + (self.last_info_offset,)
paul@3	442
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_writer, interval):

        """
        Use 'info_writer', 'index_writer' and 'position_writer' for the three
        files of a dictionary, adding an index entry for every 'interval'
        terms written.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_writer = position_writer
        self.interval = interval

        # Number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency):

        """
        Write the given 'term', its position file 'offset', and its 'frequency'
        to the term information file and, for every 'interval'-th term
        starting with the first, to the index as well.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency)

        is_index_entry = self.entry % self.interval == 0
        if is_index_entry:
            self.index_writer.write_term(term, offset, frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        stored = self.position_writer.write_term_positions(doc_positions)
        offset, frequency = stored
        self._write_term(term, offset, frequency)

    def close(self):

        "Close the information, index and position writers, in that order."

        for writer in (self.info_writer, self.index_writer, self.position_writer):
            writer.close()
paul@3	483
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader, position_reader):

        """
        Use 'info_reader', 'index_reader' and 'position_reader' to access the
        three files of a dictionary. The whole index is read into memory
        immediately; an empty index is permitted.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_reader = position_reader

        # Index entries: (term, offset, frequency, info_offset).

        self.terms = []
        try:
            while 1:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # The index terms alone, used for searching. Searching the entry
        # tuples with a shorter key tuple fails to locate a term that is the
        # only entry in the index.

        self._index_terms = [entry[0] for entry in self.terms]

        # Offsets from the final index entry, kept as attributes for existing
        # users of this class.

        if self.terms:
            last = self.terms[-1]
            self.max_offset = last[1]
            self.max_info_offset = last[3]
        else:
            self.max_offset = 0
            self.max_info_offset = 0

    def _find_term(self, term):

        """
        Find the position file offset and frequency of 'term' from the term
        dictionary, returning them as a tuple, or None if the term is not
        present.
        """

        # Get the last index entry whose term does not exceed the given term.

        i = bisect_right(self._index_terms, term) - 1

        if i == -1:
            return None

        found_term, offset, frequency, info_offset = self.terms[i]

        # Where the term is found immediately, return its details.

        if term == found_term:
            return offset, frequency

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset, frequency = self.info_reader.read_term()
        except EOFError:
            pass

        # If the term is found, return the offset and frequency.

        if term == found_term:
            return offset, frequency

        return None

    def rewind(self):

        "Return to the start of the term information file."

        self.info_reader.rewind()

    def _get_positions(self, offset):

        "Return the documents and positions stored at 'offset'."

        return self.position_reader.read_term_positions(offset)

    def read_term(self):

        """
        Return the next term, its frequency and the documents and positions at
        which the term is found.
        """

        term, offset, frequency = self.info_reader.read_term()
        positions = self._get_positions(offset)
        return term, frequency, positions

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or None if the term is not in the dictionary.
        """

        found = self._find_term(term)
        if found is None:
            return None

        offset, frequency = found
        return self._get_positions(offset)

    def get_frequency(self, term):

        """
        Return the frequency of the given 'term', or None if the term is not
        in the dictionary.
        """

        found = self._find_term(term)
        if found is None:
            return None

        offset, frequency = found
        return frequency

    def close(self):

        "Close the information, index and position readers."

        self.info_reader.close()
        self.index_reader.close()
        self.position_reader.close()
paul@3	587
paul@9	588	# Specific classes for storing document information.
paul@9	589
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Start document number deltas from zero again."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (strings representing
        field values). Return the offset at which the fields are stored.
        """

        start = self.f.tell()

        # Document number delta, then the number of fields.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(fields))

        # Field values are written with compression enabled.

        for value in fields:
            self.write_string(value, compress=1)

        self.last_docnum = docnum

        return start
paul@8	621
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Start document number deltas from zero again."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field values.
        """

        # The document number is stored as a delta from the previous one.

        self.last_docnum += self.read_number()

        nfields = self.read_number()

        # Field values are read with decompression enabled.

        fields = [self.read_string(decompress=1) for _ in range(nfields)]

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.
        """

        self.f.seek(offset)

        # The document number computed while reading is discarded in favour of
        # the given 'docnum'.

        fields = self.read_fields()[1]
        self.last_docnum = docnum

        return docnum, fields
paul@12	667
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Start document number and offset deltas from zero again."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file. Both values are stored as
        deltas from the previously written values.
        """

        docnum_delta = docnum - self.last_docnum
        offset_delta = offset - self.last_offset

        self.write_number(docnum_delta)
        self.write_number(offset_delta)

        self.last_docnum, self.last_offset = docnum, offset
paul@9	690
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Start document number and offset deltas from zero again."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read the next index entry, returning a tuple of the document number and
        the fields file offset.
        """

        # Both values are stored as deltas from the previous entry.

        self.last_docnum = self.last_docnum + self.read_number()
        self.last_offset = self.last_offset + self.read_number()

        return (self.last_docnum, self.last_offset)
paul@9	709
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Use 'field_writer' and 'field_index_writer' for the two files of a
        field dictionary, adding an index entry for every 'interval' documents
        written.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # Number of documents written so far.

        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        stored_at = self.field_writer.write_fields(docnum, fields)

        # Every 'interval'-th document, starting with the first, is indexed.

        is_index_entry = self.entry % self.interval == 0
        if is_index_entry:
            self.field_index_writer.write_document(docnum, stored_at)

        self.entry += 1

    def close(self):

        "Close the field writer and then the field index writer."

        for writer in (self.field_writer, self.field_index_writer):
            writer.close()
paul@9	734
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Use 'field_reader' and 'field_index_reader' to access the two files of
        a field dictionary. The whole index is read into memory immediately;
        an empty index is permitted.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Index entries: (document number, fields file offset).

        self.docs = []
        try:
            while 1:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # The largest indexed offset, used to build a search key sorting after
        # any index entry for the same document number. An empty index has no
        # last entry, so zero is used instead.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = 0

    def read_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning the
        list of field values, or None if the document is not present.
        """

        # Get the last index entry whose document number does not exceed the
        # given number.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read the indexed document from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan forward for the document, if necessary.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields

        return None

    def close(self):

        "Close the field reader and the field index reader."

        self.field_reader.close()
        self.field_index_reader.close()
paul@8	789
paul@12	790	# Dictionary merging classes.
paul@12	791
class TermDictionaryMerger:

    "Merge the terms and positions of several term dictionaries."

    def __init__(self, writer, readers):

        """
        Merge into 'writer' (providing write_term_positions) the terms obtained
        from 'readers' (each providing rewind and read_term).
        """

        self.writer = writer
        self.readers = readers

    def _queue_next(self, entries, partition):

        """
        Read the next term from the reader for 'partition' and insert it into
        the sorted 'entries' list. Nothing is added once the reader is
        exhausted.
        """

        try:
            term, frequency, positions = self.readers[partition].read_term()
        except EOFError:
            return

        insort_right(entries, (term, positions, partition))

    def merge(self):

        """
        Write the terms from all readers to the writer in order, combining the
        document positions of terms appearing in more than one reader.
        """

        # Pending entries, one per unexhausted reader, kept sorted by term.

        entries = []

        # Get the first entries from the readers.

        for partition, reader in enumerate(self.readers):
            reader.rewind()
            self._queue_next(entries, partition)

        # While entries are available, write them out in order, merging where
        # appropriate.

        while entries:
            term, doc_positions, partition = entries[0]
            to_update = [partition]

            # Entries are sorted, so other entries for the term follow
            # directly; merge their positions.

            i = 1
            nentries = len(entries)

            while i < nentries and entries[i][0] == term:
                doc_positions += entries[i][1]
                to_update.append(entries[i][2])
                i += 1

            # Write the combined term details.

            doc_positions.sort()
            self.writer.write_term_positions(term, doc_positions)

            # Replace the consumed entries with the next ones from the
            # affected readers.

            del entries[:i]

            for partition in to_update:
                self._queue_next(entries, partition)
paul@12	853
paul@8	854	# High-level classes.
paul@8	855
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, pathname, interval, flush_interval):

        """
        Write index files to the directory 'pathname', adding a dictionary index
        entry for every 'interval' entries. Once 'flush_interval' positions (or
        field values) have been accumulated, they are written out as a new
        partition; a false value disables such automatic flushing.
        """

        self.pathname = pathname
        self.interval = interval
        self.flush_interval = flush_interval

        # Numbers of the next term and field partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Accumulated data: term -> docnum -> positions, and docnum -> fields.

        self.terms = {}
        self.docs = {}

        # Amounts of data accumulated since the last flush.

        self.position_counter = 0
        self.field_counter = 0

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.
        """

        doc_positions = self.terms.setdefault(term, {})
        doc_positions.setdefault(docnum, []).append(position)

        self.position_counter += 1
        if self.flush_interval and self.position_counter >= self.flush_interval:
            self.flush_terms()

    def add_fields(self, docnum, fields):

        "Add for the document with the given 'docnum' a list of 'fields'."

        # Copy the values so that the caller's list is never modified when
        # more fields are added for the same document.

        if docnum not in self.docs:
            self.docs[docnum] = list(fields)
        else:
            self.docs[docnum] += fields

        self.field_counter += len(fields)
        if self.flush_interval and self.field_counter >= self.flush_interval:
            self.flush_fields()

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        tdf = open(join(self.pathname, "terms-%d" % self.dict_partition), "wb")
        info_writer = TermWriter(tdf)

        tdif = open(join(self.pathname, "index-%d" % self.dict_partition), "wb")
        index_writer = TermIndexWriter(tdif)

        tpf = open(join(self.pathname, "positions-%d" % self.dict_partition), "wb")
        positions_writer = PositionWriter(tpf)

        return TermDictionaryWriter(info_writer, index_writer, positions_writer, self.interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        ff = open(join(self.pathname, "fields-%d" % self.field_dict_partition), "wb")
        field_writer = FieldWriter(ff)

        fif = open(join(self.pathname, "fields_index-%d" % self.field_dict_partition), "wb")
        field_index_writer = FieldIndexWriter(fif)

        return FieldDictionaryWriter(field_writer, field_index_writer, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        dict_writer = self.get_term_writer()

        # Write the terms in order, each with its documents in order, making
        # sure that the files are closed even if writing fails.

        try:
            for term in sorted(self.terms):
                doc_positions = sorted(self.terms[term].items())
                dict_writer.write_term_positions(term, doc_positions)
        finally:
            dict_writer.close()

        # Start accumulating afresh for the next partition.

        self.terms = {}
        self.position_counter = 0
        self.dict_partition += 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        field_dict_writer = self.get_field_writer()

        # Write the documents in order, making sure that the files are closed
        # even if writing fails.

        try:
            for docnum in sorted(self.docs):
                field_dict_writer.write_fields(docnum, self.docs[docnum])
        finally:
            field_dict_writer.close()

        # Start accumulating afresh for the next partition.

        self.docs = {}
        self.field_counter = 0
        self.field_dict_partition += 1

    def close(self):

        "Flush any terms and fields which have not yet been written."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
paul@10	984
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname, partition=0):

        """
        Open the term and field dictionaries of the given 'partition' found in
        the directory 'pathname'.
        """

        self.pathname = pathname
        self.dict_reader = self.get_term_reader(partition)
        self.field_dict_reader = self.get_field_reader(partition)

    def _open(self, pattern, partition):

        """
        Open for reading the file in the index directory whose name is obtained
        by formatting 'pattern' with 'partition'.
        """

        return open(join(self.pathname, pattern % partition), "rb")

    def get_term_reader(self, partition):

        "Return a term dictionary reader for the given 'partition'."

        info_reader = TermReader(self._open("terms-%d", partition))
        index_reader = TermIndexReader(self._open("index-%d", partition))
        positions_reader = PositionReader(self._open("positions-%d", partition))

        return TermDictionaryReader(info_reader, index_reader, positions_reader)

    def get_field_reader(self, partition):

        "Return a field dictionary reader for the given 'partition'."

        field_reader = FieldReader(self._open("fields-%d", partition))
        field_index_reader = FieldIndexReader(self._open("fields_index-%d", partition))

        return FieldDictionaryReader(field_reader, field_index_reader)

    def find_positions(self, term):

        "Return the term dictionary's result for the positions of 'term'."

        return self.dict_reader.find_positions(term)

    def get_frequency(self, term):

        "Return the term dictionary's result for the frequency of 'term'."

        return self.dict_reader.get_frequency(term)

    def get_fields(self, docnum):

        "Return the field dictionary's result for the fields of 'docnum'."

        return self.field_dict_reader.read_fields(docnum)

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        for reader in (self.dict_reader, self.field_dict_reader):
            reader.close()
paul@10	1027
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Represent the index stored in the directory 'pathname'."

        self.pathname = pathname

        # The most recently obtained reader and writer, if any.

        self.reader = None
        self.writer = None

    def get_writer(self, interval=INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval' and
        'flush_interval'. The index directory is created if it does not exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the given 'partition' of the index. An OSError is
        raised if the index directory does not exist.
        """

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname, partition)
        return self.reader

    def merge_terms(self):

        """
        Merge term dictionaries. Currently, this only opens a reader for each
        partition having a term information file; no merged output is written.
        """

        readers = []

        # Only 'listdir' is imported from the os module by this file.

        for filename in listdir(self.pathname):
            if filename.startswith("terms-"): # 6 character prefix
                partition = int(filename[6:])
                readers.append(self.get_reader(partition))

        # NOTE: Make a distinct new writer/index.
        # NOTE: Each get_reader call replaces self.reader, so only the last
        # NOTE: reader obtained here is closed by the close method.

    def close(self):

        "Close the current reader and writer, if any."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	1080
paul@0	1081	# vim: tabstop=4 expandtab shiftwidth=4