iixr (annotate iixr.py in 51cf2f142879)

iixr

Annotated iixr.py

14:51cf2f142879

2009-08-30

Paul Boddie

Fixed term discovery in the term dictionary index within the _find_term method. Fixed various typing errors in the merging functionality. Introduced merging into the process of obtaining a reader from top-level Index instances: only merged dictionaries should be accessed. Introduced the renaming of single partitions for reading and the deletion of partitions after merging. Renamed "index-N" to "terms_index-N" for term dictionary index files. Moved dictionary reader acquisition to utility functions, and changed merging methods to use such readers directly. Introduced merging into the test program.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@12	21	from os import listdir, mkdir # index and partition discovery
paul@14	22	from os import remove, rename # partition manipulation
paul@7	23	from os.path import exists, join
paul@2	24	from os.path import commonprefix # to find common string prefixes
paul@3	25	from bisect import bisect_right # to find terms in the dictionary index
paul@12	26	from bisect import insort_right # to maintain a sorted list of data for merging
paul@10	27	import bz2, zlib # for field compression
paul@2	28
# Constants.

# NOTE(review): presumably the default spacing between index entries and the
# number of records held before flushing; neither constant is used in the part
# of the file visible here - confirm against the callers.
INTERVAL = 100
FLUSH_INTERVAL = 1000000

# Compression functions paired with the one-character flag stored in front of
# the data. When writing, they are tried in this order and the first one that
# makes the data shorter is used.
compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress),
]

# The decompression function to apply for each stored flag.
decompressors = {
    "b": bz2.decompress,
    "z": zlib.decompress,
}
paul@10	36
paul@0	37	# Foundation classes.
paul@0	38
class File:

    """
    A minimal wrapper around an open file object, providing the operations
    common to all readers and writers.
    """

    def __init__(self, f):
        """Wrap the open file object 'f' and initialise the record state."""
        self.f = f
        self.reset()

    def reset(self):
        """
        Reset the state kept by the reader or writer between records. This
        base implementation keeps no state and therefore does nothing.
        """
        pass

    def rewind(self):
        """Seek to the start of the file and reset the record state."""
        self.f.seek(0)
        self.reset()

    def close(self):
        """Close the wrapped file. Closing an already closed file is a no-op."""
        wrapped = self.f
        if wrapped is None:
            return
        wrapped.close()
        self.f = None
paul@0	61
class FileWriter(File):

    """Write numbers and strings to a file in a compact form."""

    def write_number(self, number):
        """
        Write 'number' to the file using a variable length encoding.

        The number is split into groups of seven bits which are written from
        the least significant group upwards, one group per byte. The top bit
        of a byte is set when more bytes follow.

        Raise ValueError if 'number' is negative.
        """
        if number < 0:
            raise ValueError("Number %r is negative." % number)

        encoded = []

        # The loop body always runs once, so zero is stored as a single zero
        # byte.
        while True:
            group = number & 127
            number >>= 7
            if number:
                group |= 128
            encoded.append(chr(group))
            if not number:
                break

        self.f.write("".join(encoded))

    def write_string(self, s, compress=0):
        """
        Write 's' to the file, preceded by its length in bytes.

        Unicode objects are encoded as UTF-8 first. If 'compress' is set to a
        true value, a one-character flag is written before the length: the
        flag of the first compressor producing something shorter than the
        original data, or "-" if the data is stored as it is.
        """
        if isinstance(s, unicode):
            s = s.encode("utf-8")

        if compress:
            flag = "-"
            for candidate, compressor in compressors:
                packed = compressor(s)
                if len(packed) < len(s):
                    flag, s = candidate, packed
                    break

            self.f.write(flag)

        # The length recorded is that of the data actually stored.
        self.write_number(len(s))
        self.f.write(s)
paul@2	130
class FileReader(File):

    """Read numbers and strings written in the compact file form."""

    def read_number(self):
        """
        Read a number stored using the variable length encoding and return it.

        Raise EOFError if the file ends before the number is complete.
        """
        value = 0
        shift = 0

        while True:
            byte = self.f.read(1)
            if not byte:
                raise EOFError

            code = ord(byte)

            # The low seven bits carry data; the top bit says more will follow.
            value += (code & 127) << shift
            if not code & 128:
                return value

            shift += 7

    def read_string(self, decompress=0):
        """
        Read a length-prefixed string from the file and return it as a Unicode
        object decoded from UTF-8.

        If 'decompress' is set to a true value, a one-character flag is read
        before the length and, unless it is "-", the stored data is passed
        through the decompression function registered for that flag.
        """
        flag = "-"
        if decompress:
            flag = self.f.read(1)

        data = self.f.read(self.read_number())

        if flag != "-":
            data = decompressors[flag](data)

        return unicode(data, "utf-8")
paul@2	185
paul@9	186	# Specific classes for storing term and position information.
paul@0	187
class PositionWriter(FileWriter):

    """Write document numbers and term positions to a file."""

    def reset(self):
        """Forget the last document number so that the next delta is absolute."""
        self.last_docnum = 0

    def write_positions(self, docnum, positions):
        """
        Write for the document 'docnum' the given 'positions'.

        The document number is stored as a delta from the previously written
        one, followed by the number of positions and the positions themselves
        as deltas. The 'positions' list is sorted in place.

        Raise ValueError if 'docnum' is less than the previous document number.
        """
        previous_docnum = self.last_docnum

        if docnum < previous_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, previous_docnum))

        self.write_number(docnum - previous_docnum)
        self.write_number(len(positions))

        # Deltas are only non-negative if the positions are in ascending order.
        positions.sort()

        previous_position = 0
        for position in positions:
            self.write_number(position - previous_position)
            previous_position = position

        self.last_docnum = docnum

    def write_term_positions(self, doc_positions):
        """
        Write all 'doc_positions' - a collection of (document number, position
        list) tuples - to the file.

        Return a tuple containing the file offset at which the records start
        and the frequency of the term: the total number of positions written.
        """
        self.reset()
        start = self.f.tell()

        # The record count comes first so that the reader knows when to stop.
        self.write_number(len(doc_positions))

        total = 0
        for docnum, positions in doc_positions:
            self.write_positions(docnum, positions)
            total += len(positions)

        return start, total
paul@0	251
class PositionReader(FileReader):

    """Read document numbers and term positions from a file."""

    def reset(self):
        """Forget the last document number so that the next delta is absolute."""
        self.last_docnum = 0

    def read_positions(self):
        """
        Read one record, returning a tuple containing a document number and a
        list of absolute positions.
        """
        # Document numbers are stored as deltas from the previous record.
        self.last_docnum += self.read_number()

        remaining = self.read_number()
        positions = []
        position = 0

        # Positions are deltas too: accumulate them to recover absolute values.
        while remaining > 0:
            position += self.read_number()
            positions.append(position)
            remaining -= 1

        return self.last_docnum, positions

    def read_term_positions(self, offset):
        """
        Seek to 'offset' in the file and read all the records stored there,
        returning a list of (document number, position list) tuples.
        """
        self.reset()
        self.f.seek(offset)

        remaining = self.read_number()
        records = []

        while remaining > 0:
            records.append(self.read_positions())
            remaining -= 1

        return records
paul@0	309
class TermWriter(FileWriter):

    """Write term information to a file."""

    def reset(self):
        """Forget the previous term and offset used to compute deltas."""
        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency):
        """
        Write the given 'term', its position file 'offset' and its 'frequency'
        to the term information file.

        Only the part of the term not shared with the previously written term
        is stored, together with the length of the shared prefix. The offset
        is stored as a delta from the previous offset.

        Return the file offset reached after writing the entry.
        """
        shared = len(commonprefix([self.last_term, term]))

        self.write_number(shared)
        self.write_string(term[shared:])
        self.write_number(offset - self.last_offset)
        self.write_number(frequency)

        self.last_term, self.last_offset = term, offset

        return self.f.tell()
paul@3	346
class TermReader(FileReader):

    """Read term information from a file."""

    def reset(self):
        """Forget the previous term and offset used to resolve deltas."""
        self.last_term = ""
        self.last_offset = 0

    def read_term(self):
        """
        Read the next entry from the term information file, returning a tuple
        containing the term, its position file offset and its frequency.
        """
        # The term is rebuilt from the prefix it shares with the previous term.
        shared = self.read_number()
        self.last_term = self.last_term[:shared] + self.read_string()

        # The offset is stored as a delta from the previous offset.
        self.last_offset += self.read_number()

        return self.last_term, self.last_offset, self.read_number()

    def go_to_term(self, term, offset, info_offset):
        """
        Seek to 'info_offset' in the file and adopt 'term' and 'offset' as the
        details of the previous entry, so that entries following that term can
        be read from this point.
        """
        self.f.seek(info_offset)
        self.last_term, self.last_offset = term, offset
paul@3	389
class TermIndexWriter(TermWriter):

    """Write term dictionary index entries to a file."""

    def reset(self):
        """Forget the previous term, offset and information file offset."""
        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, info_offset):
        """
        Write the given 'term', its position file 'offset' and its 'frequency'
        to the term dictionary index file, followed by the 'info_offset' in
        the term information file, stored as a delta from the previous one.
        """
        TermWriter.write_term(self, term, offset, frequency)

        delta = info_offset - self.last_info_offset
        self.write_number(delta)
        self.last_info_offset = info_offset
paul@3	412
class TermIndexReader(TermReader):

    """Read term dictionary index entries from a file."""

    def reset(self):
        """Forget the previous term, offset and information file offset."""
        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):
        """
        Read the next entry from the term dictionary index file, returning a
        tuple containing the term, its position file offset, its frequency and
        its offset in the term information file.
        """
        entry = TermReader.read_term(self)

        # The information file offset is stored as a delta.
        self.last_info_offset += self.read_number()

        return entry + (self.last_info_offset,)
paul@3	435
class TermDictionaryWriter:

    """
    Write term dictionaries using a term information writer, an index writer
    and a position writer.
    """

    def __init__(self, info_writer, index_writer, position_writer, interval):
        """
        Initialise the writer with the given 'info_writer', 'index_writer' and
        'position_writer'. Every 'interval' entries, starting with the first,
        an entry is also written to the index.
        """
        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_writer = position_writer
        self.interval = interval
        self.entry = 0

    def _write_term(self, term, offset, frequency):
        """
        Write the given 'term', its position file 'offset' and its 'frequency'
        to the term information file, and to the index as well if this entry
        falls on an interval boundary.
        """
        info_offset = self.info_writer.write_term(term, offset, frequency)

        if not self.entry % self.interval:
            self.index_writer.write_term(term, offset, frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):
        """
        Write the given 'term' together with the 'doc_positions' recording the
        documents and positions at which the term is found.
        """
        details = self.position_writer.write_term_positions(doc_positions)
        offset, frequency = details
        self._write_term(term, offset, frequency)

    def close(self):
        """Close the information, index and position writers, in that order."""
        for writer in (self.info_writer, self.index_writer, self.position_writer):
            writer.close()
paul@3	476
class TermDictionaryReader:

    """
    Read term dictionaries using a term information reader, an index reader
    and a position reader.
    """

    def __init__(self, info_reader, index_reader, position_reader):
        """
        Initialise the reader with the given 'info_reader', 'index_reader' and
        'position_reader', loading every entry of the index into memory.
        """
        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_reader = position_reader

        # Read index entries until the index reader signals the end of file.
        self.terms = []
        try:
            while True:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # An offset larger than any in the index, used so that a search key
        # for a term sorts after the index entry for that same term. An empty
        # index has no last entry to take it from; any value will do there
        # since every lookup then fails.
        if self.terms:
            self.max_offset = self.terms[-1][1] + 1
        else:
            self.max_offset = 0

    def _find_term(self, term):
        """
        Find 'term' in the dictionary, returning a tuple containing its
        position file offset and frequency, or None if the term is not present.
        """
        # Locate the last index entry whose term does not follow 'term'.
        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1

        # The term precedes everything in the index (or the index is empty).
        if i == -1:
            return None

        found_term, offset, frequency, info_offset = self.terms[i]

        # The term may be one of those recorded in the index itself.
        if term == found_term:
            return offset, frequency

        # Otherwise, seek past the index term's entry in the information file
        # and scan forwards until the term is reached or passed.
        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset, frequency = self.info_reader.read_term()
        except EOFError:
            pass

        if term == found_term:
            return offset, frequency
        return None

    def rewind(self):
        """Rewind the term information reader to its first entry."""
        self.info_reader.rewind()

    def _get_positions(self, offset):
        """Return the documents and positions stored at the given 'offset'."""
        return self.position_reader.read_term_positions(offset)

    def read_term(self):
        """
        Return the next term from the information file, its frequency and the
        documents and positions at which the term is found.
        """
        term, offset, frequency = self.info_reader.read_term()
        positions = self._get_positions(offset)
        return term, frequency, positions

    def find_positions(self, term):
        """
        Return the documents and positions at which the given 'term' is found,
        or None if the term is not in the dictionary.
        """
        found = self._find_term(term)
        if found is None:
            return None
        offset, frequency = found
        return self._get_positions(offset)

    def get_frequency(self, term):
        """
        Return the frequency of the given 'term', or None if the term is not
        in the dictionary.
        """
        found = self._find_term(term)
        if found is None:
            return None
        offset, frequency = found
        return frequency

    def close(self):
        """Close the information, index and position readers."""
        self.info_reader.close()
        self.index_reader.close()
        self.position_reader.close()
paul@3	579
paul@9	580	# Specific classes for storing document information.
paul@9	581
class FieldWriter(FileWriter):

    """Write document field data to a file."""

    def reset(self):
        """Forget the last document number so that the next delta is absolute."""
        self.last_docnum = 0

    def write_fields(self, docnum, fields):
        """
        Write for the given 'docnum' a list of 'fields': (integer, string)
        pairs representing field identifiers and values respectively.

        Return the file offset at which the record is stored.
        """
        start = self.f.tell()

        # The document number is stored as a delta from the previous record.
        self.write_number(docnum - self.last_docnum)
        self.write_number(len(fields))

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
        return start
paul@8	615
class FieldReader(FileReader):

    """Read document field data from a file."""

    def reset(self):
        """Forget the last document number so that the next delta is absolute."""
        self.last_docnum = 0

    def read_fields(self):
        """
        Read the next record from the file, returning a tuple containing the
        document number and a list of field (identifier, value) pairs.
        """
        # The document number is stored as a delta from the previous record.
        self.last_docnum += self.read_number()

        remaining = self.read_number()
        fields = []

        while remaining > 0:
            identifier = self.read_number()
            fields.append((identifier, self.read_string(1))) # decompress
            remaining -= 1

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):
        """
        Read the fields for 'docnum' stored at the given 'offset', returning a
        tuple containing 'docnum' and the list of fields.

        The document number computed while reading is discarded in favour of
        'docnum', which is recorded as the last document number so that later
        records can be read in sequence from here.
        """
        self.f.seek(offset)
        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
paul@12	663
class FieldIndexWriter(FileWriter):

    """Write field index entries to a file."""

    def reset(self):
        """Forget the previous document number and offset used for deltas."""
        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):
        """
        Write for the given 'docnum' the 'offset' at which the fields for the
        document are stored in the fields file. Both values are stored as
        deltas from those of the previous entry.
        """
        docnum_delta = docnum - self.last_docnum
        offset_delta = offset - self.last_offset

        self.write_number(docnum_delta)
        self.write_number(offset_delta)

        self.last_docnum, self.last_offset = docnum, offset
paul@9	686
class FieldIndexReader(FileReader):

    """Read field index entries from a file."""

    def reset(self):
        """Forget the previous document number and offset used for deltas."""
        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):
        """
        Read the next entry, returning a tuple containing a document number
        and the offset of that document's fields in the fields file.
        """
        # Both values are stored as deltas from the previous entry.
        docnum = self.last_docnum + self.read_number()
        self.last_docnum = docnum

        offset = self.last_offset + self.read_number()
        self.last_offset = offset

        return docnum, offset
paul@9	705
class FieldDictionaryWriter:

    """Write field dictionaries using a field writer and a field index writer."""

    def __init__(self, field_writer, field_index_writer, interval):
        """
        Initialise the writer with the given 'field_writer' and
        'field_index_writer'. Every 'interval' documents, starting with the
        first, an entry is also written to the index.
        """
        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):
        """Write details of the document with the given 'docnum' and 'fields'."""
        offset = self.field_writer.write_fields(docnum, fields)

        # Only documents on an interval boundary are recorded in the index.
        if not self.entry % self.interval:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):
        """Close the field writer and then the field index writer."""
        for writer in (self.field_writer, self.field_index_writer):
            writer.close()
paul@9	730
class FieldDictionaryReader:

    """Read field dictionaries using a field reader and a field index reader."""

    def __init__(self, field_reader, field_index_reader):
        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader', loading every entry of the index into memory.
        """
        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Read index entries until the index reader signals the end of file.
        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # The offset of the last index entry, used so that a search key for a
        # document sorts after the index entry for that same document. An
        # empty index has no last entry to take it from; any value will do
        # there since every lookup then fails.
        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = 0

    def rewind(self):
        """Rewind the field reader to its first record."""
        self.field_reader.rewind()

    def read_fields(self):
        """Return the next document number and fields."""
        return self.field_reader.read_fields()

    def get_fields(self, docnum):
        """
        Return the fields of the document with the given 'docnum', or None if
        no such document is stored.
        """
        # Locate the last index entry whose document does not follow 'docnum'.
        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # The document precedes everything in the index (or the index is
        # empty).
        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read the indexed document's record from the fields file.
        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan forwards until the document is reached or passed.
        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        if docnum == found_docnum:
            return fields
        return None

    def close(self):
        """Close the field reader and the field index reader."""
        self.field_reader.close()
        self.field_index_reader.close()
paul@8	794
paul@12	795	# Dictionary merging classes.
paul@12	796
class Merger:

    """
    A base class for merging the data provided by several readers into a
    single writer.
    """

    def __init__(self, writer, readers):
        """Initialise the merger with the given 'writer' and list of 'readers'."""
        self.writer = writer
        self.readers = readers

    def close(self):
        """Close each of the readers in turn and then the writer."""
        for source in self.readers:
            source.close()

        self.writer.close()
paul@13	809
class TermDictionaryMerger(Merger):

    """Merge the term and position files of several partitions."""

    def merge(self):
        """
        Merge terms and positions from the readers, sending them to the writer
        in term order. Where a term appears in more than one reader, its
        position records are combined into a single entry.
        """
        # A sorted list of (term, doc_positions, partition) tuples holding the
        # next unwritten entry of each reader that is not yet exhausted.
        entries = []

        for partition, reader in enumerate(self.readers):
            reader.rewind()
            self._queue_next_term(entries, partition)

        while entries:
            term, doc_positions, partition = entries[0]
            to_update = [partition]

            # Gather the entries other readers hold for the same term; being
            # sorted, they directly follow the first entry.
            nentries = len(entries)
            i = 1

            while i < nentries:
                other_term, other_doc_positions, other_partition = entries[i]
                if other_term != term:
                    break
                doc_positions = self.merge_positions(doc_positions, other_doc_positions)
                to_update.append(other_partition)
                i += 1

            self.writer.write_term_positions(term, doc_positions)

            # Replace the entries just written with the next entry from each
            # of the readers that supplied them.
            del entries[:i]

            for partition in to_update:
                self._queue_next_term(entries, partition)

    def _queue_next_term(self, entries, partition):
        """
        Read the next term from the reader for 'partition' and insert it into
        the sorted 'entries'. Do nothing if that reader is exhausted.
        """
        try:
            term, frequency, positions = self.readers[partition].read_term()
        except EOFError:
            return

        insort_right(entries, (term, positions, partition))

    def merge_positions(self, doc_positions, other_doc_positions):
        """
        Merge 'doc_positions' with 'other_doc_positions' so that common
        document records contain positions from both collections.

        Return a list of (document number, position list) tuples in ascending
        document number order, which is the order the position writer requires.
        Neither argument is modified.
        """
        merged = dict(doc_positions)

        for docnum, positions in other_doc_positions:
            if docnum in merged:
                # Build a new list rather than extending the caller's list.
                merged[docnum] = merged[docnum] + positions
            else:
                merged[docnum] = positions

        # Dictionary order is arbitrary: sort so that document numbers never
        # decrease when the records are written.
        return sorted(merged.items())
paul@13	888
class FieldDictionaryMerger(Merger):

    """Merge the field files of several partitions."""

    def merge(self):
        """
        Merge fields from the readers, sending them to the writer in document
        number order. Where a document appears in more than one reader, its
        fields are combined into a single record.
        """
        # A sorted list of (docnum, fields, partition) tuples holding the next
        # unwritten record of each reader that is not yet exhausted.
        entries = []

        for partition, reader in enumerate(self.readers):
            reader.rewind()
            self._queue_next_fields(entries, partition)

        while entries:
            docnum, fields, partition = entries[0]
            to_update = [partition]

            # Gather the records other readers hold for the same document;
            # being sorted, they directly follow the first entry.
            nentries = len(entries)
            i = 1

            while i < nentries:
                other_docnum, other_fields, other_partition = entries[i]
                if other_docnum != docnum:
                    break
                fields = fields + other_fields
                to_update.append(other_partition)
                i += 1

            self.writer.write_fields(docnum, fields)

            # Replace the records just written with the next record from each
            # of the readers that supplied them.
            del entries[:i]

            for partition in to_update:
                self._queue_next_fields(entries, partition)

    def _queue_next_fields(self, entries, partition):
        """
        Read the next record from the reader for 'partition' and insert it
        into the sorted 'entries'. Do nothing if that reader is exhausted.
        """
        try:
            docnum, fields = self.readers[partition].read_fields()
        except EOFError:
            return

        insort_right(entries, (docnum, fields, partition))
paul@13	950
paul@13	951	# Utility functions.
paul@13	952
def get_term_writer(pathname, partition, interval):
    """
    Return a term dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.

    The term information, term index and position files are opened for binary
    writing, in that order.
    """
    info_writer = TermWriter(
        open(join(pathname, "terms-%s" % partition), "wb"))

    index_writer = TermIndexWriter(
        open(join(pathname, "terms_index-%s" % partition), "wb"))

    positions_writer = PositionWriter(
        open(join(pathname, "positions-%s" % partition), "wb"))

    return TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)
paul@13	971
def get_field_writer(pathname, partition, interval):
    """
    Return a field dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.

    The fields file and then the fields index file are opened for binary
    writing.
    """
    field_writer = FieldWriter(
        open(join(pathname, "fields-%s" % partition), "wb"))

    field_index_writer = FieldIndexWriter(
        open(join(pathname, "fields_index-%s" % partition), "wb"))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
paul@13	987
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.

    The "terms", "terms_index" and "positions" files of the partition are
    opened for binary reading. Should any of them fail to open, the files
    opened so far are closed before the error is propagated.
    """

    # Open every file before building any reader so that a missing file
    # cannot strand the handles that were opened before it.

    files = []
    try:
        for name in ("terms", "terms_index", "positions"):
            files.append(open(join(pathname, "%s-%s" % (name, partition)), "rb"))
    except (IOError, OSError):
        for f in files:
            f.close()
        raise

    tdf, tdif, tpf = files

    info_reader = TermReader(tdf)
    index_reader = TermIndexReader(tdif)
    positions_reader = PositionReader(tpf)

    return TermDictionaryReader(info_reader, index_reader, positions_reader)
paul@14	1005
def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.

    The "fields" and "fields_index" files of the partition are opened for
    binary reading. Should the second fail to open, the first is closed before
    the error is propagated.
    """

    # Open every file before building any reader so that a missing file
    # cannot strand the handle that was opened before it.

    files = []
    try:
        for name in ("fields", "fields_index"):
            files.append(open(join(pathname, "%s-%s" % (name, partition)), "rb"))
    except (IOError, OSError):
        for f in files:
            f.close()
        raise

    ff, fif = files

    field_reader = FieldReader(ff)
    field_index_reader = FieldIndexReader(fif)

    return FieldDictionaryReader(field_reader, field_index_reader)
paul@14	1020
def rename_files(pathname, names, from_partition, to_partition):

    """
    Rename, for each entry in 'names', the file under the given 'pathname'
    labelled with 'from_partition' so that it is labelled with 'to_partition'
    instead.
    """

    for name in names:
        source = join(pathname, "%s-%s" % (name, from_partition))
        target = join(pathname, "%s-%s" % (name, to_partition))
        rename(source, target)
paul@14	1024
def rename_term_files(pathname, from_partition, to_partition):

    """
    Rename the "terms", "terms_index" and "positions" files under the given
    'pathname' from the 'from_partition' label to the 'to_partition' label.
    """

    term_file_names = ("terms", "terms_index", "positions")
    rename_files(pathname, term_file_names, from_partition, to_partition)
paul@14	1027
def rename_field_files(pathname, from_partition, to_partition):

    """
    Rename the "fields" and "fields_index" files under the given 'pathname'
    from the 'from_partition' label to the 'to_partition' label.
    """

    field_file_names = ("fields", "fields_index")
    rename_files(pathname, field_file_names, from_partition, to_partition)
paul@14	1030
def remove_files(pathname, names, partition):

    """
    Remove, for each entry in 'names', the file under the given 'pathname'
    labelled with the given 'partition'.
    """

    for name in names:
        filename = "%s-%s" % (name, partition)
        remove(join(pathname, filename))
paul@14	1034
def remove_term_files(pathname, partition):

    """
    Remove the "terms", "terms_index" and "positions" files under the given
    'pathname' that are labelled with the given 'partition'.
    """

    term_file_names = ("terms", "terms_index", "positions")
    remove_files(pathname, term_file_names, partition)
paul@14	1037
def remove_field_files(pathname, partition):

    """
    Remove the "fields" and "fields_index" files under the given 'pathname'
    that are labelled with the given 'partition'.
    """

    field_file_names = ("fields", "fields_index")
    remove_files(pathname, field_file_names, partition)
paul@14	1040
paul@8	1041	# High-level classes.
paul@8	1042
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.

    Term positions and document fields are accumulated in memory and written
    out as numbered partitions, either when 'flush_interval' additions have
    been made since the previous flush or when the writer is closed.
    """

    def __init__(self, pathname, interval, flush_interval):

        """
        Initialise the writer to produce files under the given 'pathname',
        passing the given indexing 'interval' to the dictionary writers. After
        every 'flush_interval' positions (or fields) the accumulated terms (or
        fields) are flushed to a new partition; a false 'flush_interval'
        disables automatic flushing.
        """

        self.pathname = pathname
        self.interval = interval
        self.flush_interval = flush_interval

        # Labels of the next partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Pending data: term -> {docnum -> [position, ...]} and
        # docnum -> [(identifier, value), ...].

        self.terms = {}
        self.docs = {}

        # Additions made since the last flush of each kind.

        self.position_counter = 0
        self.field_counter = 0

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.
        """

        doc_positions = self.terms.setdefault(term, {})
        doc_positions.setdefault(docnum, []).append(position)

        self.position_counter += 1
        if self.flush_interval and self.position_counter >= self.flush_interval:
            self.flush_terms()

    def add_field(self, docnum, identifier, value):

        """
        Add for the document with the given 'docnum' a field having the given
        'identifier' and 'value'.
        """

        self.docs.setdefault(docnum, []).append((identifier, value))

        self.field_counter += 1
        if self.flush_interval and self.field_counter >= self.flush_interval:
            self.flush_fields()

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        # Get the terms in order.

        terms = sorted(self.terms.items())

        dict_writer = self.get_term_writer()

        # Close the writer even if writing fails part of the way through.

        try:
            for term, doc_positions in terms:
                dict_writer.write_term_positions(term, sorted(doc_positions.items()))
        finally:
            dict_writer.close()

        self.terms = {}
        self.dict_partition += 1

        # Start counting afresh: without this, every addition made after the
        # first automatic flush would trigger yet another flush.

        self.position_counter = 0

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        # Get the documents in order.

        docs = sorted(self.docs.items())

        field_dict_writer = self.get_field_writer()

        # Close the writer even if writing fails part of the way through.

        try:
            for docnum, fields in docs:
                field_dict_writer.write_fields(docnum, fields)
        finally:
            field_dict_writer.close()

        self.docs = {}
        self.field_dict_partition += 1

        # Start counting afresh for the same reason as in flush_terms.

        self.field_counter = 0

    def close(self):

        "Flush any terms and fields that have not yet been written."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
paul@10	1161
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        """
        Open the "merged" term and field dictionaries found under the given
        'pathname'. If the field dictionary cannot be opened, the term
        dictionary reader that was already opened is closed before the error
        is propagated.
        """

        self.dict_reader = get_term_reader(pathname, "merged")
        try:
            self.field_dict_reader = get_field_reader(pathname, "merged")
        except Exception:
            self.dict_reader.close()
            raise

    def find_positions(self, term):

        "Return the result of the term dictionary lookup of 'term' positions."

        return self.dict_reader.find_positions(term)

    def get_frequency(self, term):

        "Return the frequency reported by the term dictionary for 'term'."

        return self.dict_reader.get_frequency(term)

    def get_fields(self, docnum):

        "Return the fields held by the field dictionary for 'docnum'."

        return self.field_dict_reader.get_fields(docnum)

    def close(self):

        "Close both dictionary readers, even if closing the first one fails."

        try:
            self.dict_reader.close()
        finally:
            self.field_dict_reader.close()
paul@10	1182
class Index:

    "An inverted index solution encapsulating the various components."

    # Label of the scratch partition receiving merge output. Merging never
    # writes straight into "merged" because an existing "merged" partition may
    # itself be one of the inputs still being read.

    MERGING = "merging"

    def __init__(self, pathname):

        "Initialise the index for the directory given by 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval' and
        'flush_interval'. The index directory is created if it does not exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, first merging the term and field
        partitions so that only the "merged" partition remains to be read.
        The 'partition' argument is accepted for compatibility but does not
        influence which partition is opened.

        Raise OSError if the index path does not exist.
        """

        # Check the path before merging, since merging lists the directory
        # and would otherwise fail first with a less helpful error.

        self._check_pathname()

        # Ensure that only one partition exists.

        self.merge_terms()
        self.merge_fields()

        return self._get_reader(partition)

    def _get_reader(self, partition):

        "Return a reader for the index."

        self._check_pathname()

        self.reader = IndexReader(self.pathname)
        return self.reader

    def _check_pathname(self):

        "Raise OSError if the index path does not exist."

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

    def merge_terms(self, interval=INTERVAL):

        "Merge term dictionaries using the given indexing 'interval'."

        self._merge("terms-", get_term_reader, get_term_writer,
            TermDictionaryMerger, rename_term_files, remove_term_files, interval)

    def merge_fields(self, interval=INTERVAL):

        "Merge field dictionaries using the given indexing 'interval'."

        self._merge("fields-", get_field_reader, get_field_writer,
            FieldDictionaryMerger, rename_field_files, remove_field_files, interval)

    def _merge(self, prefix, get_reader, get_writer, merger_class, rename_partition, remove_partition, interval):

        """
        Combine all partitions whose main file name starts with 'prefix' into
        a single partition labelled "merged".

        The 'get_reader' and 'get_writer' callables open a partition given the
        index path and a partition label (plus 'interval' for the writer),
        'merger_class' is instantiated with the writer and the list of readers,
        and 'rename_partition' and 'remove_partition' relabel and delete the
        files of a partition.
        """

        # Discover the partitions without opening them, so that nothing is
        # left open when no merging turns out to be necessary. Output left
        # behind by an interrupted merge is not an input; it is overwritten.

        partitions = []

        for filename in listdir(self.pathname):
            if filename.startswith(prefix):
                partition = filename[len(prefix):]
                if partition != self.MERGING:
                    partitions.append(partition)

        if len(partitions) > 1:
            readers = [get_reader(self.pathname, partition) for partition in partitions]

            # Write to the scratch partition: writing to "merged" would
            # truncate a previously merged partition that is among the inputs.

            writer = get_writer(self.pathname, self.MERGING, interval)
            merger = merger_class(writer, readers)
            merger.merge()

            # NOTE(review): this presumably closes the writer and the readers,
            # as the files are removed and renamed straight afterwards -
            # confirm against the merger classes.

            merger.close()

            # Remove old files, then give the output its final label.

            for partition in partitions:
                remove_partition(self.pathname, partition)

            rename_partition(self.pathname, self.MERGING, "merged")

        elif partitions and partitions[0] != "merged":
            rename_partition(self.pathname, partitions[0], "merged")

    def close(self):

        "Close and forget any reader and writer obtained from this index."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	1291
paul@0	1292	# vim: tabstop=4 expandtab shiftwidth=4