iixr (annotate iixr.py in 1293ffb9e43b)

iixr

Annotated iixr.py

38:1293ffb9e43b

2009-09-11

Paul Boddie

Fixed Pyrex implementation for numbers from 0 to 127 inclusive.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@19	21	from os import dup, fdopen # independent iterator access to files
paul@12	22	from os import listdir, mkdir # index and partition discovery
paul@14	23	from os import remove, rename # partition manipulation
paul@7	24	from os.path import exists, join
paul@2	25	from os.path import commonprefix # to find common string prefixes
paul@3	26	from bisect import bisect_right # to find terms in the dictionary index
paul@10	27	import bz2, zlib # for field compression
paul@33	28	from itermerge import itermerge
paul@2	29
# Provide the set type on Python versions lacking the built-in.

try:
    set
except NameError:
    from sets import Set as set

# Constants.

# Intervals used when writing index entries and flushing.

TERM_INTERVAL = 100
DOCUMENT_INTERVAL = 100
FIELD_INTERVAL = 100
FLUSH_INTERVAL = 10000

# Names of the files making up the term and field data.

TERM_FILENAMES = ("terms", "terms_index", "positions", "positions_index")
FIELD_FILENAMES = ("fields", "fields_index")

# Compression functions, tried in this order, and the matching decompression
# functions, each identified by a one-character flag.

compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress),
    ]

decompressors = {
    "b" : bz2.decompress,
    "z" : zlib.decompress,
    }
paul@10	47
paul@35	48	# Utility functions.
paul@35	49
try:
    from vint import vint as _vint

    def vint(number):

        """
        Write 'number' as a variable-length integer, returning the encoded
        string produced by the imported 'vint' extension function.

        Raise ValueError if 'number' is negative.
        """

        if number >= 0:
            return _vint(number)
        else:
            raise ValueError("Number %r is negative." % number)

except ImportError:

    def vint(number):

        """
        Write 'number' as a variable-length integer, returning the encoded
        string. Each character holds 7 bits of the number, least significant
        group first, with the top bit set on every character except the last.

        Raise ValueError if 'number' is negative.
        """

        if number >= 0:

            # Special case: one byte containing a 7-bit number. This also
            # covers zero, for which the loop below would produce nothing.

            if number < 128:
                return chr(number)

            # Write the number from least to most significant digits.

            encoded = []

            while number != 0:
                lsd = number & 127
                number = number >> 7

                # Flag the presence of further digits.

                if number != 0:
                    lsd |= 128
                encoded.append(chr(lsd))

            return "".join(encoded)

        # Negative numbers are not supported.

        else:
            raise ValueError("Number %r is negative." % number)
paul@35	92
paul@0	93	# Foundation classes.
paul@0	94
class File:

    "A basic file abstraction."

    def __init__(self, f):

        "Initialise the abstraction with the file object 'f'."

        self.f = f
        self.reset()

    def reset(self):

        """
        Reset the state of the reader or writer between records. This does
        nothing here and is intended to be overridden by subclasses.
        """

        pass

    def rewind(self):

        "Seek to the start of the file and reset the record state."

        self.f.seek(0)
        self.reset()

    def close(self):

        "Close the file, doing nothing if it has already been closed."

        if self.f is None:
            return

        self.f.close()
        self.f = None
paul@0	117
class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        "Write 'number' to the file using a variable length encoding."

        encoded = vint(number)
        self.f.write(encoded)

    def write_string(self, s, compress=0):

        """
        Write 's' to the file preceded by its length. If 'compress' is set to a
        true value, a one-character flag is written before the length and the
        data is stored compressed where this makes it shorter.
        """

        # Unicode objects are stored as UTF-8 encoded strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        # Without compression, no flag is written at all. With compression,
        # the flag identifies the first compressor producing a string shorter
        # than the original, or is "-" where no compressor achieves this.

        flag = ""

        if compress:
            flag = "-"

            for candidate, fn in compressors:
                compressed = fn(s)

                if len(compressed) < len(s):
                    flag, s = candidate, compressed
                    break

        # Write the length of the data before the data itself.

        self.f.write(flag + vint(len(s)) + s)
paul@2	161
class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a variable-length number from the file. Raise EOFError if the file
        ends before the number is complete.
        """

        read = self.f.read
        number = 0
        shift = 0

        # Each byte contributes 7 bits, least significant group first, with a
        # set top bit indicating that another byte follows. Reading past the
        # end of the file yields an empty string which ord rejects.

        try:
            csd = ord(read(1))

            while csd & 128:
                number += (csd & 127) << shift
                shift += 7
                csd = ord(read(1))

        except TypeError:
            raise EOFError

        # The final byte has no continuation bit to remove.

        return number + (csd << shift)

    def read_string(self, decompress=0):

        """
        Read a string from the file, returning it as a Unicode object. If
        'decompress' is set to a true value, a one-character flag is read
        before the length and used to select how the data is decompressed.
        """

        # A flag is only present where compression was requested on writing.

        flag = "-"

        if decompress:
            flag = self.f.read(1)

        # The length precedes the data itself.

        data = self.f.read(self.read_number())

        # Perform decompression if applicable.

        if flag != "-":
            data = decompressors[flag](data)

        # Convert strings to Unicode objects.

        return unicode(data, "utf-8")
paul@2	215
class FileOpener:

    "Opening files using their filenames."

    def __init__(self, filename):

        "Remember the 'filename' for later opening."

        self.filename = filename

    def open(self, mode):

        "Return a new file object for the stored filename using 'mode'."

        return open(self.filename, mode)

    def close(self):

        "Do nothing: no file is held open by the opener itself."

        pass
paul@34	228
paul@9	229	# Specific classes for storing term and position information.
paul@0	230
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Restart the document number delta encoding from zero."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.
        Return the offset of the written record.

        The 'positions' collection is sorted in place before being written.
        Raise ValueError if 'docnum' is less than the document number of the
        previously written record.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Record the offset of this record.

        offset = self.f.tell()

        # Make sure that the positions are sorted.

        positions.sort()

        # Encode the position deltas.

        output = []
        last = 0

        for position in positions:
            output.append(vint(position - last))
            last = position

        # Write the document number delta.
        # Write the number of positions.
        # Then write the positions.

        self.f.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output))

        self.last_docnum = docnum
        return offset
paul@0	273
class PositionOpener(FileOpener):

    "Reading position information from files."

    def read_term_positions(self, offset, count):

        """
        Return an iterator reading position records starting at 'offset' in
        the file. The number of documents available for reading is limited to
        'count'.
        """

        # Open a separate file object for the iterator and seek to the
        # records of interest.

        handle = self.open("rb")
        handle.seek(offset)

        return PositionIterator(handle, count)
paul@19	291
class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        "Restart the document number and position offset delta encoding."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum', 'pos_offset' and document 'count' to the
        position index file. Return the offset of the written record.
        """

        # Record the offset of this record.

        offset = self.f.tell()

        # Encode the document number delta.

        docnum_delta = vint(docnum - self.last_docnum)
        self.last_docnum = docnum

        # Encode the position file offset delta.

        pos_offset_delta = vint(pos_offset - self.last_pos_offset)
        self.last_pos_offset = pos_offset

        # Write both deltas followed by the document count.

        self.f.write(docnum_delta + pos_offset_delta + vint(count))

        return offset
paul@19	331
class PositionIndexOpener(FileOpener):

    "Reading position index information from files."

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator reading position index entries starting at 'offset'
        in the file. The number of documents available for reading is limited
        to 'doc_frequency'.
        """

        # Open a separate file object for the iterator and seek to the
        # entries of interest.

        handle = self.open("rb")
        handle.seek(offset)

        return PositionIndexIterator(handle, doc_frequency)
paul@34	349
paul@34	350	# Iterators for position-related files.
paul@34	351
class IteratorBase:

    "Common support for iterators limited to a number of documents."

    def __init__(self, count):

        "Initialise the iterator to provide 'count' documents."

        self.replenish(count)

    def replenish(self, count):

        """
        Make 'count' documents available and reset the number of documents
        already read.
        """

        self.read_documents = 0
        self.count = count

    def __len__(self):

        "Return the number of documents made available."

        return self.count

    def sort(self):

        "Do nothing: stored document positions are already sorted."

        pass

    def __iter__(self):

        "Return this object as its own iterator."

        return self
paul@34	369
class PositionIterator(FileReader, IteratorBase):

    "Iterating over document positions."

    def __init__(self, f, count):

        "Initialise the iterator to read 'count' documents from the file 'f'."

        FileReader.__init__(self, f)
        IteratorBase.__init__(self, count)

    def reset(self):

        "Restart the document number delta decoding from zero."

        self.last_docnum = 0

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        # Add the document number delta to the last number.

        self.last_docnum += self.read_number()

        # The number of positions precedes the positions themselves.

        remaining = self.read_number()

        # Accumulate the position deltas to obtain absolute positions.

        positions = []
        position = 0

        while remaining > 0:
            position += self.read_number()
            positions.append(position)
            remaining -= 1

        return self.last_docnum, positions

    def next(self):

        """
        Read positions for a single document, raising StopIteration once the
        available number of documents has been read.
        """

        if self.read_documents >= self.count:
            raise StopIteration

        self.read_documents += 1
        return self.read_positions()
paul@34	416
class PositionIndexIterator(FileReader, IteratorBase):

    "Iterating over position index entries."

    def __init__(self, f, count):

        """
        Initialise the iterator to read index entries from the file 'f' until
        the sections described cover 'count' documents.
        """

        FileReader.__init__(self, f)
        IteratorBase.__init__(self, count)

        # The number of documents in the most recently read section.

        self.section_count = 0

    def reset(self):

        "Restart the document number and position offset delta decoding."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read a document number, a position file offset for the position index
        file, and the number of documents in a section of that file.
        """

        # Read the document number delta.

        self.last_docnum += self.read_number()

        # Read the offset delta.

        self.last_pos_offset += self.read_number()

        # Read the document count.

        count = self.read_number()

        return self.last_docnum, self.last_pos_offset, count

    def next(self):

        """
        Read the index entry for the next section, returning a tuple of the
        form (document number, position file offset, section document count).
        Raise StopIteration once the sections already read account for all of
        the available documents.
        """

        # Account for the documents in the previously returned section.

        self.read_documents += self.section_count

        if self.read_documents < self.count:
            entry = self.read_positions()

            # Remember the size of this section for the next call.

            self.section_count = entry[2]
            return entry
        else:
            raise StopIteration
paul@19	461
class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Initialise the writer with the given 'position_writer' and
        'position_index_writer', writing an index entry for every 'interval'
        documents.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file. The collection is sorted in
        place before being written.

        Add some records to the index, making dictionary entries.

        Return a tuple containing the offset of the first index entry written,
        the frequency (number of positions), and document frequency (number of
        documents) for the term involved.
        """

        # Start each term with fresh writer state.

        self.position_writer.reset()
        self.position_index_writer.reset()

        index_offset = None
        frequency = 0
        count = 0

        # Details of the first record in the section being written.

        section_docnum = None
        section_offset = None

        doc_positions.sort()

        for docnum, positions in doc_positions:
            pos_offset = self.position_writer.write_positions(docnum, positions)

            # Retain the first record details for a subsequent index entry.

            if section_offset is None:
                section_offset = pos_offset
                section_docnum = docnum

            frequency += len(positions)
            count += 1

            # Every {interval} entries, write an index entry.

            if count % self.interval == 0:
                io = self.position_index_writer.write_positions(section_docnum, section_offset, self.interval)

                # Remember the first index entry offset.

                if index_offset is None:
                    index_offset = io

                section_offset = None
                section_docnum = None

                # Reset the position writer so that position readers accessing
                # a section start with the correct document number.

                self.position_writer.reset()

        # Finish writing an index entry for the remaining documents.

        if section_offset is not None:
            io = self.position_index_writer.write_positions(section_docnum, section_offset, count % self.interval)

            # Remember the first index entry offset.

            if index_offset is None:
                index_offset = io

        return index_offset, frequency, count

    def close(self):

        "Close both of the underlying writers."

        self.position_writer.close()
        self.position_index_writer.close()
paul@19	546
class PositionDictionaryReader:

    "Reading position dictionaries."

    def __init__(self, position_opener, position_index_opener):

        """
        Initialise the reader with the given 'position_opener' and
        'position_index_opener'.
        """

        self.position_opener = position_opener
        self.position_index_opener = position_index_opener

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator for dictionary entries starting at 'offset' with the
        given 'doc_frequency'.
        """

        iterator = PositionDictionaryIterator(
            self.position_opener, self.position_index_opener,
            offset, doc_frequency)
        return iterator

    def close(self):

        "Do nothing: the reader itself holds no open files."

        pass
paul@19	567
class PositionDictionaryIterator:

    "Iteration over position dictionary entries."

    def __init__(self, position_opener, position_index_opener, offset, doc_frequency):

        """
        Initialise the iterator using the 'position_opener' to obtain position
        records and the 'position_index_opener' to obtain index entries
        starting at 'offset' for the given 'doc_frequency'.
        """

        self.position_opener = position_opener
        self.doc_frequency = doc_frequency
        self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
        self.iterator = None

        # Remember the last values.

        self.found_docnum = None
        self.found_positions = None

        # Maintain state for the next index entry, if read.

        self.next_docnum = None
        self.next_pos_offset = None
        self.next_section_count = None

        # Initialise the current index entry and current position file iterator.

        self._next_section()
        self._init_section()

    # Sequence methods.

    def __len__(self):

        "Return the document frequency given on initialisation."

        return self.doc_frequency

    def sort(self):

        "Do nothing, permitting use of the iterator where a list is sorted."

        pass

    # Iterator methods.

    def __iter__(self):

        "Return this object as its own iterator."

        return self

    def next(self):

        """
        Attempt to get the next document record from the section in the
        positions file.
        """

        # Return any visited but unrequested record.

        if self.found_docnum is not None:
            pending = self.found_docnum, self.found_positions
            self.found_docnum = None
            self.found_positions = None
            return pending

        # Or search for the next record.

        while True:

            # Either return the next record.

            try:
                return self.iterator.next()

            # Or, where a section is finished, get the next section and try again.

            except StopIteration:

                # Where a section follows, update the index iterator, but keep
                # reading using the same file iterator (since the data should
                # just follow on from the last section). Where no section
                # follows, the StopIteration raised here ends the iteration.

                self._next_section()
                self.iterator.replenish(self.section_count)

                # Reset the state of the iterator to make sure that document
                # numbers are correct.

                self.iterator.reset()

    def from_document(self, docnum):

        """
        Attempt to navigate to a positions entry for the given 'docnum',
        returning the positions for 'docnum', or None otherwise.
        """

        # Return any unrequested document positions.

        if self.found_docnum == docnum:
            return self.found_positions

        # Read ahead in the index until the next entry refers to a document
        # later than the desired document.

        index_next = self.index_iterator.next

        try:
            if self.next_docnum is None:
                self.next_docnum, self.next_pos_offset, self.next_section_count = index_next()

            # Read until the next entry is after the desired document number,
            # or until the end of the results.

            while self.next_docnum <= docnum:
                self._next_read_section()

                # Stop once the current section starts at or after the
                # desired document.

                if not self.docnum < docnum:
                    break

                self.next_docnum, self.next_pos_offset, self.next_section_count = index_next()

        except StopIteration:
            pass

        # Navigate in the position file to the document.

        self._init_section()

        try:
            while True:
                candidate_docnum, candidate_positions = self.iterator.next()

                # Return the desired document positions or None (retaining the
                # positions for the document immediately after).

                if docnum == candidate_docnum:
                    return candidate_positions

                if docnum < candidate_docnum:
                    self.found_docnum = candidate_docnum
                    self.found_positions = candidate_positions
                    return None

        except StopIteration:
            return None

    # Internal methods.

    def _next_section(self):

        "Attempt to get the next section in the index."

        # Use any index entry already read ahead before reading another.

        if self.next_docnum is not None:
            self._next_read_section()
        else:
            self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()

    def _next_read_section(self):

        """
        Make the next index entry the current one without reading from the
        index.
        """

        self.docnum = self.next_docnum
        self.pos_offset = self.next_pos_offset
        self.section_count = self.next_section_count

        self.next_docnum = None
        self.next_pos_offset = None
        self.next_section_count = None

    def _init_section(self):

        "Initialise the iterator for the section in the position file."

        # Discard any iterator for a previous section.

        if self.iterator is not None:
            self.iterator.close()

        self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)

    def close(self):

        "Close the position and index iterators if they are still open."

        if self.iterator is not None:
            self.iterator.close()
            self.iterator = None

        if self.index_iterator is not None:
            self.index_iterator.close()
            self.index_iterator = None
paul@0	731
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Restart the term prefix and offset delta encoding."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. Return the offset after the term information was
        written to the file.
        """

        # Write the length of the prefix shared with the previous term,
        # followed by the remainder of the term.

        shared = len(commonprefix([self.last_term, term]))

        self.write_number(shared)
        self.write_string(term[shared:])

        # Write the offset delta, the frequency and the document frequency.

        self.write_number(offset - self.last_offset)
        self.write_number(frequency)
        self.write_number(doc_frequency)

        # Remember the details for the encoding of the next term.

        self.last_term = term
        self.last_offset = offset

        return self.f.tell()
paul@3	773
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Restart the term prefix and offset delta decoding."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency and its document
        frequency from the term information file.
        """

        # Combine the prefix shared with the previous term and the suffix read
        # from the file.

        shared = self.read_number()
        suffix = self.read_string()
        self.last_term = self.last_term[:shared] + suffix

        # Add the offset delta to the previous offset.

        self.last_offset += self.read_number()

        # Read the frequency followed by the document frequency.

        frequency = self.read_number()
        doc_frequency = self.read_number()

        return self.last_term, self.last_offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
        permits the scanning for later terms from the specified term.
        """

        self.f.seek(info_offset)

        # Subsequent entries are decoded relative to the given term.

        self.last_term = term
        self.last_offset = offset
paul@3	820
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Restart the term and information file offset delta encoding."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' to the term dictionary index file, along with the
        'info_offset' in the term information file.
        """

        TermWriter.write_term(self, term, offset, frequency, doc_frequency)

        # Write the information file offset delta.

        info_offset_delta = info_offset - self.last_info_offset
        self.write_number(info_offset_delta)
        self.last_info_offset = info_offset
paul@3	843
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Restart the term and information file offset delta decoding."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency, its document
        frequency and a term information file offset from the term dictionary
        index file.
        """

        details = TermReader.read_term(self)
        term, offset, frequency, doc_frequency = details

        # Add the information file offset delta to the previous offset.

        self.last_info_offset += self.read_number()

        return term, offset, frequency, doc_frequency, self.last_info_offset
paul@3	867
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Initialise the writer with the given 'info_writer', 'index_writer' and
        'position_dict_writer', writing an index entry for the first term and
        for every subsequent term at multiples of 'interval'.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval

        # The number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. Return the offset after the term information was
        written to the file, as reported by the information writer.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)

        # Also record the term in the index at the chosen interval.

        if self.entry % self.interval == 0:
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

        # Provide the offset promised by the documentation above.

        return info_offset

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        "Close all of the underlying writers."

        self.info_writer.close()
        self.index_writer.close()
        self.position_dict_writer.close()
paul@3	909
paul@3	910	class TermDictionaryReader:
paul@3	911
paul@3	912	"Reading term dictionaries."
paul@3	913
paul@22	914	def __init__(self, info_reader, index_reader, position_dict_reader):
paul@3	915	self.info_reader = info_reader
paul@3	916	self.index_reader = index_reader
paul@22	917	self.position_dict_reader = position_dict_reader
paul@3	918
paul@3	919	self.terms = []
paul@3	920	try:
paul@3	921	while 1:
paul@3	922	self.terms.append(self.index_reader.read_term())
paul@3	923	except EOFError:
paul@3	924	pass
paul@3	925
paul@3	926	# Large numbers for ordering purposes.
paul@3	927
paul@28	928	if self.terms:
paul@28	929	self.max_offset = self.terms[-1][1] + 1
paul@28	930	else:
paul@28	931	self.max_offset = None
paul@3	932
paul@25	933	def _find_closest_entry(self, term):
paul@3	934
paul@11	935	"""
paul@25	936	Find the offsets and frequencies of 'term' from the term dictionary or
paul@25	937	the closest term starting with the value of 'term'.
paul@25	938
paul@25	939	Return the closest index entry consisting of a term, the position file
paul@25	940	offset, the term frequency, the document frequency, and the term details
paul@25	941	file offset.
paul@11	942	"""
paul@3	943
paul@14	944	i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
paul@3	945
paul@3	946	# Get the entry position providing the term or one preceding it.
paul@25	947	# If no entry precedes the requested term, return the very first entry
paul@25	948	# as the closest.
paul@3	949
paul@3	950	if i == -1:
paul@25	951	return self.terms[0]
paul@25	952	else:
paul@25	953	return self.terms[i]
paul@25	954
paul@25	955	def _find_closest_term(self, term):
paul@25	956
paul@25	957	"""
paul@25	958	Find the offsets and frequencies of 'term' from the term dictionary or
paul@25	959	the closest term starting with the value of 'term'.
paul@25	960
paul@25	961	Return the closest term (or the term itself), the position file offset,
paul@25	962	the term frequency, the document frequency, and the term details file
paul@25	963	offset (or None if the reader is already positioned).
paul@25	964	"""
paul@25	965
paul@25	966	found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
paul@3	967
paul@19	968	# Where the term is found immediately, return the offset and
paul@25	969	# frequencies. If the term does not appear, return the details of the
paul@25	970	# closest entry.
paul@25	971
paul@25	972	if term <= found_term:
paul@25	973	return found_term, offset, frequency, doc_frequency, info_offset
paul@3	974
paul@3	975	# Otherwise, seek past the index term's entry in the information file
paul@3	976	# and scan for the desired term.
paul@3	977
paul@3	978	else:
paul@3	979	self.info_reader.go_to_term(found_term, offset, info_offset)
paul@3	980	try:
paul@3	981	while term > found_term:
paul@19	982	found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
paul@3	983	except EOFError:
paul@3	984	pass
paul@3	985
paul@25	986	return found_term, offset, frequency, doc_frequency, None
paul@25	987
paul@25	988	def _find_term(self, term):
paul@25	989
paul@25	990	"""
paul@25	991	Find the position file offset and frequency of 'term' from the term
paul@25	992	dictionary.
paul@25	993	"""
paul@25	994
paul@25	995	found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
paul@25	996
paul@25	997	# If the term is found, return the offset and frequencies.
paul@25	998
paul@25	999	if term == found_term:
paul@25	1000	return offset, frequency, doc_frequency
paul@25	1001	else:
paul@25	1002	return None
paul@25	1003
paul@25	1004	def _get_positions(self, offset, doc_frequency):
paul@25	1005	return self.position_dict_reader.read_term_positions(offset, doc_frequency)
paul@25	1006
paul@34	1007	# Iterator convenience methods.
paul@34	1008
paul@34	1009	def __iter__(self):
paul@34	1010	self.rewind()
paul@34	1011	return self
paul@34	1012
paul@34	1013	def next(self):
paul@34	1014	try:
paul@34	1015	return self.read_term()
paul@34	1016	except EOFError:
paul@34	1017	raise StopIteration
paul@34	1018
paul@25	1019	# Sequential access methods.
paul@3	1020
paul@12	1021	def rewind(self):
paul@12	1022	self.info_reader.rewind()
paul@12	1023
paul@12	1024	def read_term(self):
paul@12	1025
paul@12	1026	"""
paul@19	1027	Return the next term, its frequency, its document frequency, and the
paul@19	1028	documents and positions at which the term is found.
paul@12	1029	"""
paul@12	1030
paul@19	1031	term, offset, frequency, doc_frequency = self.info_reader.read_term()
paul@19	1032	positions = self._get_positions(offset, doc_frequency)
paul@19	1033	return term, frequency, doc_frequency, positions
paul@12	1034
paul@25	1035	# Query methods.
paul@25	1036
paul@25	1037	def find_terms(self, term):
paul@25	1038
paul@25	1039	"Return all terms whose values start with the value of 'term'."
paul@25	1040
paul@25	1041	terms = []
paul@25	1042
paul@25	1043	found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
paul@25	1044
paul@25	1045	# Position the reader, if necessary.
paul@25	1046
paul@25	1047	if info_offset is not None:
paul@25	1048	self.info_reader.go_to_term(found_term, offset, info_offset)
paul@25	1049
paul@25	1050	# Read and record terms.
paul@25	1051
paul@25	1052	try:
paul@25	1053	# Add the found term if it starts with the specified term.
paul@25	1054
paul@25	1055	while found_term.startswith(term):
paul@25	1056	terms.append(found_term)
paul@25	1057	found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
paul@25	1058
paul@25	1059	except EOFError:
paul@25	1060	pass
paul@25	1061
paul@25	1062	return terms
paul@25	1063
paul@5	1064	def find_positions(self, term):
paul@5	1065
paul@5	1066	"Return the documents and positions at which the given 'term' is found."
paul@5	1067
paul@11	1068	t = self._find_term(term)
paul@11	1069	if t is None:
paul@5	1070	return None
paul@5	1071	else:
paul@19	1072	offset, frequency, doc_frequency = t
paul@19	1073	return self._get_positions(offset, doc_frequency)
paul@5	1074
paul@11	1075	def get_frequency(self, term):
paul@11	1076
paul@11	1077	"Return the frequency of the given 'term'."
paul@11	1078
paul@11	1079	t = self._find_term(term)
paul@11	1080	if t is None:
paul@11	1081	return None
paul@11	1082	else:
paul@19	1083	offset, frequency, doc_frequency = t
paul@11	1084	return frequency
paul@11	1085
paul@19	1086	def get_document_frequency(self, term):
paul@19	1087
paul@19	1088	"Return the document frequency of the given 'term'."
paul@19	1089
paul@19	1090	t = self._find_term(term)
paul@19	1091	if t is None:
paul@19	1092	return None
paul@19	1093	else:
paul@19	1094	offset, frequency, doc_frequency = t
paul@19	1095	return doc_frequency
paul@19	1096
paul@3	1097	def close(self):
paul@3	1098	self.info_reader.close()
paul@3	1099	self.index_reader.close()
paul@22	1100	self.position_dict_reader.close()
paul@3	1101
paul@9	1102	# Specific classes for storing document information.
paul@9	1103
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Restart the document number delta encoding from zero."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Store the 'fields' of the document having the given 'docnum', where
        'fields' is a list of (identifier, value) pairs: integer field
        identifiers with string values. Return the file offset at which the
        stored record begins.
        """

        start = self.f.tell()

        # Each record consists of the document number delta, the number of
        # fields, and then the identifier and value of each field in turn.

        delta = docnum - self.last_docnum
        self.write_number(delta)
        self.write_number(len(fields))

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        # Remember the document number for the next delta.

        self.last_docnum = docnum
        return start
paul@8	1137
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Restart the document number delta decoding from zero."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read the next record from the file, returning a tuple containing the
        document number and a list of (identifier, value) pairs.
        """

        # The stored number is a delta relative to the previous document.

        self.last_docnum = self.last_docnum + self.read_number()

        # The number of fields precedes the fields themselves.

        count = self.read_number()
        fields = []

        for unused in range(count):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read the fields stored at the given 'offset', treating them as those of
        the document with the given 'docnum'. Since the current document number
        is set to 'docnum', subsequent calls to 'read_fields' can scan for
        later documents.
        """

        self.f.seek(offset)

        # The document number computed from the stored delta is meaningless
        # after seeking, so only the fields are retained.

        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
paul@12	1185
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Restart the document number and offset delta encoding from zero."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Record that the fields of the document with the given 'docnum' are found
        at the given 'offset' in the fields file.
        """

        # Both values are stored as differences from the previous entry.

        docnum_delta = docnum - self.last_docnum
        offset_delta = offset - self.last_offset

        self.write_number(docnum_delta)
        self.write_number(offset_delta)

        self.last_docnum, self.last_offset = docnum, offset
paul@9	1208
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Restart the document number and offset delta decoding from zero."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read the next index entry, returning a tuple containing a document
        number and the fields file offset for that document.
        """

        # Each stored value is a difference from the previous entry's value.

        self.last_docnum = self.last_docnum + self.read_number()
        self.last_offset = self.last_offset + self.read_number()

        return (self.last_docnum, self.last_offset)
paul@9	1227
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with a 'field_writer' for the field records, a
        'field_index_writer' for the index, and the 'interval' (in records)
        between index entries.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # The number of records written so far.

        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Only every 'interval' records (starting with the first) are indexed.

        if not self.entry % self.interval:
            self.field_index_writer.write_document(docnum, offset)

        self.entry = self.entry + 1

    def close(self):

        "Close the field writer and then the field index writer."

        self.field_writer.close()
        self.field_index_writer.close()
paul@9	1252
paul@9	1253	class FieldDictionaryReader:
paul@9	1254
paul@9	1255	"Reading field dictionary details."
paul@9	1256
paul@9	1257	def __init__(self, field_reader, field_index_reader):
paul@9	1258	self.field_reader = field_reader
paul@9	1259	self.field_index_reader = field_index_reader
paul@9	1260
paul@9	1261	self.docs = []
paul@9	1262	try:
paul@9	1263	while 1:
paul@9	1264	self.docs.append(self.field_index_reader.read_document())
paul@9	1265	except EOFError:
paul@9	1266	pass
paul@9	1267
paul@9	1268	# Large numbers for ordering purposes.
paul@9	1269
paul@28	1270	if self.docs:
paul@28	1271	self.max_offset = self.docs[-1][1]
paul@28	1272	else:
paul@28	1273	self.max_offset = None
paul@9	1274
paul@34	1275	# Iterator convenience methods.
paul@34	1276
paul@34	1277	def __iter__(self):
paul@34	1278	self.rewind()
paul@34	1279	return self
paul@34	1280
paul@34	1281	def next(self):
paul@34	1282	try:
paul@34	1283	return self.read_fields()
paul@34	1284	except EOFError:
paul@34	1285	raise StopIteration
paul@34	1286
paul@34	1287	# Sequential access methods.
paul@34	1288
paul@13	1289	def rewind(self):
paul@13	1290	self.field_reader.rewind()
paul@13	1291
paul@13	1292	def read_fields(self):
paul@13	1293
paul@13	1294	"Return the next document number and fields."
paul@13	1295
paul@13	1296	return self.field_reader.read_fields()
paul@13	1297
paul@34	1298	# Random access methods.
paul@34	1299
paul@13	1300	def get_fields(self, docnum):
paul@9	1301
paul@9	1302	"Read the fields of the document with the given 'docnum'."
paul@9	1303
paul@9	1304	i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
paul@9	1305
paul@9	1306	# Get the entry position providing the term or one preceding it.
paul@9	1307
paul@9	1308	if i == -1:
paul@9	1309	return None
paul@9	1310
paul@9	1311	found_docnum, offset = self.docs[i]
paul@9	1312
paul@9	1313	# Read from the fields file.
paul@9	1314
paul@9	1315	found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
paul@9	1316
paul@9	1317	# Scan for the document, if necessary.
paul@9	1318
paul@9	1319	try:
paul@9	1320	while docnum > found_docnum:
paul@9	1321	found_docnum, fields = self.field_reader.read_fields()
paul@9	1322	except EOFError:
paul@9	1323	pass
paul@9	1324
paul@9	1325	# If the document is found, return the fields.
paul@9	1326
paul@9	1327	if docnum == found_docnum:
paul@9	1328	return fields
paul@9	1329	else:
paul@9	1330	return None
paul@9	1331
paul@9	1332	def close(self):
paul@9	1333	self.field_reader.close()
paul@9	1334	self.field_index_reader.close()
paul@8	1335
paul@12	1336	# Dictionary merging classes.
paul@12	1337
class Merger:

    "Merge files."

    def __init__(self, writer, readers):

        """
        Initialise the merger with the 'writer' receiving merged data and the
        'readers' providing the data to be merged.
        """

        self.readers = readers
        self.writer = writer

    def close(self):

        "Close each of the readers in turn and then the writer."

        for source in self.readers:
            source.close()

        self.writer.close()
paul@13	1350
class TermDictionaryMerger(Merger):

    "Merge term and position files."

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the writer.

        Consecutive records for the same term have their positions merged and
        written as a single term.
        """

        pending_term = None
        pending_positions = []

        for record in itermerge(self.readers):
            term, frequency, doc_frequency, positions = record

            if term != pending_term:

                # A different term has been encountered: write out whatever
                # was collected for the previous one and start afresh.

                if pending_positions:
                    self.writer.write_term_positions(pending_term, itermerge(pending_positions))

                pending_term = term
                pending_positions = [positions]

            else:
                pending_positions.append(positions)

        # Write out the positions collected for the final term.

        if pending_positions:
            self.writer.write_term_positions(pending_term, itermerge(pending_positions))
paul@12	1375
class FieldDictionaryMerger(Merger):

    "Merge field files."

    def merge(self):

        """
        Merge fields from the readers, sending them to the writer: each
        (document number, fields) record produced by merging the readers is
        written in the order in which it is produced.
        """

        for record in itermerge(self.readers):
            docnum, fields = record
            self.writer.write_fields(docnum, fields)
paul@13	1388
paul@13	1389	# Utility functions.
paul@13	1390
def get_term_writer(pathname, partition, interval, doc_interval):

    """
    Return a term dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval' for terms and 'doc_interval' for document position records.

    Four files are opened for binary writing: the terms file, the terms index
    file, the positions file and the positions index file.
    """

    info_writer = TermWriter(
        open(join(pathname, "terms-%s" % partition), "wb"))

    index_writer = TermIndexWriter(
        open(join(pathname, "terms_index-%s" % partition), "wb"))

    positions_writer = PositionWriter(
        open(join(pathname, "positions-%s" % partition), "wb"))

    positions_index_writer = PositionIndexWriter(
        open(join(pathname, "positions_index-%s" % partition), "wb"))

    # The position writers are combined before being given to the term
    # dictionary writer.

    positions_dict_writer = PositionDictionaryWriter(
        positions_writer, positions_index_writer, doc_interval)

    return TermDictionaryWriter(
        info_writer, index_writer, positions_dict_writer, interval)
paul@13	1414
def get_field_writer(pathname, partition, interval):

    """
    Return a field dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.

    The fields file and the fields index file are opened for binary writing.
    """

    field_writer = FieldWriter(
        open(join(pathname, "fields-%s" % partition), "wb"))

    field_index_writer = FieldIndexWriter(
        open(join(pathname, "fields_index-%s" % partition), "wb"))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
paul@13	1430
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.

    The terms file and the terms index file are opened for binary reading,
    whereas the positions files are only named here and passed to opener
    objects.
    """

    info_reader = TermReader(
        open(join(pathname, "terms-%s" % partition), "rb"))

    index_reader = TermIndexReader(
        open(join(pathname, "terms_index-%s" % partition), "rb"))

    positions_opener = PositionOpener(
        join(pathname, "positions-%s" % partition))

    positions_index_opener = PositionIndexOpener(
        join(pathname, "positions_index-%s" % partition))

    positions_dict_reader = PositionDictionaryReader(
        positions_opener, positions_index_opener)

    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
paul@14	1450
def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.

    The fields file and the fields index file are opened for binary reading.
    """

    field_reader = FieldReader(
        open(join(pathname, "fields-%s" % partition), "rb"))

    field_index_reader = FieldIndexReader(
        open(join(pathname, "fields_index-%s" % partition), "rb"))

    return FieldDictionaryReader(field_reader, field_index_reader)
paul@14	1465
def rename_files(pathname, names, from_partition, to_partition):

    """
    Rename the files under 'pathname' having the given base 'names' so that
    they are labelled with 'to_partition' instead of 'from_partition'.
    """

    for name in names:
        source = join(pathname, "%s-%s" % (name, from_partition))
        target = join(pathname, "%s-%s" % (name, to_partition))
        rename(source, target)
paul@14	1469
def rename_term_files(pathname, from_partition, to_partition):

    """
    Rename the files named in TERM_FILENAMES under 'pathname', relabelling
    them from 'from_partition' to 'to_partition'.
    """

    names = TERM_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
paul@14	1472
def rename_field_files(pathname, from_partition, to_partition):

    """
    Rename the files named in FIELD_FILENAMES under 'pathname', relabelling
    them from 'from_partition' to 'to_partition'.
    """

    names = FIELD_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
paul@14	1475
def remove_files(pathname, names, partition):

    """
    Remove the files under 'pathname' having the given base 'names' and
    labelled with the given 'partition'.
    """

    for name in names:
        filename = "%s-%s" % (name, partition)
        remove(join(pathname, filename))
paul@14	1479
def remove_term_files(pathname, partition):

    """
    Remove the files named in TERM_FILENAMES under 'pathname' which are
    labelled with the given 'partition'.
    """

    names = TERM_FILENAMES
    remove_files(pathname, names, partition)
paul@14	1482
def remove_field_files(pathname, partition):

    """
    Remove the files named in FIELD_FILENAMES under 'pathname' which are
    labelled with the given 'partition'.
    """

    names = FIELD_FILENAMES
    remove_files(pathname, names, partition)
paul@14	1485
paul@8	1486	# High-level classes.
paul@8	1487
class Document:

    "A container of document information."

    def __init__(self, docnum):

        """
        Initialise the document with the given 'docnum', having no fields and
        no terms.
        """

        self.docnum = docnum

        # A list of (identifier, value) pairs.

        self.fields = []

        # A mapping from terms to lists of positions.

        self.terms = {}

    def add_position(self, term, position):

        """
        Record the given 'position' as one at which the given 'term' occurs in
        the document.
        """

        try:
            positions = self.terms[term]
        except KeyError:
            positions = self.terms[term] = []

        positions.append(position)

    def add_field(self, identifier, value):

        """
        Add a field having the given 'identifier' and 'value', with the value
        being stored in its Unicode string form.
        """

        entry = (identifier, unicode(value)) # convert to string
        self.fields.append(entry)

    def set_fields(self, fields):

        """
        Replace the document's fields with the given 'fields': a list of tuples
        each containing an integer identifier and a string value. The list is
        stored as given and is not copied.
        """

        self.fields = fields
paul@28	1520
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Initialise the writer for the index found at the given 'pathname',
        using the indexing 'interval' and 'doc_interval' when creating
        dictionary writers, and flushing accumulated information after every
        'flush_interval' documents (or never automatically if this is zero or
        None).
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # The partitions to be written upon the next flush.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Accumulated information: terms map to dictionaries mapping document
        # numbers to positions; document numbers map to fields.

        self.terms = {}
        self.docs = {}

        # The number of documents added since the last automatic flush.

        self.doc_counter = 0

    def add_document(self, doc):

        """
        Add the given document 'doc', updating the document counter and flushing
        terms and fields if appropriate.
        """

        for term, positions in doc.terms.items():
            if term not in self.terms:
                self.terms[term] = {}
            self.terms[term][doc.docnum] = positions

        self.docs[doc.docnum] = doc.fields

        self.doc_counter = self.doc_counter + 1

        # Without a flush interval, flushing is left to the caller.

        if not self.flush_interval:
            return

        if self.doc_counter >= self.flush_interval:
            self.flush_terms()
            self.flush_fields()
            self.doc_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(
            self.pathname, self.dict_partition, self.interval, self.doc_interval)

    def get_field_writer(self):

        """
        Return a field dictionary writer for the current partition. The writer
        is given the same indexing interval as is used for terms.
        """

        return get_field_writer(
            self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        pending = self.terms

        # Terms are written in sorted order.

        ordered = list(pending.keys())
        ordered.sort()

        writer = self.get_term_writer()

        # The document positions of each term are supplied in whatever order
        # the dictionary provides them.

        for term in ordered:
            writer.write_term_positions(term, list(pending[term].items()))

        writer.close()

        # Start afresh with the next partition.

        self.terms = {}
        self.dict_partition = self.dict_partition + 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        # Documents are written in document number order.

        ordered = list(self.docs.items())
        ordered.sort()

        writer = self.get_field_writer()

        for docnum, fields in ordered:
            writer.write_fields(docnum, fields)

        writer.close()

        # Start afresh with the next partition.

        self.docs = {}
        self.field_dict_partition = self.field_dict_partition + 1

    def close(self):

        "Flush any remaining terms and fields."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
paul@10	1616
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        """
        Initialise the reader using the term and field dictionary files of the
        "merged" partition found under the given 'pathname'.
        """

        partition = "merged"
        self.dict_reader = get_term_reader(pathname, partition)
        self.field_dict_reader = get_field_reader(pathname, partition)

    # Term dictionary queries.

    def find_terms(self, term):

        "Return the result of asking the term dictionary for terms matching 'term'."

        reader = self.dict_reader
        return reader.find_terms(term)

    def find_positions(self, term):

        "Return the result of asking the term dictionary for the positions of 'term'."

        reader = self.dict_reader
        return reader.find_positions(term)

    def get_frequency(self, term):

        "Return the result of asking the term dictionary for the frequency of 'term'."

        reader = self.dict_reader
        return reader.get_frequency(term)

    def get_document_frequency(self, term):

        """
        Return the result of asking the term dictionary for the document
        frequency of 'term'.
        """

        reader = self.dict_reader
        return reader.get_document_frequency(term)

    # Field dictionary queries.

    def get_fields(self, docnum):

        "Return the result of asking the field dictionary for the fields of 'docnum'."

        reader = self.field_dict_reader
        return reader.get_fields(docnum)

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        self.dict_reader.close()
        self.field_dict_reader.close()
paul@10	1643
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        """
        Initialise the index for the directory with the given 'pathname'. No
        reader or writer is created until requested.
        """

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'. The index directory is created if
        it does not already exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, merging any existing partitions first.
        The 'partition' argument is passed on but the reader obtained always
        uses the "merged" partition.
        """

        # Ensure that only one partition exists.

        self.merge()
        return self._get_reader(partition)

    def _get_reader(self, partition):

        """
        Return a reader for the index. OSError is raised if the index directory
        does not exist.
        """

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname)
        return self.reader

    def merge(self):

        "Merge/optimise index partitions."

        self.merge_terms()
        self.merge_fields()

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval'.

        Afterwards, any term files present are labelled as the "merged"
        partition.
        """

        readers = []
        partitions = set()

        # Discover partitions from the names of the terms files.

        for filename in listdir(self.pathname):
            if filename.startswith("terms-"): # 6 character prefix
                partition = filename[6:]
                readers.append(get_term_reader(self.pathname, partition))
                partitions.add(partition)

        # Write directly to a dictionary.

        if len(readers) > 1:

            # Move any existing merged partition out of the way of the output.

            if "merged" in partitions:
                rename_term_files(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                partitions.add("old-merged")

            writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
            merger = TermDictionaryMerger(writer, readers)
            merger.merge()
            merger.close()

            # Remove old files.

            for partition in partitions:
                remove_term_files(self.pathname, partition)

        elif len(readers) == 1:

            # No merging is needed, but the reader opened during discovery
            # must still be closed: otherwise its files stay open, and they
            # would be renamed while open.

            readers[0].close()

            partition = list(partitions)[0]
            if partition != "merged":
                rename_term_files(self.pathname, partition, "merged")

    def merge_fields(self, interval=FIELD_INTERVAL):

        """
        Merge field dictionaries using the given indexing 'interval'.

        Afterwards, any field files present are labelled as the "merged"
        partition.
        """

        readers = []
        partitions = set()

        # Discover partitions from the names of the fields files.

        for filename in listdir(self.pathname):
            if filename.startswith("fields-"): # 7 character prefix
                partition = filename[7:]
                readers.append(get_field_reader(self.pathname, partition))
                partitions.add(partition)

        # Write directly to a dictionary.

        if len(readers) > 1:

            # Move any existing merged partition out of the way of the output.

            if "merged" in partitions:
                rename_field_files(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                partitions.add("old-merged")

            writer = get_field_writer(self.pathname, "merged", interval)
            merger = FieldDictionaryMerger(writer, readers)
            merger.merge()
            merger.close()

            # Remove old files.

            for partition in partitions:
                remove_field_files(self.pathname, partition)

        elif len(readers) == 1:

            # No merging is needed, but the reader opened during discovery
            # must still be closed: otherwise its files stay open, and they
            # would be renamed while open.

            readers[0].close()

            partition = list(partitions)[0]
            if partition != "merged":
                rename_field_files(self.pathname, partition, "merged")

    def close(self):

        "Close and discard any reader and writer obtained from this index."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	1774
paul@0	1775	# vim: tabstop=4 expandtab shiftwidth=4