iixr (annotate iixr.py in 0cd36671c34e)

iixr

Annotated iixr.py

41:0cd36671c34e

2009-09-12

Paul Boddie

Made the seek method slightly more efficient at reusing cached data.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@19	21	from os import dup, fdopen # independent iterator access to files
paul@12	22	from os import listdir, mkdir # index and partition discovery
paul@14	23	from os import remove, rename # partition manipulation
paul@7	24	from os.path import exists, join
paul@2	25	from os.path import commonprefix # to find common string prefixes
paul@3	26	from bisect import bisect_right # to find terms in the dictionary index
paul@10	27	import bz2, zlib # for field compression
paul@33	28	from itermerge import itermerge
paul@2	29
# Compatibility: very old Python versions lack the built-in set type, so fall
# back to the equivalent class from the sets module.

try:
    set
except NameError:
    from sets import Set as set

# Constants.

# Default intervals (counted in entries) for the various kinds of data.
# NOTE(review): presumably these are passed as the 'interval' arguments of the
# dictionary writers - confirm against the callers.

TERM_INTERVAL = 100
DOCUMENT_INTERVAL = 100
FIELD_INTERVAL = 100
FLUSH_INTERVAL = 10000

# Names of the files making up the term and field data.

TERM_FILENAMES = ("terms", "terms_index", "positions", "positions_index")
FIELD_FILENAMES = ("fields", "fields_index")

# Compression functions in order of preference, each paired with the one
# character flag identifying it, and the matching decompression functions keyed
# by that flag.

compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress),
    ]

decompressors = {
    "b" : bz2.decompress,
    "z" : zlib.decompress,
    }
paul@10	47
paul@35	48	# Utility functions.
paul@35	49
try:
    from vint import vint as _vint

    def vint(number):

        """
        Return 'number' encoded as a variable-length integer, using the
        optional 'vint' extension module to do the encoding.

        A ValueError is raised for negative numbers.
        """

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        return _vint(number)

except ImportError:

    def vint(number):

        """
        Return 'number' encoded as a variable-length integer: a string of
        characters each carrying seven bits of the number, least significant
        group first, with the top bit set on every character except the last.

        A ValueError is raised for negative numbers.
        """

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        encoded = []

        # Always emit at least one character so that zero is representable.

        while 1:
            group = number & 127
            number >>= 7

            # The final group is written without the continuation bit.

            if number == 0:
                encoded.append(chr(group))
                return "".join(encoded)

            encoded.append(chr(group | 128))
paul@35	92
paul@0	93	# Foundation classes.
paul@0	94
class File:

    "A basic file abstraction."

    def __init__(self, f):

        "Initialise the abstraction around the file object 'f'."

        self.f = f
        self.reset()

    def reset(self):

        "To be used to reset the state of the reader or writer between records."

        pass

    def rewind(self):

        "Seek to the start of the file and reset any record state."

        self.seek(0)
        self.reset()

    def seek(self, offset):

        "To be defined by readers."

        pass

    def flush(self):

        "To be defined by writers."

        pass

    def close(self):

        """
        Flush any pending data and close the underlying file. Calling this
        method again after the file has been closed does nothing.
        """

        if self.f is not None:

            # Release the file even if flushing fails so that the handle is
            # never leaked.

            try:
                self.flush()
            finally:
                self.f.close()
                self.f = None

class FileWriter(File):

    "Writing basic data types to files."

    # The amount of cached data at which the cache is written to the file.

    cache_limit = 1000

    def __init__(self, f):

        "Initialise the writer around the file object 'f' with an empty cache."

        File.__init__(self, f)
        self.cache = []
        self.cache_length = 0

    def write_number(self, number):

        "Write 'number' to the file using a variable length encoding."

        self.write(vint(number))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file, recording its length and compressing the string
        if 'compress' is set to a true value.

        When compression is requested, a one character flag precedes the
        length: the flag of the compressor used, or "-" if no compressor made
        the string shorter. Without compression, no flag is written at all.
        """

        # Convert Unicode objects to strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        # Compress the string if requested.

        if compress:
            for flag, fn in compressors:
                cs = fn(s)

                # Take the first string shorter than the original.

                if len(cs) < len(s):
                    s = cs
                    break
            else:
                flag = "-"
        else:
            flag = ""

        # Write the length of the data before the data itself.

        self.write(flag + vint(len(s)) + s)

    # Cache-affected methods.

    def write(self, s):

        "Add 's' to the cache, flushing once enough data has accumulated."

        self.cache.append(s)
        self.cache_length += len(s)
        if self.cache_length >= self.cache_limit:
            self.flush()

    def tell(self):

        "Return the file position that the next written data will occupy."

        return self.f.tell() + self.cache_length

    def flush(self):

        "Write any cached data to the underlying file."

        if self.cache:
            self.f.write("".join(self.cache))
            self.cache = []
            self.cache_length = 0

class FileReader(File):

    "Reading basic data types from files."

    # The minimum amount of data requested from the file when the cache needs
    # replenishing.

    read_size = 1000

    def __init__(self, f):

        "Initialise the reader around the file object 'f' with an empty cache."

        File.__init__(self, f)
        self.cache = ""
        self.cache_length = 0

    def read_number(self):

        """
        Read a variable-length number from the file, raising EOFError if the
        data ends before the number is complete.
        """

        read = self.read
        shift = 0
        number = 0

        # Read each byte, adding its seven bits of payload to the number, until
        # a byte without the continuation bit is found.

        while 1:
            c = read(1)
            if not c:
                raise EOFError

            csd = ord(c)
            if not csd & 128:
                return number + (csd << shift)

            number += ((csd & 127) << shift)
            shift += 7

    def read_string(self, decompress=0):

        """
        Read a string from the file, decompressing the stored data if
        'decompress' is set to a true value. The result is a Unicode object
        decoded from UTF-8.
        """

        # A compression flag is only stored where compression was requested.

        if decompress:
            flag = self.read(1)
        else:
            flag = "-"

        length = self.read_number()
        s = self.read(length)

        # Perform decompression if applicable.

        if flag != "-":
            fn = decompressors[flag]
            s = fn(s)

        # Convert strings to Unicode objects.

        return unicode(s, "utf-8")

    # Cache-affected methods.

    def read(self, n):

        """
        Return up to 'n' characters, taking them from the cache and reading
        more from the file only if the cache cannot satisfy the request. Fewer
        characters are returned only at the end of the file.
        """

        needed = n - self.cache_length
        if needed > 0:
            self.cache += self.f.read(max(needed, self.read_size))

        s = self.cache[:n]
        self.cache = self.cache[n:]
        self.cache_length = len(self.cache)
        return s

    def tell(self):

        "Return the position in the file of the next character to be read."

        return self.f.tell() - self.cache_length

    def seek(self, offset):

        """
        Make 'offset' the position of the next character to be read, reusing
        cached data where possible.
        """

        discarded = offset - self.tell()

        # Where the offset lies within the cached data, drop the data before it
        # and leave the file itself alone: the file position must remain at the
        # end of the cached data for later reads and 'tell' to be correct.

        if 0 <= discarded <= self.cache_length:
            self.cache = self.cache[discarded:]
            self.cache_length = len(self.cache)

        # Otherwise, the cached data is of no use.

        else:
            self.f.seek(offset)
            self.cache = ""
            self.cache_length = 0
paul@40	282
class FileOpener:

    "Opening files using their filenames."

    def __init__(self, filename):

        "Remember the 'filename' of the file to be opened on demand."

        self.filename = filename

    def open(self, mode):

        "Return a newly opened file object for the file using 'mode'."

        f = open(self.filename, mode)
        return f

    def close(self):

        "Do nothing: the opener itself holds no open file."

        pass
paul@34	295
paul@9	296	# Specific classes for storing term and position information.
paul@0	297
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number so that the next one is written in full."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write the 'positions' for the document 'docnum' and return the offset
        at which the record starts.

        A record consists of the document number (as a delta from the
        previously written document number), the number of positions, and the
        positions themselves (each as a delta from its predecessor). Note that
        'positions' is sorted in place.

        A ValueError is raised if 'docnum' is less than the previously written
        document number.
        """

        previous_docnum = self.last_docnum

        if docnum < previous_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, previous_docnum))

        # The record starts wherever the next data will be written.

        offset = self.tell()

        # Deltas are only meaningful for sorted positions.

        positions.sort()

        deltas = []
        previous = 0

        for position in positions:
            deltas.append(vint(position - previous))
            previous = position

        # Write the whole record at once: document number delta, number of
        # positions, then the position deltas.

        header = vint(docnum - previous_docnum) + vint(len(positions))
        self.write(header + "".join(deltas))

        self.last_docnum = docnum
        return offset
paul@0	340
class PositionOpener(FileOpener):

    "Reading position information from files."

    def read_term_positions(self, offset, count):

        """
        Return an iterator over the position records starting at 'offset' in
        the file. The number of documents available for reading is limited to
        'count'.
        """

        # Open the file afresh so that the iterator has a file object of its
        # own, positioned independently of any other reader.

        return PositionIterator(self.open("rb"), offset, count)
paul@19	357
class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        "Forget the last document number and position file offset written."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum', 'pos_offset' and document 'count' to the
        position index file, returning the offset at which the entry starts.

        The document number and position file offset are stored as deltas from
        the values last written; the count is stored as given.
        """

        # The entry starts wherever the next data will be written.

        offset = self.tell()

        # Encode each delta and remember the new value immediately afterwards.

        docnum_delta = vint(docnum - self.last_docnum)
        self.last_docnum = docnum

        pos_offset_delta = vint(pos_offset - self.last_pos_offset)
        self.last_pos_offset = pos_offset

        # Write the complete entry in one go.

        self.write(docnum_delta + pos_offset_delta + vint(count))

        return offset
paul@19	397
class PositionIndexOpener(FileOpener):

    "Reading position index information from files."

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator over the index entries starting at 'offset' in the
        file. The number of documents covered by the entries read is limited
        to 'doc_frequency'.
        """

        # Open the file afresh so that the iterator has a file object of its
        # own, positioned independently of any other reader.

        return PositionIndexIterator(self.open("rb"), offset, doc_frequency)
paul@34	414
paul@34	415	# Iterators for position-related files.
paul@34	416
class IteratorBase:

    "Common support for iterators limited to a number of documents."

    def __init__(self, count):

        "Initialise the iterator to provide 'count' documents."

        self.replenish(count)

    def replenish(self, count):

        "Permit a further 'count' documents to be read, starting afresh."

        self.read_documents = 0
        self.count = count

    def __len__(self):

        "Return the number of documents the iterator was last given to provide."

        return self.count

    def sort(self):

        "Do nothing: stored document positions are already sorted."

        pass

    def __iter__(self):

        "Return this object as its own iterator."

        return self
paul@34	434
class PositionIterator(FileReader, IteratorBase):

    "Iterating over document positions."

    def __init__(self, f, offset, count):

        """
        Initialise the iterator to read up to 'count' documents from the file
        object 'f', starting at 'offset'.
        """

        FileReader.__init__(self, f)
        IteratorBase.__init__(self, count)
        self.seek(offset)

    def reset(self):

        "Forget the last document number so that the next one is read in full."

        self.last_docnum = 0

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        read_number = self.read_number

        # The document number is stored as a delta from the previous number.

        self.last_docnum += read_number()

        # The number of positions precedes the positions themselves.

        npositions = read_number()

        # Each position is stored as a delta from its predecessor, so keep a
        # running total to recover the absolute positions.

        positions = []
        position = 0

        while len(positions) < npositions:
            position += read_number()
            positions.append(position)

        return self.last_docnum, positions

    def next(self):

        """
        Read positions for a single document, raising StopIteration once the
        permitted number of documents has been read.
        """

        if self.read_documents >= self.count:
            raise StopIteration

        self.read_documents += 1
        return self.read_positions()
paul@34	482
class PositionIndexIterator(FileReader, IteratorBase):

    "Iterating over position index entries."

    def __init__(self, f, offset, count):

        """
        Initialise the iterator to read index entries from the file object 'f',
        starting at 'offset', until the entries read account for 'count'
        documents.
        """

        FileReader.__init__(self, f)
        IteratorBase.__init__(self, count)
        self.seek(offset)

        # No section has been read yet, so none contributes any documents.

        self.section_count = 0

    def reset(self):

        "Forget the last document number and position file offset read."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read a document number, an offset into the position file, and the
        number of documents in the section of that file starting at the offset.
        """

        read_number = self.read_number

        # Both the document number and the offset are stored as deltas.

        self.last_docnum += read_number()
        self.last_pos_offset += read_number()

        # The document count is stored as it is.

        count = read_number()

        return self.last_docnum, self.last_pos_offset, count

    def next(self):

        """
        Read the next index entry, raising StopIteration once the entries
        already read account for all of the permitted documents.
        """

        # Account for the documents covered by the previously returned entry.

        self.read_documents += self.section_count

        if self.read_documents >= self.count:
            raise StopIteration

        entry = self.read_positions()
        self.section_count = entry[2]
        return entry
paul@19	528
class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Initialise the writer with the given 'position_writer' and
        'position_index_writer', producing one index entry for every 'interval'
        documents written for a term.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the position file, sorting the collection
        in place beforehand.

        An index entry is written for each section of 'interval' documents (and
        for any smaller final section), recording the section's first document
        number, the offset of its first record and its document count.

        Return a tuple containing the offset of the first index entry written
        (None if there were no documents), the frequency (number of positions),
        and the document frequency (number of documents) for the term involved.
        """

        writer = self.position_writer
        index_writer = self.position_index_writer
        interval = self.interval

        # Each term starts with fresh delta state in both files.

        writer.reset()
        index_writer.reset()

        index_offset = None
        frequency = 0
        count = 0

        # The first (document number, offset) of the section being written, or
        # None when no section is in progress.

        section_start = None

        doc_positions.sort()

        for docnum, positions in doc_positions:
            pos_offset = writer.write_positions(docnum, positions)

            if section_start is None:
                section_start = docnum, pos_offset

            frequency += len(positions)
            count += 1

            # Complete a section after every {interval} documents.

            if count % interval == 0:
                entry_offset = index_writer.write_positions(section_start[0], section_start[1], interval)

                if index_offset is None:
                    index_offset = entry_offset

                section_start = None

                # Reset the position writer so that position readers accessing
                # a section start with the correct document number.

                writer.reset()

        # Write an index entry for any remaining, incomplete section.

        if section_start is not None:
            entry_offset = index_writer.write_positions(section_start[0], section_start[1], count % interval)

            if index_offset is None:
                index_offset = entry_offset

        return index_offset, frequency, count

    def close(self):

        "Close the position writer and the position index writer."

        self.position_writer.close()
        self.position_index_writer.close()
paul@19	613
class PositionDictionaryReader:

    "Reading position dictionaries."

    def __init__(self, position_opener, position_index_opener):

        """
        Initialise the reader with the 'position_opener' and
        'position_index_opener' used to open the files for each iterator.
        """

        self.position_opener = position_opener
        self.position_index_opener = position_index_opener

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator for dictionary entries starting at 'offset' with the
        given 'doc_frequency'.
        """

        iterator = PositionDictionaryIterator(
            self.position_opener, self.position_index_opener,
            offset, doc_frequency)
        return iterator

    def close(self):

        "Do nothing: the reader itself holds no open files."

        pass
paul@19	634
class PositionDictionaryIterator:

    "Iteration over position dictionary entries."

    def __init__(self, position_opener, position_index_opener, offset, doc_frequency):

        """
        Initialise the iterator for a term whose index entries start at
        'offset' in the position index file and which appears in
        'doc_frequency' documents. The 'position_index_opener' provides the
        index entries; the 'position_opener' provides the position records of
        each section.
        """

        self.position_opener = position_opener
        self.doc_frequency = doc_frequency
        self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
        self.iterator = None

        # A record read by from_document but not yet handed out.

        self.found_docnum = self.found_positions = None

        # An index entry read ahead of the current one, if any.

        self.next_docnum = self.next_pos_offset = self.next_section_count = None

        # Make the first index entry current and open its section.

        self._next_section()
        self._init_section()

    # Sequence methods.

    def __len__(self):

        "Return the number of documents for the term."

        return self.doc_frequency

    def sort(self):

        "Do nothing: no sorting is performed on stored entries."

        pass

    # Iterator methods.

    def __iter__(self):

        "Return this object as its own iterator."

        return self

    def next(self):

        """
        Return the next (document number, positions) record, moving on to the
        next section of the positions file where the current one is exhausted.
        StopIteration is raised when no sections remain.
        """

        # Hand out any record visited by from_document but not yet requested.

        if self.found_docnum is not None:
            record = self.found_docnum, self.found_positions
            self.found_docnum = self.found_positions = None
            return record

        while 1:
            try:
                return self.iterator.next()

            except StopIteration:

                # The section is finished: make the next index entry current
                # (ending the iteration if there is none), but keep reading
                # with the same file iterator since the data should just follow
                # on from the last section.

                self._next_section()
                self.iterator.replenish(self.section_count)

                # Document numbers start afresh in each section.

                self.iterator.reset()

    def from_document(self, docnum):

        """
        Attempt to navigate to a positions entry for the given 'docnum',
        returning the positions for 'docnum', or None otherwise.
        """

        # Return any unrequested document positions.

        if docnum == self.found_docnum:
            return self.found_positions

        # Advance through the index until the entry read ahead refers to a
        # document later than the desired one, or until the index is exhausted.

        try:
            if self.next_docnum is None:
                self._read_ahead()

            while self.next_docnum <= docnum:
                self._next_read_section()

                # Stop once the current section starts at or after the desired
                # document; otherwise, look at the following entry.

                if self.docnum >= docnum:
                    break

                self._read_ahead()

        except StopIteration:
            pass

        # Open the current section in the position file and scan it.

        self._init_section()

        try:
            while 1:
                found_docnum, found_positions = self.iterator.next()

                if found_docnum == docnum:
                    return found_positions

                # Having passed the desired document, retain the record found
                # so that it is not lost to later requests.

                if found_docnum > docnum:
                    self.found_docnum = found_docnum
                    self.found_positions = found_positions
                    return None

        except StopIteration:
            return None

    # Internal methods.

    def _read_ahead(self):

        "Read the next index entry without making it the current one."

        self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()

    def _next_section(self):

        "Attempt to get the next section in the index."

        if self.next_docnum is not None:
            self._next_read_section()
        else:
            self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()

    def _next_read_section(self):

        """
        Make the next index entry the current one without reading from the
        index.
        """

        self.docnum = self.next_docnum
        self.pos_offset = self.next_pos_offset
        self.section_count = self.next_section_count
        self.next_docnum = self.next_pos_offset = self.next_section_count = None

    def _init_section(self):

        "Initialise the iterator for the section in the position file."

        if self.iterator is not None:
            self.iterator.close()
        self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)

    def close(self):

        "Close the position and index iterators, if still open."

        if self.iterator is not None:
            self.iterator.close()
            self.iterator = None
        if self.index_iterator is not None:
            self.index_iterator.close()
            self.index_iterator = None
paul@0	798
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Forget the last term and position file offset written."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. Return the offset after the term information was
        written to the file.

        The term is stored as the length of the prefix it shares with the
        previously written term followed by the remaining suffix; the offset is
        stored as a delta from the previously written offset.
        """

        # Work out how much of the term repeats the previous one.

        shared = len(commonprefix([self.last_term, term]))

        self.write_number(shared)
        self.write_string(term[shared:])

        # Write the offset delta, then the two frequencies as they are.

        self.write_number(offset - self.last_offset)
        self.write_number(frequency)
        self.write_number(doc_frequency)

        # Remember the details for the next term's deltas.

        self.last_term = term
        self.last_offset = offset

        return self.tell()
paul@3	840
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Forget the last term and position file offset read."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency and its document
        frequency from the term information file, returning them as a tuple.
        """

        # The term is stored as the length of the prefix shared with the
        # previous term, followed by the remaining suffix.

        shared = self.read_number()
        suffix = self.read_string()
        self.last_term = self.last_term[:shared] + suffix

        # The offset is stored as a delta from the previous offset.

        self.last_offset += self.read_number()

        # The two frequencies are stored as they are.

        frequency = self.read_number()
        doc_frequency = self.read_number()

        return self.last_term, self.last_offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
        permits the scanning for later terms from the specified term.
        """

        self.seek(info_offset)

        # Later entries are stored relative to the given term and offset.

        self.last_term = term
        self.last_offset = offset
paul@3	887
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Forget the last term, position file offset and information offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' to the term dictionary index file, along with the
        'info_offset' in the term information file.
        """

        TermWriter.write_term(self, term, offset, frequency, doc_frequency)

        # The information file offset follows as a delta from the last one.

        delta = info_offset - self.last_info_offset
        self.write_number(delta)
        self.last_info_offset = info_offset
paul@3	910
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Forget the last term, position file offset and information offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency, its document
        frequency and a term information file offset from the term dictionary
        index file, returning them as a tuple.
        """

        details = TermReader.read_term(self)
        term, offset, frequency, doc_frequency = details

        # The information file offset is stored as a delta from the last one.

        self.last_info_offset += self.read_number()

        return term, offset, frequency, doc_frequency, self.last_info_offset
paul@3	934
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Initialise the writer with the given 'info_writer', 'index_writer' and
        'position_dict_writer', adding one term in every 'interval' terms to
        the index.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval

        # The number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file, also adding the term to the index file if it is
        the first of an interval. Nothing is returned.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)

        # The index records where the information following this term starts.

        starts_interval = self.entry % self.interval == 0

        if starts_interval:
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        details = self.position_dict_writer.write_term_positions(doc_positions)
        offset, frequency, doc_frequency = details
        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        "Close the information, index and position dictionary writers."

        self.info_writer.close()
        self.index_writer.close()
        self.position_dict_writer.close()
paul@3	976
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader, position_dict_reader):

        """
        Initialise the reader with the given 'info_reader' (term information
        file), 'index_reader' (term index file) and 'position_dict_reader'
        (position files). The complete term index is read into memory.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_dict_reader = position_dict_reader

        # Each index entry is a tuple of the form
        # (term, offset, frequency, doc_frequency, info_offset).

        self.terms = []
        try:
            while 1:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # Large numbers for ordering purposes: a probe tuple using this value
        # is intended to sort after any index entry having the same term.
        # NOTE(review): this assumes that the last index entry has the largest
        # position file offset.

        if self.terms:
            self.max_offset = self.terms[-1][1] + 1
        else:
            self.max_offset = None

    def _find_closest_entry(self, term):

        """
        Find the offsets and frequencies of 'term' from the term dictionary or
        the closest term starting with the value of 'term'.

        Return the closest index entry consisting of a term, the position file
        offset, the term frequency, the document frequency, and the term details
        file offset. Return None if the index is empty.
        """

        # An empty index offers no entry at all: indexing self.terms would
        # otherwise raise IndexError.

        if not self.terms:
            return None

        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1

        # Get the entry position providing the term or one preceding it.
        # If no entry precedes the requested term, return the very first entry
        # as the closest.

        if i == -1:
            return self.terms[0]
        else:
            return self.terms[i]

    def _find_closest_term(self, term):

        """
        Find the offsets and frequencies of 'term' from the term dictionary or
        the closest term starting with the value of 'term'.

        Return the closest term (or the term itself), the position file offset,
        the term frequency, the document frequency, and the term details file
        offset (or None if the reader is already positioned). Return None
        instead of such a tuple if the dictionary index is empty.
        """

        entry = self._find_closest_entry(term)

        if entry is None:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = entry

        # Where the term is found immediately, return the offset and
        # frequencies. If the term does not appear, return the details of the
        # closest entry.

        if term <= found_term:
            return found_term, offset, frequency, doc_frequency, info_offset

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term.

        else:
            self.info_reader.go_to_term(found_term, offset, info_offset)
            try:
                while term > found_term:
                    found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
            except EOFError:

                # The end of the file was reached: the last term read (which
                # precedes 'term') is reported.

                pass

            return found_term, offset, frequency, doc_frequency, None

    def _find_term(self, term):

        """
        Find the position file offset and frequency of 'term' from the term
        dictionary, returning a tuple of the offset, frequency and document
        frequency, or None if the term is not present.
        """

        closest = self._find_closest_term(term)

        if closest is None:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = closest

        # If the term is found, return the offset and frequencies.

        if term == found_term:
            return offset, frequency, doc_frequency
        else:
            return None

    def _get_positions(self, offset, doc_frequency):

        """
        Return the positions found at the given position file 'offset' for a
        term appearing in 'doc_frequency' documents.
        """

        return self.position_dict_reader.read_term_positions(offset, doc_frequency)

    # Iterator convenience methods.

    def __iter__(self):
        self.rewind()
        return self

    def next(self):
        try:
            return self.read_term()
        except EOFError:
            raise StopIteration

    # Sequential access methods.

    def rewind(self):

        "Return to the start of the term information file."

        self.info_reader.rewind()

    def read_term(self):

        """
        Return the next term, its frequency, its document frequency, and the
        documents and positions at which the term is found.
        """

        term, offset, frequency, doc_frequency = self.info_reader.read_term()
        positions = self._get_positions(offset, doc_frequency)
        return term, frequency, doc_frequency, positions

    # Query methods.

    def find_terms(self, term):

        "Return all terms whose values start with the value of 'term'."

        terms = []

        closest = self._find_closest_term(term)

        # An empty dictionary cannot provide any terms.

        if closest is None:
            return terms

        found_term, offset, frequency, doc_frequency, info_offset = closest

        # Position the reader, if necessary.

        if info_offset is not None:
            self.info_reader.go_to_term(found_term, offset, info_offset)

        # Read and record terms.

        try:
            # Add the found term if it starts with the specified term.

            while found_term.startswith(term):
                terms.append(found_term)
                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()

        except EOFError:
            pass

        return terms

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or None if the term is not present.
        """

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return self._get_positions(offset, doc_frequency)

    def get_frequency(self, term):

        "Return the frequency of the given 'term', or None if it is not present."

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return frequency

    def get_document_frequency(self, term):

        """
        Return the document frequency of the given 'term', or None if it is not
        present.
        """

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return doc_frequency

    def close(self):

        "Close the term information, term index and position readers."

        self.info_reader.close()
        self.index_reader.close()
        self.position_dict_reader.close()
paul@3	1168
paul@9	1169	# Specific classes for storing document information.
paul@9	1170
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Set the document number used as the basis for deltas back to zero."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write a record for the document 'docnum' consisting of the document
        number (as a delta from the previously written one), the number of
        'fields', and each field as an integer identifier followed by its
        compressed string value.

        Return the offset at which the record starts.
        """

        start = self.tell()

        # Document number delta, then the field count.

        delta = docnum - self.last_docnum
        self.write_number(delta)
        self.write_number(len(fields))

        # Each field: identifier followed by the value.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
        return start
paul@8	1204
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Set the document number used as the basis for deltas back to zero."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read the next record from the file, returning a tuple containing the
        document number and a list of field (identifier, value) pairs.
        """

        # The stored number is a delta from the previous document number.

        self.last_docnum += self.read_number()

        # The field count precedes the fields themselves.

        count = self.read_number()

        fields = []
        for _ in range(count):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Seek to 'offset' and read the fields stored there for 'docnum',
        returning a tuple of 'docnum' and the fields. The document number
        computed from the stored delta is discarded in favour of 'docnum', which
        then serves as the basis for scanning later documents.
        """

        self.seek(offset)
        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
paul@12	1252
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Set the document number and offset used as delta bases back to zero."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Record that the fields of document 'docnum' are stored at 'offset' in
        the fields file. Both values are written as deltas from the previously
        written entry.
        """

        docnum_delta = docnum - self.last_docnum
        offset_delta = offset - self.last_offset

        self.write_number(docnum_delta)
        self.write_number(offset_delta)

        self.last_docnum, self.last_offset = docnum, offset
paul@9	1275
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Set the document number and offset used as delta bases back to zero."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read the next index entry, returning a tuple of the document number and
        the fields file offset. Both are stored as deltas and are accumulated
        here.
        """

        docnum_delta = self.read_number()
        self.last_docnum += docnum_delta

        offset_delta = self.read_number()
        self.last_offset += offset_delta

        return self.last_docnum, self.last_offset
paul@9	1294
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' (fields file) and
        'field_index_writer' (fields index file). Every document whose entry
        number is a multiple of 'interval' is also recorded in the index.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        """
        Store the 'fields' of the document 'docnum', adding an index entry if
        the document falls on an indexing interval boundary.
        """

        position = self.field_writer.write_fields(docnum, fields)

        if not self.entry % self.interval:
            self.field_index_writer.write_document(docnum, position)

        self.entry += 1

    def close(self):

        "Close the fields writer and then the fields index writer."

        self.field_writer.close()
        self.field_index_writer.close()
paul@9	1319
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' (fields file) and
        'field_index_reader' (fields index file). The complete index is read
        into memory as a list of (document number, offset) pairs.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        self.docs = []
        read_entry = self.field_index_reader.read_document
        try:
            while 1:
                self.docs.append(read_entry())
        except EOFError:
            pass

        # The offset of the last index entry is used when building probe
        # tuples for searching the index.

        self.max_offset = None
        if self.docs:
            self.max_offset = self.docs[-1][1]

    # Iterator convenience methods.

    def __iter__(self):
        self.rewind()
        return self

    def next(self):
        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Sequential access methods.

    def rewind(self):

        "Return to the start of the fields file."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Return the fields of the document with the given 'docnum', or None if
        no such document is found.
        """

        # Locate the index entry for the document or the closest one
        # preceding it.

        index = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        if index < 0:
            return None

        indexed_docnum, offset = self.docs[index]

        # Start reading from the fields file at the indexed record...

        current, fields = self.field_reader.read_document_fields(indexed_docnum, offset)

        # ...and scan forward until the document is reached or passed.

        try:
            while current < docnum:
                current, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        if current != docnum:
            return None

        return fields

    def close(self):

        "Close the fields reader and then the fields index reader."

        self.field_reader.close()
        self.field_index_reader.close()
paul@8	1402
paul@12	1403	# Dictionary merging classes.
paul@12	1404
class Merger:

    "Merge files."

    def __init__(self, writer, readers):

        """
        Initialise the merger with the 'writer' receiving the merged data and
        the 'readers' providing the data to be merged.
        """

        self.writer, self.readers = writer, readers

    def close(self):

        "Close every reader and then the writer."

        for source in self.readers:
            source.close()

        self.writer.close()
paul@13	1417
class TermDictionaryMerger(Merger):

    "Merge term and position files."

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the writer.

        Records for the same term arriving consecutively from the merged
        readers are combined: their position sources are themselves merged and
        written as a single entry for that term.
        """

        pending_term = None
        pending_positions = []

        for term, frequency, doc_frequency, positions in itermerge(self.readers):

            # A new term: write out whatever was collected for the previous
            # one before starting a new collection.

            if term != pending_term:
                if pending_positions:
                    self.writer.write_term_positions(pending_term, itermerge(pending_positions))
                pending_term = term
                pending_positions = [positions]

            # The same term again: remember these positions as well.

            else:
                pending_positions.append(positions)

        # Write the details of the final term, if any.

        if pending_positions:
            self.writer.write_term_positions(pending_term, itermerge(pending_positions))
paul@12	1442
class FieldDictionaryMerger(Merger):

    "Merge field files."

    def merge(self):

        """
        Merge fields from the readers, sending each document number and its
        fields to the writer in merged order.
        """

        write = self.writer.write_fields

        for record in itermerge(self.readers):
            docnum, fields = record
            write(docnum, fields)
paul@13	1455
paul@13	1456	# Utility functions.
paul@13	1457
def get_term_writer(pathname, partition, interval, doc_interval):

    """
    Return a term dictionary writer whose files reside under 'pathname' and
    are labelled with the given 'partition'. Terms are indexed according to
    'interval' and document position records according to 'doc_interval'.
    """

    def create(template):

        "Open for binary writing the file named by 'template' for the partition."

        return open(join(pathname, template % partition), "wb")

    # The files are created in this order: term information, term index,
    # positions, positions index.

    info_writer = TermWriter(create("terms-%s"))
    index_writer = TermIndexWriter(create("terms_index-%s"))
    positions_writer = PositionWriter(create("positions-%s"))
    positions_index_writer = PositionIndexWriter(create("positions_index-%s"))

    positions_dict_writer = PositionDictionaryWriter(
        positions_writer, positions_index_writer, doc_interval
        )

    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
paul@13	1481
def get_field_writer(pathname, partition, interval):

    """
    Return a field dictionary writer whose files reside under 'pathname' and
    are labelled with the given 'partition', indexing documents according to
    the given 'interval'.
    """

    fields_filename = join(pathname, "fields-%s" % partition)
    field_writer = FieldWriter(open(fields_filename, "wb"))

    index_filename = join(pathname, "fields_index-%s" % partition)
    field_index_writer = FieldIndexWriter(open(index_filename, "wb"))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
paul@13	1497
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader for the files residing under 'pathname'
    which are labelled with the given 'partition'.
    """

    # The term information and term index files are opened immediately.

    info_filename = join(pathname, "terms-%s" % partition)
    info_reader = TermReader(open(info_filename, "rb"))

    index_filename = join(pathname, "terms_index-%s" % partition)
    index_reader = TermIndexReader(open(index_filename, "rb"))

    # The position files are handed over as openers given only pathnames.

    positions_dict_reader = PositionDictionaryReader(
        PositionOpener(join(pathname, "positions-%s" % partition)),
        PositionIndexOpener(join(pathname, "positions_index-%s" % partition))
        )

    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
paul@14	1517
def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader for the files residing under 'pathname'
    which are labelled with the given 'partition'.
    """

    fields_filename = join(pathname, "fields-%s" % partition)
    field_reader = FieldReader(open(fields_filename, "rb"))

    index_filename = join(pathname, "fields_index-%s" % partition)
    field_index_reader = FieldIndexReader(open(index_filename, "rb"))

    return FieldDictionaryReader(field_reader, field_index_reader)
paul@14	1532
def rename_files(pathname, names, from_partition, to_partition):

    """
    Under 'pathname', rename each file having one of the given 'names' and
    labelled with 'from_partition' so that it is labelled with 'to_partition'.
    """

    for name in names:
        source = join(pathname, "%s-%s" % (name, from_partition))
        target = join(pathname, "%s-%s" % (name, to_partition))
        rename(source, target)
paul@14	1536
def rename_term_files(pathname, from_partition, to_partition):

    """
    Relabel the term files (those named in TERM_FILENAMES) under 'pathname'
    from 'from_partition' to 'to_partition'.
    """

    names = TERM_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
paul@14	1539
def rename_field_files(pathname, from_partition, to_partition):

    """
    Relabel the field files (those named in FIELD_FILENAMES) under 'pathname'
    from 'from_partition' to 'to_partition'.
    """

    names = FIELD_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
paul@14	1542
def remove_files(pathname, names, partition):

    """
    Under 'pathname', remove each file having one of the given 'names' and
    labelled with the given 'partition'.
    """

    for name in names:
        filename = "%s-%s" % (name, partition)
        remove(join(pathname, filename))
paul@14	1546
def remove_term_files(pathname, partition):

    """
    Remove the term files (those named in TERM_FILENAMES) under 'pathname'
    which are labelled with the given 'partition'.
    """

    names = TERM_FILENAMES
    remove_files(pathname, names, partition)
paul@14	1549
def remove_field_files(pathname, partition):

    """
    Remove the field files (those named in FIELD_FILENAMES) under 'pathname'
    which are labelled with the given 'partition'.
    """

    names = FIELD_FILENAMES
    remove_files(pathname, names, partition)
paul@14	1552
paul@8	1553	# High-level classes.
paul@8	1554
class Document:

    "A container of document information."

    def __init__(self, docnum):

        """
        Initialise a document having the given 'docnum', no fields and no
        terms.
        """

        self.docnum = docnum

        # A list of (identifier, value) pairs.

        self.fields = []

        # A mapping from terms to lists of positions.

        self.terms = {}

    def add_position(self, term, position):

        "Record that the given 'term' occurs at the given 'position'."

        positions = self.terms.setdefault(term, [])
        positions.append(position)

    def add_field(self, identifier, value):

        """
        Add a field having the given 'identifier' and 'value', the value being
        stored in its Unicode string form.
        """

        text = unicode(value) # convert to string
        self.fields.append((identifier, text))

    def set_fields(self, fields):

        """
        Replace the document's fields with the given 'fields': a list of tuples
        each containing an integer identifier and a string value. The list is
        stored as given, without copying or conversion.
        """

        self.fields = fields
paul@28	1587
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Initialise a writer storing partitions under 'pathname', using the
        indexing 'interval' and 'doc_interval', and flushing accumulated terms
        and fields to new partitions after every 'flush_interval' documents (or
        only upon closing if 'flush_interval' is false).
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # Numbers of the next term and field partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Accumulated data: term -> {docnum: positions} and docnum -> fields.

        self.terms = {}
        self.docs = {}

        # Documents added since the last interval-triggered flush.

        self.doc_counter = 0

    def add_document(self, doc):

        """
        Add the given document 'doc', updating the document counter and flushing
        terms and fields if appropriate.
        """

        docnum = doc.docnum

        for term, positions in doc.terms.items():
            self.terms.setdefault(term, {})[docnum] = positions

        self.docs[docnum] = doc.fields

        self.doc_counter += 1

        if self.flush_interval and self.doc_counter >= self.flush_interval:
            self.flush_terms()
            self.flush_fields()
            self.doc_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(
            self.pathname, self.dict_partition, self.interval, self.doc_interval
            )

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(
            self.pathname, self.field_dict_partition, self.interval
            )

    def flush_terms(self):

        """
        Flush terms into the current term dictionary partition, writing them in
        sorted order, and then start accumulating terms for the next partition.
        """

        pending = self.terms

        ordered_terms = list(pending)
        ordered_terms.sort()

        dict_writer = self.get_term_writer()

        for term in ordered_terms:
            dict_writer.write_term_positions(term, pending[term].items())

        dict_writer.close()

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        """
        Flush fields into the current field dictionary partition, writing them
        in document number order, and then start accumulating fields for the
        next partition.
        """

        ordered_docs = list(self.docs.items())
        ordered_docs.sort()

        field_dict_writer = self.get_field_writer()

        for docnum, fields in ordered_docs:
            field_dict_writer.write_fields(docnum, fields)

        field_dict_writer.close()

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any terms and fields which have not yet been written."

        if self.terms:
            self.flush_terms()

        if self.docs:
            self.flush_fields()
paul@10	1683
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        """
        Open the term and field dictionaries of the "merged" partition found
        under 'pathname'.
        """

        self.dict_reader = get_term_reader(pathname, "merged")
        self.field_dict_reader = get_field_reader(pathname, "merged")

    # Term dictionary queries.

    def find_terms(self, term):

        "Return the result of asking the term dictionary for terms for 'term'."

        terms = self.dict_reader.find_terms(term)
        return terms

    def find_positions(self, term):

        "Return the term dictionary's positions for the given 'term'."

        positions = self.dict_reader.find_positions(term)
        return positions

    def get_frequency(self, term):

        "Return the term dictionary's frequency for the given 'term'."

        frequency = self.dict_reader.get_frequency(term)
        return frequency

    def get_document_frequency(self, term):

        "Return the term dictionary's document frequency for the given 'term'."

        doc_frequency = self.dict_reader.get_document_frequency(term)
        return doc_frequency

    # Field dictionary queries.

    def get_fields(self, docnum):

        "Return the field dictionary's fields for the document 'docnum'."

        fields = self.field_dict_reader.get_fields(docnum)
        return fields

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        self.dict_reader.close()
        self.field_dict_reader.close()
paul@10	1710
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise an index residing in the directory given by 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'. The index directory is created if
        it does not already exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, merging any partitions first. The
        'partition' argument is accepted for compatibility but the reader
        always refers to the merged partition.
        """

        # Ensure that only one partition exists.

        self.merge()
        return self._get_reader(partition)

    def _get_reader(self, partition):

        """
        Return a reader for the index without merging. An OSError is raised if
        the index directory does not exist.
        """

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname)
        return self.reader

    def merge(self):

        "Merge/optimise index partitions."

        self.merge_terms()
        self.merge_fields()

    def _list_partitions(self, prefix):

        """
        Return a list of the labels of the partitions indicated by files in the
        index directory whose names start with 'prefix'.
        """

        skip = len(prefix)
        partitions = []

        for filename in listdir(self.pathname):
            if filename.startswith(prefix):
                partitions.append(filename[skip:])

        return partitions

    def _set_aside_merged(self, partitions, rename_partition):

        """
        If 'partitions' includes an existing merged partition, use
        'rename_partition' (a callable accepting a pathname and two partition
        labels) to relabel it "old-merged", updating 'partitions' accordingly.
        """

        if "merged" in partitions:
            rename_partition(self.pathname, "merged", "old-merged")
            partitions.remove("merged")
            if "old-merged" not in partitions:
                partitions.append("old-merged")

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval'.
        """

        partitions = self._list_partitions("terms-")

        # Write directly to a dictionary.

        if len(partitions) > 1:

            # Any existing merged partition is moved out of the way before
            # readers are opened, so that every reader is created using the
            # name its files keep during the merge and no open file is renamed.

            self._set_aside_merged(partitions, rename_term_files)

            readers = []
            for partition in partitions:
                readers.append(get_term_reader(self.pathname, partition))

            writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
            merger = TermDictionaryMerger(writer, readers)

            # Release the files even if merging fails; the old partitions are
            # only removed after a successful merge.

            try:
                merger.merge()
            finally:
                merger.close()

            # Remove old files.

            for partition in partitions:
                remove_term_files(self.pathname, partition)

        # A single partition only needs to be labelled as the merged one: no
        # reader is opened (and left open) for it.

        elif len(partitions) == 1:
            partition = partitions[0]
            if partition != "merged":
                rename_term_files(self.pathname, partition, "merged")

    def merge_fields(self, interval=FIELD_INTERVAL):

        "Merge field dictionaries using the given indexing 'interval'."

        partitions = self._list_partitions("fields-")

        # Write directly to a dictionary.

        if len(partitions) > 1:

            # Any existing merged partition is moved out of the way before
            # readers are opened, so that every reader is created using the
            # name its files keep during the merge and no open file is renamed.

            self._set_aside_merged(partitions, rename_field_files)

            readers = []
            for partition in partitions:
                readers.append(get_field_reader(self.pathname, partition))

            writer = get_field_writer(self.pathname, "merged", interval)
            merger = FieldDictionaryMerger(writer, readers)

            # Release the files even if merging fails; the old partitions are
            # only removed after a successful merge.

            try:
                merger.merge()
            finally:
                merger.close()

            # Remove old files.

            for partition in partitions:
                remove_field_files(self.pathname, partition)

        # A single partition only needs to be labelled as the merged one: no
        # reader is opened (and left open) for it.

        elif len(partitions) == 1:
            partition = partitions[0]
            if partition != "merged":
                rename_field_files(self.pathname, partition, "merged")

    def close(self):

        "Close and discard the reader and the writer, where present."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	1841
paul@0	1842	# vim: tabstop=4 expandtab shiftwidth=4