iixr (annotate iixr.py in a0f37b0ef350)

iixr

Annotated iixr.py

43:a0f37b0ef350

2009-09-14

Paul Boddie

Added constants for various measures. Prevented unnecessary read cache resets where the cache offset is zero.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@19	21	from os import dup, fdopen # independent iterator access to files
paul@12	22	from os import listdir, mkdir # index and partition discovery
paul@14	23	from os import remove, rename # partition manipulation
paul@7	24	from os.path import exists, join
paul@2	25	from os.path import commonprefix # to find common string prefixes
paul@3	26	from bisect import bisect_right # to find terms in the dictionary index
paul@10	27	import bz2, zlib # for field compression
paul@33	28	from itermerge import itermerge
paul@2	29
# Compatibility shim: interpreters without a built-in 'set' type fall back to
# the Set class from the 'sets' module under the same name.

try:
    set
except NameError:
    from sets import Set as set
paul@21	34
paul@7	35	# Constants.
paul@7	36
# Interval settings.
# NOTE(review): these are presumably the default spacings between index
# entries and the flush threshold used by callers outside this section.

TERM_INTERVAL = 100
DOCUMENT_INTERVAL = 100
FIELD_INTERVAL = 100
FLUSH_INTERVAL = 10000

# Cache settings.
# A writer flushes once its pending data reaches WRITE_CACHE_SIZE characters.
# A reader fetches at least READ_CACHE_SIZE characters when its cache runs
# short, and discards consumed data once its cache offset exceeds
# READ_CACHE_RESIZE.

WRITE_CACHE_SIZE = 100000
READ_CACHE_SIZE = 10000
READ_CACHE_RESIZE = 5000

# Names of the files making up an index.
# NOTE(review): presumably the per-partition filenames; confirm against users.

TERM_FILENAMES = ("terms", "terms_index", "positions", "positions_index")
FIELD_FILENAMES = ("fields", "fields_index")

# Compression functions paired with the one-character flag stored before the
# compressed data, in the order in which they are tried, plus the matching
# decompression functions keyed by the same flag.

compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress),
    ]
decompressors = {
    "b" : bz2.decompress,
    "z" : zlib.decompress,
    }
paul@10	51
paul@35	52	# Utility functions.
paul@35	53
try:
    from vint import vint as _vint

    def vint(number):

        """
        Return 'number' encoded as a variable-length integer string, using the
        accelerated implementation from the 'vint' module.

        Raise ValueError if 'number' is negative.
        """

        if number < 0:
            raise ValueError("Number %r is negative." % number)
        return _vint(number)

except ImportError:

    def vint(number):

        """
        Return 'number' encoded as a variable-length integer string.

        The number is emitted in groups of 7 bits, least significant group
        first, with the top bit of each character set where more groups
        follow.

        Raise ValueError if 'number' is negative.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Special case: one byte containing a 7-bit number.

        if number < 128:
            return chr(number)

        # Write the number from least to most significant digits.

        digits = []

        while number != 0:
            lsd = number & 127
            number >>= 7

            # Flag this digit as being followed by another.

            if number != 0:
                lsd |= 128

            digits.append(chr(lsd))

        return "".join(digits)
paul@35	96
paul@0	97	# Foundation classes.
paul@0	98
class File:

    """
    A basic file abstraction wrapping a file object and providing hooks for
    readers and writers to override.
    """

    def __init__(self, f):

        "Initialise the abstraction with the file object 'f'."

        self.f = f
        self.reset()

    def reset(self):

        """
        Reset any state kept between records. Nothing is kept here; subclasses
        override this to clear their own record state.
        """

        pass

    def rewind(self):

        "Return to the start of the file and discard any record state."

        self.seek(0)
        self.reset()

    def seek(self, offset):

        "Move to the given 'offset'. To be defined by readers."

        pass

    def flush(self):

        "Write out any pending data. To be defined by writers."

        pass

    def close(self):

        """
        Flush and close the underlying file object, then forget it so that
        closing again has no effect.
        """

        if self.f is None:
            return

        self.flush()
        self.f.close()
        self.f = None
paul@0	134
class FileWriter(File):

    "Writing basic data types to files through a write cache."

    def __init__(self, f):

        "Initialise the writer for the file object 'f' with an empty cache."

        File.__init__(self, f)
        self.cache = []
        self.cache_length = 0

    def write_number(self, number):

        "Write 'number' to the file using a variable length encoding."

        encoded = vint(number)
        self.write(encoded)

    def write_string(self, s, compress=0):

        """
        Write 's' to the file preceded by its length. Unicode objects are
        encoded as UTF-8 first.

        If 'compress' is set to a true value, a one-character flag is written
        before the length: the flag of the first compressor producing a result
        shorter than the original data, or "-" if none does and the data is
        stored as it is.
        """

        # Convert Unicode objects to strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        # Without compression, no flag is written at all.

        if not compress:
            flag = ""

        # Otherwise, take the first result shorter than the original.

        else:
            flag = "-"
            for candidate, compress_fn in compressors:
                compressed = compress_fn(s)
                if len(compressed) < len(s):
                    flag, s = candidate, compressed
                    break

        # Write the length of the data before the data itself.

        self.write(flag + vint(len(s)) + s)

    # Cache-affected methods.

    def write(self, s):

        """
        Add 's' to the cache, flushing the cache to the file once it holds at
        least WRITE_CACHE_SIZE characters.
        """

        self.cache.append(s)
        self.cache_length += len(s)

        if self.cache_length < WRITE_CACHE_SIZE:
            return

        self.flush()

    def tell(self):

        "Return the file position that the next written data will occupy."

        written = self.f.tell()
        return written + self.cache_length

    def flush(self):

        "Write all cached data to the file and empty the cache."

        pending = "".join(self.cache)
        self.f.write(pending)
        self.cache, self.cache_length = [], 0
paul@40	199
class FileReader(File):

    """
    Reading basic data types from files through a read cache.

    The cache holds data already fetched from the file. The underlying file
    position is kept at the end of the cached data, 'cache_start' refers to
    the next unread character in the cache, and 'cache_length' is the length
    of the cache.
    """

    def __init__(self, f):

        "Initialise the reader for the file object 'f' with an empty cache."

        File.__init__(self, f)
        self.reset_cache()

    def reset_cache(self):

        "Discard all cached data."

        self.cache = ""
        self.cache_length = 0
        self.cache_start = 0

    def read_number(self):

        """
        Read a number stored using a variable length encoding from the file.

        Raise EOFError if the data ends before the number is complete.
        """

        # Read each byte, adding it to the number.

        shift = 0
        number = 0
        read = self.read

        try:
            csd = ord(read(1))

            # A set top bit indicates that another digit follows.

            while csd & 128:
                number += ((csd & 127) << shift)
                shift += 7
                csd = ord(read(1))

            number += (csd << shift)

        # At the end of the file, read returns an empty string which ord
        # rejects with a TypeError.

        except TypeError:
            raise EOFError

        return number

    def read_string(self, decompress=0):

        """
        Read a string from the file, decompressing the stored data if
        'decompress' is set to a true value. In that case, a one-character
        flag is expected before the length of the data, with "-" indicating
        uncompressed data. The result is a Unicode object decoded from UTF-8.
        """

        # Read the compression flag if requested.

        if decompress:
            flag = self.read(1)
        else:
            flag = "-"

        length = self.read_number()
        s = self.read(length)

        # Perform decompression if applicable.

        if flag != "-":
            fn = decompressors[flag]
            s = fn(s)

        # Convert strings to Unicode objects.

        return unicode(s, "utf-8")

    # Cache-affected methods.

    def read(self, n):

        """
        Read up to 'n' characters, returning fewer only where the file does not
        provide enough data.
        """

        needed = n - (self.cache_length - self.cache_start)

        # Read the needed number of characters, if possible, fetching at least
        # READ_CACHE_SIZE characters at a time.

        if needed > 0:
            s = self.f.read(max(needed, READ_CACHE_SIZE))
            self.cache += s
            self.cache_length += len(s)

        # Get the end of the requested block.

        next_start = self.cache_start + n
        s = self.cache[self.cache_start:next_start]

        # Reposition the pointer to the cache.

        self._seek_cache(len(s))
        return s

    def tell(self):

        "Return the file position of the next character to be read."

        return self.f.tell() - self.cache_length + self.cache_start

    def seek(self, offset):

        """
        Move to the given 'offset' in the file.

        Where the offset refers to data still held in the cache, only the
        cache pointer is moved: the underlying file must stay positioned at
        the end of the cached data for 'tell' and later cache refills to
        remain correct. Otherwise, the file is repositioned and the cache is
        discarded.
        """

        current = self.tell()
        remaining = self.cache_length - self.cache_start

        # If seeking forward within the cache, just navigate the cache.

        if current <= offset < current + remaining:
            self._seek_cache(offset - current)

        # Otherwise, move in the file itself and start with an empty cache.

        else:
            self.f.seek(offset)
            self.reset_cache()

    def _seek_cache(self, delta):

        "Advance the cache pointer by 'delta' characters."

        next_start = self.cache_start + delta

        # Where the cache is exhausted, discard it.

        if next_start > 0 and next_start >= len(self.cache):
            self.reset_cache()

        # If the cache is too big, resize it.

        elif next_start > READ_CACHE_RESIZE:
            self.cache = self.cache[next_start:]
            self.cache_length = len(self.cache)
            self.cache_start = 0

        # Otherwise, just reference the next part of the cache.

        else:
            self.cache_start = next_start
paul@40	316
class FileOpener:

    "Opening files on demand using a remembered filename."

    def __init__(self, filename):

        "Remember the 'filename' of the file to be opened."

        self.filename = filename

    def open(self, mode):

        "Return a newly opened file object for the file using 'mode'."

        return open(self.filename, mode)

    def close(self):

        "Do nothing: the opener itself holds no open file."

        pass
paul@34	329
paul@9	330	# Specific classes for storing term and position information.
paul@0	331
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Start a new sequence of records with document numbers from zero."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions', sorting the
        'positions' list in place. Return the offset of the written record.

        Raise ValueError if 'docnum' is less than the previously written
        document number.
        """

        previous = self.last_docnum

        if docnum < previous:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, previous))

        # Record the offset of this record.

        offset = self.tell()

        # Make sure that the positions are sorted.

        positions.sort()

        # Encode each position as the difference from the one before.

        deltas = []
        last = 0

        for position in positions:
            deltas.append(vint(position - last))
            last = position

        # Write the document number delta, the number of positions, and then
        # the positions themselves as a single record.

        header = vint(docnum - previous) + vint(len(positions))
        self.write(header + "".join(deltas))

        self.last_docnum = docnum
        return offset
paul@0	374
class PositionOpener(FileOpener):

    "Reading position information from files."

    def read_term_positions(self, offset, count):

        """
        Return an iterator over positions found from 'offset' in the file. The
        number of documents available for reading is limited to 'count'.
        """

        # Open the file afresh so that the iterator has its own file object.

        return PositionIterator(self.open("rb"), offset, count)
paul@19	391
class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        "Start a new sequence of entries with deltas measured from zero."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum', 'pos_offset' and document 'count' to the
        position index file. Return the offset of the written entry.
        """

        # Record the offset of this record.

        offset = self.tell()

        # Encode the document number delta.

        docnum_delta = vint(docnum - self.last_docnum)
        self.last_docnum = docnum

        # Encode the position file offset delta.

        pos_offset_delta = vint(pos_offset - self.last_pos_offset)
        self.last_pos_offset = pos_offset

        # Write both deltas followed by the document count.

        self.write(docnum_delta + pos_offset_delta + vint(count))

        return offset
paul@19	431
class PositionIndexOpener(FileOpener):

    "Reading position index information from files."

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator over position index entries found from 'offset' in
        the file. The number of documents covered by the entries is limited to
        'doc_frequency'.
        """

        # Open the file afresh so that the iterator has its own file object.

        return PositionIndexIterator(self.open("rb"), offset, doc_frequency)
paul@34	448
paul@34	449	# Iterators for position-related files.
paul@34	450
paul@34	451	class IteratorBase:
paul@34	452
paul@34	453	def __init__(self, count):
paul@34	454	self.replenish(count)
paul@34	455
paul@34	456	def replenish(self, count):
paul@34	457	self.count = count
paul@34	458	self.read_documents = 0
paul@34	459
paul@34	460	def __len__(self):
paul@34	461	return self.count
paul@34	462
paul@34	463	def sort(self):
paul@34	464	pass # Stored document positions are already sorted.
paul@34	465
paul@34	466	def __iter__(self):
paul@34	467	return self
paul@34	468
class PositionIterator(FileReader, IteratorBase):

    "Iterating over document positions."

    def __init__(self, f, offset, count):

        """
        Initialise the iterator for the file object 'f', reading at most
        'count' documents starting at the given 'offset'.
        """

        FileReader.__init__(self, f)
        IteratorBase.__init__(self, count)
        self.seek(offset)

    def reset(self):

        "Start a new sequence of records with document numbers from zero."

        self.last_docnum = 0

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        # The document number is stored as a delta from the previous one.

        self.last_docnum += self.read_number()

        # The number of positions precedes the positions themselves.

        npositions = self.read_number()

        # Accumulate the position deltas to obtain absolute positions.

        positions = []
        position = 0

        for _ in range(npositions):
            position += self.read_number()
            positions.append(position)

        return self.last_docnum, positions

    def next(self):

        """
        Read positions for a single document, raising StopIteration once the
        document limit has been reached.
        """

        if self.read_documents >= self.count:
            raise StopIteration

        self.read_documents += 1
        return self.read_positions()
paul@34	516
class PositionIndexIterator(FileReader, IteratorBase):

    "Iterating over position index entries."

    def __init__(self, f, offset, count):

        """
        Initialise the iterator for the file object 'f', reading entries from
        'offset' until they account for 'count' documents.
        """

        FileReader.__init__(self, f)
        IteratorBase.__init__(self, count)
        self.seek(offset)

        # No section has been read yet.

        self.section_count = 0

    def reset(self):

        "Start a new sequence of entries with deltas measured from zero."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read a document number, a position file offset for the position index
        file, and the number of documents in a section of that file.
        """

        # Both the document number and the offset are stored as deltas.

        self.last_docnum += self.read_number()
        self.last_pos_offset += self.read_number()

        # The document count is stored as it is.

        count = self.read_number()

        return self.last_docnum, self.last_pos_offset, count

    def next(self):

        """
        Read the next index entry, raising StopIteration once the sections
        already read account for the document limit.
        """

        # Account for the documents covered by the previously read section.

        self.read_documents += self.section_count

        if self.read_documents >= self.count:
            raise StopIteration

        entry = self.read_positions()
        self.section_count = entry[2]
        return entry
paul@19	562
class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Initialise the writer with a 'position_writer', a
        'position_index_writer', and the 'interval' giving the number of
        documents covered by each full index entry.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file, sorting the collection in place.

        Add some records to the index, making dictionary entries.

        Return a tuple containing the offset of the first index entry written
        (None if nothing was written), the frequency (number of positions),
        and document frequency (number of documents) for the term involved.
        """

        position_writer = self.position_writer
        index_writer = self.position_index_writer
        interval = self.interval

        # Reset the writers.

        position_writer.reset()
        index_writer.reset()

        index_offset = None
        frequency = 0
        count = 0

        # Details of the first record in the section being written.

        section_docnum = None
        section_offset = None

        doc_positions.sort()

        for docnum, positions in doc_positions:
            pos_offset = position_writer.write_positions(docnum, positions)

            # Retain the first record offset for a subsequent index entry.

            if section_offset is None:
                section_offset = pos_offset
                section_docnum = docnum

            frequency += len(positions)
            count += 1

            # Only complete sections get an index entry here.

            if count % interval != 0:
                continue

            entry_offset = index_writer.write_positions(section_docnum, section_offset, interval)

            # Remember the first index entry offset.

            if index_offset is None:
                index_offset = entry_offset

            section_offset = None
            section_docnum = None

            # Reset the position writer so that position readers accessing
            # a section start with the correct document number.

            position_writer.reset()

        # Finish writing an index entry for the remaining documents.

        if section_offset is not None:
            entry_offset = index_writer.write_positions(section_docnum, section_offset, count % interval)

            # Remember the first index entry offset.

            if index_offset is None:
                index_offset = entry_offset

        return index_offset, frequency, count

    def close(self):

        "Close both underlying writers."

        self.position_writer.close()
        self.position_index_writer.close()
paul@19	647
class PositionDictionaryReader:

    "Reading position dictionaries."

    def __init__(self, position_opener, position_index_opener):

        """
        Initialise the reader with the 'position_opener' and
        'position_index_opener' used to open files for each iterator.
        """

        self.position_opener = position_opener
        self.position_index_opener = position_index_opener

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator for dictionary entries starting at 'offset' with the
        given 'doc_frequency'.
        """

        iterator = PositionDictionaryIterator(
            self.position_opener, self.position_index_opener,
            offset, doc_frequency)
        return iterator

    def close(self):

        "Do nothing: the reader itself holds no open files."

        pass
paul@19	668
paul@19	669	class PositionDictionaryIterator:
paul@19	670
paul@19	671	"Iteration over position dictionary entries."
paul@19	672
paul@34	673	def __init__(self, position_opener, position_index_opener, offset, doc_frequency):
paul@34	674	self.position_opener = position_opener
paul@20	675	self.doc_frequency = doc_frequency
paul@34	676	self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
paul@34	677	self.iterator = None
paul@19	678
paul@22	679	# Remember the last values.
paul@22	680
paul@22	681	self.found_docnum, self.found_positions = None, None
paul@22	682
paul@21	683	# Maintain state for the next index entry, if read.
paul@21	684
paul@21	685	self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
paul@21	686
paul@21	687	# Initialise the current index entry and current position file iterator.
paul@21	688
paul@21	689	self._next_section()
paul@21	690	self._init_section()
paul@0	691
paul@34	692	# Sequence methods.
paul@34	693
paul@20	694	def __len__(self):
paul@20	695	return self.doc_frequency
paul@20	696
paul@20	697	def sort(self):
paul@20	698	pass
paul@20	699
paul@34	700	# Iterator methods.
paul@34	701
paul@18	702	def __iter__(self):
paul@18	703	return self
paul@18	704
paul@18	705	def next(self):
paul@0	706
paul@21	707	"""
paul@21	708	Attempt to get the next document record from the section in the
paul@21	709	positions file.
paul@21	710	"""
paul@19	711
paul@22	712	# Return any visited but unrequested record.
paul@22	713
paul@22	714	if self.found_docnum is not None:
paul@22	715	t = self.found_docnum, self.found_positions
paul@22	716	self.found_docnum, self.found_positions = None, None
paul@22	717	return t
paul@22	718
paul@22	719	# Or search for the next record.
paul@22	720
paul@19	721	while 1:
paul@19	722
paul@19	723	# Either return the next record.
paul@19	724
paul@19	725	try:
paul@19	726	return self.iterator.next()
paul@0	727
paul@19	728	# Or, where a section is finished, get the next section and try again.
paul@19	729
paul@19	730	except StopIteration:
paul@20	731
paul@20	732	# Where a section follows, update the index iterator, but keep
paul@20	733	# reading using the same file iterator (since the data should
paul@20	734	# just follow on from the last section).
paul@20	735
paul@21	736	self._next_section()
paul@19	737	self.iterator.replenish(self.section_count)
paul@19	738
paul@22	739	# Reset the state of the iterator to make sure that document
paul@22	740	# numbers are correct.
paul@22	741
paul@22	742	self.iterator.reset()
paul@22	743
paul@22	744	def from_document(self, docnum):
paul@21	745
paul@21	746	"""
paul@21	747	Attempt to navigate to a positions entry for the given 'docnum',
paul@22	748	returning the positions for 'docnum', or None otherwise.
paul@21	749	"""
paul@21	750
paul@22	751	# Return any unrequested document positions.
paul@22	752
paul@22	753	if docnum == self.found_docnum:
paul@22	754	return self.found_positions
paul@22	755
paul@21	756	# Read ahead in the index until the next entry refers to a document
paul@21	757	# later than the desired document.
paul@21	758
paul@21	759	try:
paul@21	760	if self.next_docnum is None:
paul@21	761	self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
paul@21	762
paul@22	763	# Read until the next entry is after the desired document number,
paul@22	764	# or until the end of the results.
paul@22	765
paul@22	766	while self.next_docnum <= docnum:
paul@21	767	self._next_read_section()
paul@22	768	if self.docnum < docnum:
paul@22	769	self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
paul@22	770	else:
paul@22	771	break
paul@21	772
paul@21	773	except StopIteration:
paul@21	774	pass
paul@21	775
paul@21	776	# Navigate in the position file to the document.
paul@21	777
paul@21	778	self._init_section()
paul@19	779
paul@21	780	try:
paul@21	781	while 1:
paul@22	782	found_docnum, found_positions = self.iterator.next()
paul@22	783
paul@24	784	# Return the desired document positions or None (retaining the
paul@24	785	# positions for the document immediately after).
paul@22	786
paul@21	787	if docnum == found_docnum:
paul@22	788	return found_positions
paul@23	789	elif docnum < found_docnum:
paul@22	790	self.found_docnum, self.found_positions = found_docnum, found_positions
paul@21	791	return None
paul@22	792
paul@21	793	except StopIteration:
paul@21	794	return None
paul@21	795
paul@21	796	# Internal methods.
paul@21	797
paul@21	798	def _next_section(self):
paul@21	799
paul@21	800	"Attempt to get the next section in the index."
paul@21	801
paul@21	802	if self.next_docnum is None:
paul@21	803	self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
paul@21	804	else:
paul@21	805	self._next_read_section()
paul@21	806
paul@21	807	def _next_read_section(self):
paul@21	808
paul@21	809	"""
paul@21	810	Make the next index entry the current one without reading from the
paul@21	811	index.
paul@21	812	"""
paul@21	813
paul@21	814	self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
paul@22	815	self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
paul@21	816
paul@21	817	def _init_section(self):
paul@21	818
paul@21	819	"Initialise the iterator for the section in the position file."
paul@21	820
paul@34	821	if self.iterator is not None:
paul@34	822	self.iterator.close()
paul@34	823	self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
paul@34	824
paul@34	825	def close(self):
paul@34	826	if self.iterator is not None:
paul@34	827	self.iterator.close()
paul@34	828	self.iterator = None
paul@34	829	if self.index_iterator is not None:
paul@34	830	self.index_iterator.close()
paul@34	831	self.index_iterator = None
paul@0	832
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Start a new sequence of terms with no previous term or offset."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. Return the offset after the term information was
        written to the file.
        """

        # Write the length of the prefix shared with the previous term,
        # followed by the remainder of the term.

        common = len(commonprefix([self.last_term, term]))

        self.write_number(common)
        self.write_string(term[common:])

        # Write the offset delta, the frequency and the document frequency.

        for number in (offset - self.last_offset, frequency, doc_frequency):
            self.write_number(number)

        self.last_term, self.last_offset = term, offset

        return self.tell()
paul@3	874
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Start a new sequence of terms with no previous term or offset."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency and its document
        frequency from the term information file.
        """

        # Rebuild the term from the prefix shared with the previous term and
        # the stored suffix.

        prefix = self.last_term[:self.read_number()]
        self.last_term = prefix + self.read_string()

        # The offset is stored as a delta from the previous offset.

        self.last_offset += self.read_number()

        # The frequency and document frequency are stored as they are.

        frequency = self.read_number()
        doc_frequency = self.read_number()

        return self.last_term, self.last_offset, frequency, doc_frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
        permits the scanning for later terms from the specified term.
        """

        self.seek(info_offset)

        # Later entries are stored relative to this term and offset.

        self.last_term, self.last_offset = term, offset
paul@3	921
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Start a new sequence of entries with deltas measured from zero."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, doc_frequency, info_offset):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' to the term dictionary index file, along with the
        'info_offset' in the term information file.
        """

        TermWriter.write_term(self, term, offset, frequency, doc_frequency)

        # Write the information file offset delta.

        delta = info_offset - self.last_info_offset
        self.write_number(delta)
        self.last_info_offset = info_offset
paul@3	944
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Start a new sequence of entries with deltas measured from zero."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency, its document
        frequency and a term information file offset from the term dictionary
        index file.
        """

        details = TermReader.read_term(self)
        term, offset, frequency, doc_frequency = details

        # The information file offset is stored as a delta.

        self.last_info_offset += self.read_number()

        return term, offset, frequency, doc_frequency, self.last_info_offset
paul@3	968
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_dict_writer, interval):

        """
        Initialise the writer with an 'info_writer' for term information, an
        'index_writer' for the term index, a 'position_dict_writer' for
        position data, and the 'interval' between indexed terms.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_dict_writer = position_dict_writer
        self.interval = interval

        # The number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency, doc_frequency):

        """
        Write the given 'term', its position file 'offset', its 'frequency' and
        its 'doc_frequency' (number of documents in which it appears) to the
        term information file. Every term whose entry number is a multiple of
        the interval is also written to the index, together with the offset
        reported by the term information writer.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)

        indexed = (self.entry % self.interval == 0)

        if indexed:
            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        details = self.position_dict_writer.write_term_positions(doc_positions)
        offset, frequency, doc_frequency = details
        self._write_term(term, offset, frequency, doc_frequency)

    def close(self):

        "Close the term information, term index and position writers."

        self.info_writer.close()
        self.index_writer.close()
        self.position_dict_writer.close()
paul@3	1010
class TermDictionaryReader:

    """
    Reading term dictionaries: the term index is held in memory and used to
    find a starting point in the term information file, which is then scanned
    sequentially for the requested term.
    """

    def __init__(self, info_reader, index_reader, position_dict_reader):

        """
        Initialise the reader with the given 'info_reader' (term information),
        'index_reader' (term index) and 'position_dict_reader' (positions). The
        complete term index is read immediately.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_dict_reader = position_dict_reader

        # Read index entries until the index reader signals the end of its
        # data.

        self.terms = []
        try:
            while 1:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # Large numbers for ordering purposes: an offset greater than any
        # indexed offset makes a search key sort after the entry for the same
        # term.

        if self.terms:
            self.max_offset = self.terms[-1][1] + 1
        else:
            self.max_offset = None

    def _find_closest_entry(self, term):

        """
        Find the offsets and frequencies of 'term' from the term dictionary or
        the closest term starting with the value of 'term'.

        Return the closest index entry consisting of a term, the position file
        offset, the term frequency, the document frequency, and the term details
        file offset. Return None if the index has no entries at all.
        """

        # An empty index cannot provide any entry: indexing the first entry
        # below would otherwise raise IndexError.

        if not self.terms:
            return None

        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1

        # Get the entry position providing the term or one preceding it.
        # If no entry precedes the requested term, return the very first entry
        # as the closest.

        if i == -1:
            return self.terms[0]
        else:
            return self.terms[i]

    def _find_closest_term(self, term):

        """
        Find the offsets and frequencies of 'term' from the term dictionary or
        the closest term starting with the value of 'term'.

        Return the closest term (or the term itself), the position file offset,
        the term frequency, the document frequency, and the term details file
        offset (or None if the reader is already positioned). Return None
        instead of a tuple if the dictionary index is empty.
        """

        entry = self._find_closest_entry(term)
        if entry is None:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = entry

        # Where the term is found immediately, return the offset and
        # frequencies. If the term does not appear, return the details of the
        # closest entry.

        if term <= found_term:
            return found_term, offset, frequency, doc_frequency, info_offset

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term.

        else:
            self.info_reader.go_to_term(found_term, offset, info_offset)
            try:
                while term > found_term:
                    found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
            except EOFError:
                pass

            return found_term, offset, frequency, doc_frequency, None

    def _find_term(self, term):

        """
        Find the position file offset and frequency of 'term' from the term
        dictionary, returning a tuple of the form (offset, frequency, document
        frequency), or None if the term is not present.
        """

        closest = self._find_closest_term(term)
        if closest is None:
            return None

        found_term, offset, frequency, doc_frequency, info_offset = closest

        # If the term is found, return the offset and frequencies.

        if term == found_term:
            return offset, frequency, doc_frequency
        else:
            return None

    def _get_positions(self, offset, doc_frequency):

        "Return the positions stored at 'offset' for 'doc_frequency' documents."

        return self.position_dict_reader.read_term_positions(offset, doc_frequency)

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind the term information reader and return this object."

        self.rewind()
        return self

    def next(self):

        "Return the next term record, raising StopIteration at the end."

        try:
            return self.read_term()
        except EOFError:
            raise StopIteration

    # Sequential access methods.

    def rewind(self):

        "Rewind the term information reader."

        self.info_reader.rewind()

    def read_term(self):

        """
        Return the next term, its frequency, its document frequency, and the
        documents and positions at which the term is found.
        """

        term, offset, frequency, doc_frequency = self.info_reader.read_term()
        positions = self._get_positions(offset, doc_frequency)
        return term, frequency, doc_frequency, positions

    # Query methods.

    def find_terms(self, term):

        "Return all terms whose values start with the value of 'term'."

        terms = []

        closest = self._find_closest_term(term)

        # An empty dictionary provides no terms.

        if closest is None:
            return terms

        found_term, offset, frequency, doc_frequency, info_offset = closest

        # Position the reader, if necessary.

        if info_offset is not None:
            self.info_reader.go_to_term(found_term, offset, info_offset)

        # Read and record terms.

        try:
            # Add the found term if it starts with the specified term.

            while found_term.startswith(term):
                terms.append(found_term)
                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()

        except EOFError:
            pass

        return terms

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or None if the term is not present.
        """

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return self._get_positions(offset, doc_frequency)

    def get_frequency(self, term):

        "Return the frequency of the given 'term', or None if it is not present."

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return frequency

    def get_document_frequency(self, term):

        """
        Return the document frequency of the given 'term', or None if it is not
        present.
        """

        t = self._find_term(term)
        if t is None:
            return None
        else:
            offset, frequency, doc_frequency = t
            return doc_frequency

    def close(self):

        "Close the information, index and position readers."

        self.info_reader.close()
        self.index_reader.close()
        self.position_dict_reader.close()
paul@3	1202
paul@9	1203	# Specific classes for storing document information.
paul@9	1204
class FieldWriter(FileWriter):

    "A writer of document field records."

    def reset(self):

        "Forget the last document number written."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write a record for the document with the given 'docnum' consisting of
        the given 'fields': a list of (identifier, value) pairs, identifiers
        being integers and values being strings.

        Return the offset at which the record starts.
        """

        start = self.tell()

        # The record header: the document number as a delta from the previous
        # document number, followed by the number of fields.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(fields))

        # The record body: each identifier followed by its value.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
        return start
paul@8	1238
class FieldReader(FileReader):

    "A reader of document field records."

    def reset(self):

        "Forget the last document number read."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read the next record from the file, returning a tuple containing the
        document number and a list of (identifier, value) pairs.
        """

        # The document number is stored as a delta from the previous one.

        self.last_docnum += self.read_number()

        # The field count precedes the fields themselves.

        remaining = self.read_number()
        pairs = []

        while remaining > 0:
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            pairs.append((identifier, value))
            remaining -= 1

        return self.last_docnum, pairs

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.
        """

        self.seek(offset)

        # The document number computed by read_fields is meaningless here since
        # the delta is applied to an unrelated previous number: it is discarded
        # and replaced by the number supplied.

        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
paul@12	1286
class FieldIndexWriter(FileWriter):

    "A writer of field index entries."

    def reset(self):

        "Forget the last document number and offset written."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write an entry for the document with the given 'docnum' recording the
        'offset' of its fields in the fields file.
        """

        # Both values are stored as deltas from the previous entry.

        self.write_number(docnum - self.last_docnum)
        self.write_number(offset - self.last_offset)

        self.last_docnum, self.last_offset = docnum, offset
paul@9	1309
class FieldIndexReader(FileReader):

    "A reader of field index entries."

    def reset(self):

        "Forget the last document number and offset read."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read the next entry, returning a tuple containing a document number and
        the fields file offset of that document.
        """

        # Accumulate the stored numbers, document number first.

        docnum_delta = self.read_number()
        self.last_docnum = self.last_docnum + docnum_delta

        offset_delta = self.read_number()
        self.last_offset = self.last_offset + offset_delta

        return (self.last_docnum, self.last_offset)
paul@9	1328
class FieldDictionaryWriter:

    """
    A writer of field dictionaries: every document is written to the fields
    file, and every 'interval'-th document is also recorded in the index.
    """

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer',
        'field_index_writer' and indexing 'interval'.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # The number of documents written so far.

        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write the 'fields' of the document having the given 'docnum'."

        position = self.field_writer.write_fields(docnum, fields)

        # Index the first document and every 'interval'-th one thereafter.

        if not self.entry % self.interval:
            self.field_index_writer.write_document(docnum, position)

        self.entry = self.entry + 1

    def close(self):

        "Close the field writer and then the field index writer."

        self.field_writer.close()
        self.field_index_writer.close()
paul@9	1353
class FieldDictionaryReader:

    """
    A reader of field dictionaries: the field index is held in memory and used
    to find a starting point in the fields file for a requested document.
    """

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader'. The complete field index is read immediately.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Collect index entries until the index reader signals the end of its
        # data.

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # The largest indexed offset, used when building search keys.

        self.max_offset = None
        if self.docs:
            self.max_offset = self.docs[-1][1]

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind the field reader and return this object."

        self.rewind()
        return self

    def next(self):

        "Return the next document record, raising StopIteration at the end."

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Sequential access methods.

    def rewind(self):

        "Rewind the field reader."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Return the fields of the document with the given 'docnum', or None if
        no such document is found.
        """

        insertion_point = bisect_right(self.docs, (docnum, self.max_offset))

        # No index entry at or before the requested document.

        if insertion_point == 0:
            return None

        # Start reading at the indexed document at or preceding the requested
        # one.

        indexed_docnum, offset = self.docs[insertion_point - 1]
        current, fields = self.field_reader.read_document_fields(indexed_docnum, offset)

        # Scan forwards until the document is reached or passed, or until the
        # data ends.

        try:
            while current < docnum:
                current, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        if current != docnum:
            return None

        return fields

    def close(self):

        "Close the field reader and then the field index reader."

        self.field_reader.close()
        self.field_index_reader.close()
paul@8	1436
paul@12	1437	# Dictionary merging classes.
paul@12	1438
class Merger:

    "A base for classes combining the output of many readers in one writer."

    def __init__(self, writer, readers):

        "Initialise the merger with the given 'writer' and 'readers'."

        self.writer = writer
        self.readers = readers

    def close(self):

        "Close each of the readers and then the writer."

        for source in self.readers:
            source.close()

        self.writer.close()
paul@13	1451
class TermDictionaryMerger(Merger):

    "A merger of term and position files."

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the writer.
        Records from different readers for the same term are combined into a
        single record.
        """

        pending_term = None
        pending_positions = []

        for term, frequency, doc_frequency, positions in itermerge(self.readers):

            # A new term: write out whatever was collected for the previous
            # one and start collecting afresh.

            if term != pending_term:
                if pending_positions:
                    self.writer.write_term_positions(pending_term, itermerge(pending_positions))
                pending_term = term
                pending_positions = [positions]

            # The same term from another reader.

            else:
                pending_positions.append(positions)

        # Write out the final term, if any.

        if pending_positions:
            self.writer.write_term_positions(pending_term, itermerge(pending_positions))
paul@12	1476
class FieldDictionaryMerger(Merger):

    "A merger of field files."

    def merge(self):

        """
        Send every document record provided by the merged readers to the
        writer.
        """

        for number, document_fields in itermerge(self.readers):
            self.writer.write_fields(number, document_fields)
paul@13	1489
paul@13	1490	# Utility functions.
paul@13	1491
def get_term_writer(pathname, partition, interval, doc_interval):

    """
    Return a term dictionary writer whose files reside under the given
    'pathname' and are labelled with the given 'partition'. Terms are indexed
    using the given 'interval' and document position records using the given
    'doc_interval'.
    """

    def create(pattern):

        "Open for writing the file whose name is made from 'pattern'."

        return open(join(pathname, pattern % partition), "wb")

    # Files are created in this order: terms, terms index, positions,
    # positions index.

    info_writer = TermWriter(create("terms-%s"))
    index_writer = TermIndexWriter(create("terms_index-%s"))
    positions_writer = PositionWriter(create("positions-%s"))
    positions_index_writer = PositionIndexWriter(create("positions_index-%s"))

    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)

    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
paul@13	1515
def get_field_writer(pathname, partition, interval):

    """
    Return a field dictionary writer whose files reside under the given
    'pathname' and are labelled with the given 'partition'. Documents are
    indexed using the given 'interval'.
    """

    def create(pattern):

        "Open for writing the file whose name is made from 'pattern'."

        return open(join(pathname, pattern % partition), "wb")

    # The fields file is created before the fields index file.

    field_writer = FieldWriter(create("fields-%s"))
    field_index_writer = FieldIndexWriter(create("fields_index-%s"))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
paul@13	1531
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader whose files reside under the given
    'pathname' and are labelled with the given 'partition'.
    """

    def locate(pattern):

        "Return the path of the file whose name is made from 'pattern'."

        return join(pathname, pattern % partition)

    # The term files are opened here.

    info_reader = TermReader(open(locate("terms-%s"), "rb"))
    index_reader = TermIndexReader(open(locate("terms_index-%s"), "rb"))

    # The position files are not opened here: the openers are given only the
    # paths.

    positions_opener = PositionOpener(locate("positions-%s"))
    positions_index_opener = PositionIndexOpener(locate("positions_index-%s"))
    positions_dict_reader = PositionDictionaryReader(positions_opener, positions_index_opener)

    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
paul@14	1551
def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader whose files reside under the given
    'pathname' and are labelled with the given 'partition'.
    """

    def access(pattern):

        "Open for reading the file whose name is made from 'pattern'."

        return open(join(pathname, pattern % partition), "rb")

    # The fields file is opened before the fields index file.

    field_reader = FieldReader(access("fields-%s"))
    field_index_reader = FieldIndexReader(access("fields_index-%s"))

    return FieldDictionaryReader(field_reader, field_index_reader)
paul@14	1566
def rename_files(pathname, names, from_partition, to_partition):

    """
    Under the given 'pathname', rename each file whose name starts with one of
    the given 'names' and is labelled with 'from_partition' so that it is
    labelled with 'to_partition' instead.
    """

    for name in names:
        source = join(pathname, "%s-%s" % (name, from_partition))
        target = join(pathname, "%s-%s" % (name, to_partition))
        rename(source, target)
paul@14	1570
def rename_term_files(pathname, from_partition, to_partition):

    """
    Relabel the files named by TERM_FILENAMES under 'pathname', changing the
    label from 'from_partition' to 'to_partition'.
    """

    return rename_files(pathname, TERM_FILENAMES, from_partition, to_partition)
paul@14	1573
def rename_field_files(pathname, from_partition, to_partition):

    """
    Relabel the files named by FIELD_FILENAMES under 'pathname', changing the
    label from 'from_partition' to 'to_partition'.
    """

    return rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
paul@14	1576
def remove_files(pathname, names, partition):

    """
    Under the given 'pathname', remove each file whose name starts with one of
    the given 'names' and is labelled with the given 'partition'.
    """

    for name in names:
        doomed = join(pathname, "%s-%s" % (name, partition))
        remove(doomed)
paul@14	1580
def remove_term_files(pathname, partition):

    """
    Remove the files named by TERM_FILENAMES under 'pathname' which are
    labelled with the given 'partition'.
    """

    return remove_files(pathname, TERM_FILENAMES, partition)
paul@14	1583
def remove_field_files(pathname, partition):

    """
    Remove the files named by FIELD_FILENAMES under 'pathname' which are
    labelled with the given 'partition'.
    """

    return remove_files(pathname, FIELD_FILENAMES, partition)
paul@14	1586
paul@8	1587	# High-level classes.
paul@8	1588
class Document:

    """
    A holder of the details of a single document: its number, its fields and
    the positions of its terms.
    """

    def __init__(self, docnum):

        "Initialise an empty document having the given 'docnum'."

        self.docnum = docnum

        # A list of (identifier, value) pairs.

        self.fields = []

        # A mapping from terms to lists of positions.

        self.terms = {}

    def add_position(self, term, position):

        "Record an occurrence of the given 'term' at the given 'position'."

        positions = self.terms.setdefault(term, [])
        positions.append(position)

    def add_field(self, identifier, value):

        """
        Add a field having the given 'identifier' and 'value', the value being
        converted to a Unicode string.
        """

        pair = (identifier, unicode(value)) # convert to string
        self.fields.append(pair)

    def set_fields(self, fields):

        """
        Replace the document's fields with the given 'fields': a list of tuples
        each containing an integer identifier and a string value.
        """

        self.fields = fields
paul@28	1621
class IndexWriter:

    """
    An accumulator of document details which are periodically written out as
    new term and field dictionary partitions.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Initialise the writer for the index at 'pathname' with the given term
        indexing 'interval', document position indexing 'doc_interval', and the
        'flush_interval' (the number of documents accumulated before a flush, a
        false value disabling automatic flushing).
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # The labels of the next partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Accumulated details: terms map to {docnum: positions} and document
        # numbers map to fields.

        self.terms = {}
        self.docs = {}

        # The number of documents added since the last automatic flush.

        self.doc_counter = 0

    def add_document(self, doc):

        """
        Record the terms and fields of the given document 'doc', flushing the
        accumulated terms and fields once enough documents have been added.
        """

        docnum = doc.docnum

        for term, positions in doc.terms.items():
            self.terms.setdefault(term, {})[docnum] = positions

        self.docs[docnum] = doc.fields

        self.doc_counter += 1

        # Do nothing more unless flushing is enabled and due.

        if not self.flush_interval or self.doc_counter < self.flush_interval:
            return

        self.flush_terms()
        self.flush_fields()
        self.doc_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        """
        Write the accumulated terms in sorted order to the current term
        dictionary partition, then start a new partition.
        """

        accumulated = self.terms
        ordered_terms = sorted(accumulated.keys())

        writer = self.get_term_writer()

        for term in ordered_terms:
            writer.write_term_positions(term, accumulated[term].items())

        writer.close()

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        """
        Write the accumulated fields in document number order to the current
        field dictionary partition, then start a new partition.
        """

        ordered_docs = sorted(self.docs.items())

        writer = self.get_field_writer()

        for docnum, fields in ordered_docs:
            writer.write_fields(docnum, fields)

        writer.close()

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any terms and fields which remain accumulated."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
paul@10	1717
class IndexReader:

    """
    A single point of access to the term and field dictionaries found in the
    "merged" partition of an index.
    """

    def __init__(self, pathname):

        "Open the merged term and field dictionaries found under 'pathname'."

        merged = "merged"
        self.dict_reader = get_term_reader(pathname, merged)
        self.field_dict_reader = get_field_reader(pathname, merged)

    def find_terms(self, term):

        "Return the term dictionary reader's result for finding 'term' as a prefix."

        terms = self.dict_reader.find_terms(term)
        return terms

    def find_positions(self, term):

        "Return the term dictionary reader's positions for the given 'term'."

        positions = self.dict_reader.find_positions(term)
        return positions

    def get_frequency(self, term):

        "Return the term dictionary reader's frequency for the given 'term'."

        frequency = self.dict_reader.get_frequency(term)
        return frequency

    def get_document_frequency(self, term):

        "Return the term dictionary reader's document frequency for 'term'."

        doc_frequency = self.dict_reader.get_document_frequency(term)
        return doc_frequency

    def get_fields(self, docnum):

        "Return the field dictionary reader's fields for the given 'docnum'."

        fields = self.field_dict_reader.get_fields(docnum)
        return fields

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        self.dict_reader.close()
        self.field_dict_reader.close()
paul@10	1744
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index residing at the given 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'. The index directory is created if
        it does not already exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, merging all partitions beforehand. The
        'partition' argument is accepted but has no effect: the reader always
        accesses the merged partition.
        """

        # Ensure that only one partition exists.

        self.merge()
        return self._get_reader(partition)

    def _get_reader(self, partition):

        """
        Return a reader for the index, raising OSError if the index directory
        does not exist. The 'partition' argument is not used.
        """

        if not exists(self.pathname):

            # The call form of raise is valid in both Python 2 and Python 3.

            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname)
        return self.reader

    def merge(self):

        "Merge/optimise index partitions."

        self.merge_terms()
        self.merge_fields()

    def _merge_partitions(self, prefix, open_reader, open_writer, merger_class, rename_partition, remove_partition):

        """
        Combine all partitions having a file whose name starts with 'prefix'
        into a single partition labelled "merged".

        The 'open_reader' callable is invoked with the index pathname and a
        partition label; 'open_writer' is invoked without arguments to obtain
        the writer for the merged partition; 'merger_class' is instantiated with
        the writer and readers; 'rename_partition' and 'remove_partition'
        rename and remove the files of a partition respectively.
        """

        # Discover the partitions without opening anything: where there is
        # nothing to merge, no reader is needed and none is left open.

        partitions = []

        for filename in listdir(self.pathname):
            if filename.startswith(prefix):
                partitions.append(filename[len(prefix):])

        if len(partitions) > 1:

            # Move any existing merged partition out of the way of the new one
            # before readers are created, so that every reader is given the
            # name its files actually have. NOTE(review): the position openers
            # are given pathnames, not open files, so a reader created before
            # the rename would presumably refer to the new output files.

            if "merged" in partitions:
                rename_partition(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                if "old-merged" not in partitions:
                    partitions.append("old-merged")

            readers = [open_reader(self.pathname, partition) for partition in partitions]

            # Write directly to a dictionary.

            merger = merger_class(open_writer(), readers)
            merger.merge()
            merger.close()

            # Remove old files.

            for partition in partitions:
                remove_partition(self.pathname, partition)

        elif len(partitions) == 1:
            partition = partitions[0]
            if partition != "merged":
                rename_partition(self.pathname, partition, "merged")

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval'.
        """

        def open_writer():
            return get_term_writer(self.pathname, "merged", interval, doc_interval)

        self._merge_partitions("terms-", get_term_reader, open_writer,
            TermDictionaryMerger, rename_term_files, remove_term_files)

    def merge_fields(self, interval=FIELD_INTERVAL):

        "Merge field dictionaries using the given indexing 'interval'."

        def open_writer():
            return get_field_writer(self.pathname, "merged", interval)

        self._merge_partitions("fields-", get_field_reader, open_writer,
            FieldDictionaryMerger, rename_field_files, remove_field_files)

    def close(self):

        "Close and discard any reader and writer obtained from this index."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	1875
paul@0	1876	# vim: tabstop=4 expandtab shiftwidth=4