iixr (annotate iixr.py in 3d86f5cb01c1)

iixr

Annotated iixr.py

11:3d86f5cb01c1

2009-08-29

Paul Boddie

Added term frequency information to the term dictionary.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@7	21	from os import mkdir # to determine whether to create indexes
paul@7	22	from os.path import exists, join
paul@2	23	from os.path import commonprefix # to find common string prefixes
paul@3	24	from bisect import bisect_right # to find terms in the dictionary index
paul@10	25	import bz2, zlib # for field compression
paul@2	26
# Constants.

# Default number of dictionary entries between successive index records.
INTERVAL = 100

# Compression functions in the order in which they are tried, each paired with
# the one-character flag written in front of data compressed by it.
compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress),
    ]

# Decompression functions keyed by the flag read back from the file.
decompressors = {
    "b" : bz2.decompress,
    "z" : zlib.decompress,
    }
paul@10	33
paul@0	34	# Foundation classes.
paul@0	35
class File:

    "A thin wrapper around an already opened file object."

    def __init__(self, f):

        "Remember the file object 'f' and initialise any per-file state."

        self.f = f
        self.reset()

    def reset(self):

        "Reset per-file state; this base class keeps none."

        pass

    def close(self):

        """
        Close the wrapped file and drop the reference to it. Calling this again
        afterwards does nothing.
        """

        if self.f is None:
            return

        self.f.close()
        self.f = None
paul@0	51
class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write the non-negative integer 'number' using a variable length
        encoding: seven bits per byte, least significant group first, with the
        top bit set on every byte except the last. Zero is written as a single
        zero byte. A ValueError is raised for negative numbers.
        """

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        if number == 0:
            self.f.write(chr(0))
            return

        # Collect the encoded bytes and emit them with a single write.

        encoded = []
        remaining = number

        while remaining != 0:
            group = remaining & 127
            remaining >>= 7

            # Flag the byte when more groups follow.

            if remaining != 0:
                group |= 128

            encoded.append(chr(group))

        self.f.write("".join(encoded))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file preceded by its length. Unicode objects are
        encoded as UTF-8 first. If 'compress' is true, a one-character flag is
        written before the length: the flag of the first compressor whose
        output is shorter than the data, or "-" if none helped and the data is
        stored as it is.
        """

        data = s

        if isinstance(data, unicode):
            data = data.encode("utf-8")

        if compress:
            chosen = "-"

            for flag, fn in compressors:
                candidate = fn(data)

                # Only accept a result which actually saves space.

                if len(candidate) < len(data):
                    data = candidate
                    chosen = flag
                    break

            self.f.write(chosen)

        # The length always describes the bytes that follow it.

        self.write_number(len(data))
        self.f.write(data)
paul@2	120
class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a number stored as seven-bit groups, least significant first,
        where a set top bit means that another byte follows. EOFError is raised
        if the file ends before the number is complete.
        """

        value = 0
        shift = 0

        while 1:
            raw = self.f.read(1)
            if not raw:
                raise EOFError

            group = ord(raw)
            value += (group & 127) << shift
            shift += 7

            # A clear top bit marks the final byte.

            if not group & 128:
                return value

    def read_string(self, decompress=0):

        """
        Read a length-prefixed string from the file and return it as a Unicode
        object decoded from UTF-8. If 'decompress' is true, a one-character
        flag is read before the length and, unless it is "-", selects the
        function used to decompress the stored data.
        """

        flag = "-"

        if decompress:
            flag = self.f.read(1)

        length = self.read_number()
        data = self.f.read(length)

        if flag != "-":
            data = decompressors[flag](data)

        return unicode(data, "utf-8")
paul@2	175
paul@9	176	# Specific classes for storing term and position information.
paul@0	177
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Start document number deltas again from zero."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.

        The record consists of the document number as a delta from the previous
        document written, the number of positions, and then each position as a
        delta from the one before it in ascending order. The 'positions'
        collection itself is not modified. A ValueError is raised if 'docnum'
        is less than the previously written document number.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Write the document number delta.

        self.write_number(docnum - self.last_docnum)

        # Write the number of positions.

        self.write_number(len(positions))

        # Write the position deltas, working on a sorted copy so that the
        # caller's collection is left untouched.

        last = 0
        for position in sorted(positions):
            self.write_number(position - last)
            last = position

        self.last_docnum = docnum

    def write_all_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file, returning a tuple containing the
        offset at which they were stored together with the frequency (number of
        positions) for the term involved.
        """

        # Reset the writer and record the current file offset.

        self.reset()
        offset = self.f.tell()

        # Write the number of documents.

        self.write_number(len(doc_positions))

        # Write the positions, totalling them to give the frequency.

        frequency = 0

        for docnum, positions in doc_positions:
            self.write_positions(docnum, positions)
            frequency += len(positions)

        return offset, frequency
paul@0	241
class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Start document number deltas again from zero."

        self.last_docnum = 0

    def read_positions(self):

        """
        Read one document record, returning a tuple of the document number and
        the list of positions. Both are stored as deltas: the document number
        relative to the previous record read, each position relative to the one
        before it.
        """

        self.last_docnum += self.read_number()
        count = self.read_number()

        # Accumulate the deltas into absolute positions.

        positions = []
        current = 0

        for _ in range(count):
            current += self.read_number()
            positions.append(current)

        return self.last_docnum, positions

    def read_all_positions(self, offset):

        """
        Seek to 'offset' and read the document count stored there followed by
        that many document records, returning them as a list of (document
        number, position list) tuples.
        """

        self.reset()
        self.f.seek(offset)

        count = self.read_number()
        return [self.read_positions() for _ in range(count)]
paul@0	299
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Forget the previous term and offset used for delta encoding."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency):

        """
        Write 'term', its position file 'offset' and its 'frequency' to the
        file, returning the file offset reached after the entry.

        The term is stored as the length of the prefix it shares with the
        previously written term followed by the remaining suffix; the offset is
        stored as a delta from the previous offset. A ValueError is raised for
        terms longer than 255 characters.
        """

        if len(term) > 255:
            raise ValueError("Term %r is too long." % term)

        # Prefix length and suffix.

        shared = len(commonprefix([self.last_term, term]))
        self.write_number(shared)
        self.write_string(term[shared:])

        # Offset delta, then frequency.

        self.write_number(offset - self.last_offset)
        self.write_number(frequency)

        self.last_term, self.last_offset = term, offset

        return self.f.tell()
paul@3	341
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Forget the previous term and offset used for delta decoding."

        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read the next entry, returning a tuple of the term, its position file
        offset and its frequency. The term is rebuilt from the stored prefix
        length and suffix using the previous term, and the offset from the
        stored delta using the previous offset.
        """

        shared = self.read_number()
        tail = self.read_string()
        self.last_term = self.last_term[:shared] + tail

        self.last_offset += self.read_number()
        frequency = self.read_number()

        return self.last_term, self.last_offset, frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek to 'info_offset' and treat 'term' with position file 'offset' as
        the entry most recently read, so that the entries which follow can be
        decoded relative to it.
        """

        self.f.seek(info_offset)
        self.last_term, self.last_offset = term, offset
paul@3	384
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Reset the term state and the previous information file offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, info_offset):

        """
        Write 'term', its position file 'offset' and its 'frequency' as an
        ordinary term entry, followed by 'info_offset' stored as a delta from
        the previously written information file offset.
        """

        TermWriter.write_term(self, term, offset, frequency)

        delta = info_offset - self.last_info_offset
        self.write_number(delta)
        self.last_info_offset = info_offset
paul@3	407
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Reset the term state and the previous information file offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read the next index entry, returning a tuple of the term, its position
        file offset, its frequency and its information file offset. The last of
        these is stored as a delta and accumulated here.
        """

        entry = TermReader.read_term(self)
        self.last_info_offset += self.read_number()

        return entry + (self.last_info_offset,)
paul@3	430
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_writer, interval):

        """
        Initialise the dictionary with the writers for term information, the
        term index and positions, indexing one entry in every 'interval'.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_writer = position_writer
        self.interval = interval

        # Number of terms written so far.

        self.entry = 0

    def _write_term(self, term, offset, frequency):

        """
        Write 'term' with its position file 'offset' and 'frequency' to the
        term information file. Every entry whose ordinal is a multiple of the
        interval, starting with the first, is also written to the index along
        with the information file offset reached after the entry.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency)
        indexed = self.entry % self.interval == 0

        if indexed:
            self.index_writer.write_term(term, offset, frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Store 'doc_positions' (the documents and positions at which 'term' is
        found) in the position file and then record 'term' in the dictionary.
        """

        stored = self.position_writer.write_all_positions(doc_positions)
        offset, frequency = stored
        self._write_term(term, offset, frequency)

    def close(self):

        "Close the information, index and position writers in that order."

        for writer in (self.info_writer, self.index_writer, self.position_writer):
            writer.close()
paul@3	471
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader, position_reader):

        """
        Initialise the dictionary with readers for term information, the term
        index and positions. The whole index is read into memory immediately;
        an empty index is accepted and simply yields no results.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_reader = position_reader

        # Each index entry is (term, offset, frequency, info_offset).

        self.terms = []
        try:
            while 1:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # The index terms alone, in the same order as the entries. Searching
        # this list avoids comparing a search key with the other entry fields,
        # which previously made exact matches on the last index entry sort
        # before that entry.

        self._term_keys = [entry[0] for entry in self.terms]

        # Offsets of the last index entry, retained as attributes.

        if self.terms:
            self.max_offset = self.terms[-1][1]
            self.max_info_offset = self.terms[-1][3]
        else:
            self.max_offset = 0
            self.max_info_offset = 0

    def _find_term(self, term):

        """
        Find the position file offset and frequency of 'term' from the term
        dictionary, returning them as a tuple, or None if the term is absent.
        """

        # Get the last index entry whose term is not greater than the term.

        i = bisect_right(self._term_keys, term) - 1

        # The term precedes every indexed term (or the index is empty).

        if i == -1:
            return None

        found_term, offset, frequency, info_offset = self.terms[i]

        # Where the term is found immediately, return the details.

        if term == found_term:
            return offset, frequency

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term. Running off the end of the file just
        # ends the scan.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset, frequency = self.info_reader.read_term()
        except EOFError:
            pass

        # If the term is found, return the offset and frequency.

        if term == found_term:
            return offset, frequency
        else:
            return None

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or None if the term is not in the dictionary.
        """

        t = self._find_term(term)
        if t is None:
            return None

        offset, frequency = t
        return self.position_reader.read_all_positions(offset)

    def get_frequency(self, term):

        """
        Return the frequency of the given 'term', or None if the term is not in
        the dictionary.
        """

        t = self._find_term(term)
        if t is None:
            return None

        offset, frequency = t
        return frequency

    def close(self):

        "Close the information, index and position readers."

        self.info_reader.close()
        self.index_reader.close()
        self.position_reader.close()
paul@3	558
paul@9	559	# Specific classes for storing document information.
paul@9	560
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Start document number deltas again from zero."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write a record for 'docnum' holding the list of 'fields' (strings
        representing field values) and return the file offset at which the
        record starts. The record holds the document number as a delta from the
        previous one written, the number of fields, and each field written with
        compression requested.
        """

        start = self.f.tell()

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(fields))

        for value in fields:
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
        return start
paul@8	592
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Start document number deltas again from zero."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read the next record, returning a tuple of the document number and the
        list of field values. The stored document number is a delta which is
        added to the number of the previous record read.
        """

        self.last_docnum += self.read_number()
        count = self.read_number()

        values = [self.read_string(1) for _ in range(count)] # decompress

        return self.last_docnum, values

    def read_document_fields(self, docnum, offset):

        """
        Read the record stored at 'offset', which the caller states belongs to
        'docnum', returning a tuple of 'docnum' and the list of field values.

        The number decoded from the record is discarded, since it is a delta
        applied to whatever was read before the seek; 'docnum' is installed as
        the current document number instead so that following records decode
        correctly.
        """

        self.f.seek(offset)
        values = self.read_fields()[1]
        self.last_docnum = docnum

        return docnum, values
paul@9	638
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Start document number and offset deltas again from zero."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Record that the fields for 'docnum' are stored at 'offset' in the
        fields file. Both values are written as deltas from the previous entry.
        """

        docnum_delta = docnum - self.last_docnum
        offset_delta = offset - self.last_offset

        self.write_number(docnum_delta)
        self.write_number(offset_delta)

        self.last_docnum, self.last_offset = docnum, offset
paul@9	661
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Start document number and offset deltas again from zero."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read the next entry, returning a tuple of the document number and the
        fields file offset, each accumulated from its stored delta.
        """

        docnum_delta = self.read_number()
        offset_delta = self.read_number()

        self.last_docnum += docnum_delta
        self.last_offset += offset_delta

        return self.last_docnum, self.last_offset
paul@9	680
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the dictionary with the writers for fields and the field
        index, indexing one document in every 'interval'.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # Number of documents written so far.

        self.entry = 0

    def write_fields(self, docnum, fields):

        """
        Write the 'fields' of the document 'docnum' to the fields file. Every
        document whose ordinal is a multiple of the interval, starting with the
        first, is also recorded in the index with the offset of its fields.
        """

        offset = self.field_writer.write_fields(docnum, fields)
        indexed = self.entry % self.interval == 0

        if indexed:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close the field writer and then the field index writer."

        for writer in (self.field_writer, self.field_index_writer):
            writer.close()
paul@9	705
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the dictionary with readers for fields and the field index.
        The whole index is read into memory immediately; an empty index is
        accepted and simply yields no results.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Each index entry is (docnum, offset).

        self.docs = []
        try:
            while 1:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large number for ordering purposes: pairing a document number with
        # the last offset makes a key that does not sort before the index
        # entry for that document number. Zero stands in for an empty index.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = 0

    def read_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning the
        list of field values, or None if there is no such document.
        """

        # Get the last index entry whose document number is not greater than
        # the requested number.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # The document precedes every indexed one (or the index is empty).

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary. Running off the end of the file
        # just ends the scan.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields
        else:
            return None

    def close(self):

        "Close the field reader and the field index reader."

        self.field_reader.close()
        self.field_index_reader.close()
paul@8	760
paul@8	761	# High-level classes.
paul@8	762
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, dict_writer, field_dict_writer):

        """
        Initialise the writer with the term dictionary writer 'dict_writer' and
        the field dictionary writer 'field_dict_writer'. Everything added is
        held in memory until the writer is closed.
        """

        self.dict_writer = dict_writer
        self.field_dict_writer = field_dict_writer

        # Mapping of term -> {docnum -> [position, ...]}.

        self.terms = {}

        # Mapping of docnum -> [field, ...].

        self.docs = {}

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.
        """

        self.terms.setdefault(term, {}).setdefault(docnum, []).append(position)

    def add_fields(self, docnum, fields):

        """
        Add for the document with the given 'docnum' a list of 'fields'. Fields
        added for the same document in several calls are concatenated. The
        values are copied into a list owned by the writer, so the caller's
        'fields' collection is never modified.
        """

        self.docs.setdefault(docnum, []).extend(fields)

    def close(self):

        """
        Write all terms in sorted order, each with its documents in sorted
        order, followed by all documents' fields in document number order, and
        close both dictionary writers. Calling this again afterwards does
        nothing.
        """

        if self.dict_writer is None:
            return

        # Write the terms in order.

        for term, doc_positions in sorted(self.terms.items()):
            self.dict_writer.write_term_positions(term, sorted(doc_positions.items()))

        self.dict_writer.close()
        self.dict_writer = None

        # Write the documents in order.

        for docnum, fields in sorted(self.docs.items()):
            self.field_dict_writer.write_fields(docnum, fields)

        self.field_dict_writer.close()
        self.field_dict_writer = None
paul@10	830
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, dict_reader, field_dict_reader):

        """
        Initialise the reader with the term dictionary reader 'dict_reader' and
        the field dictionary reader 'field_dict_reader'.
        """

        self.dict_reader = dict_reader
        self.field_dict_reader = field_dict_reader

    def find_positions(self, term):

        "Return what the term dictionary gives as the positions of 'term'."

        positions = self.dict_reader.find_positions(term)
        return positions

    def get_frequency(self, term):

        "Return what the term dictionary gives as the frequency of 'term'."

        frequency = self.dict_reader.get_frequency(term)
        return frequency

    def get_fields(self, docnum):

        "Return what the field dictionary gives as the fields of 'docnum'."

        fields = self.field_dict_reader.read_fields(docnum)
        return fields

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        for reader in (self.dict_reader, self.field_dict_reader):
            reader.close()
paul@10	851
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index to use the directory given by 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval'. The
        index directory is created if it does not exist, and the files "terms",
        "index", "positions", "fields" and "fields_index" inside it are opened
        for binary writing.
        """

        directory = self.pathname

        if not exists(directory):
            mkdir(directory)

        # Term dictionary components.

        info_writer = TermWriter(open(join(directory, "terms"), "wb"))
        index_writer = TermIndexWriter(open(join(directory, "index"), "wb"))
        positions_writer = PositionWriter(open(join(directory, "positions"), "wb"))

        dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)

        # Field dictionary components.

        field_writer = FieldWriter(open(join(directory, "fields"), "wb"))
        field_index_writer = FieldIndexWriter(open(join(directory, "fields_index"), "wb"))

        field_dict_writer = FieldDictionaryWriter(field_writer, field_index_writer, interval)

        self.writer = IndexWriter(dict_writer, field_dict_writer)
        return self.writer

    def get_reader(self):

        """
        Return a reader for the index, opening the files "terms", "index",
        "positions", "fields" and "fields_index" in the index directory for
        binary reading. An OSError is raised if the directory does not exist.
        """

        directory = self.pathname

        if not exists(directory):
            raise OSError("Index path %r does not exist." % self.pathname)

        # Term dictionary components.

        info_reader = TermReader(open(join(directory, "terms"), "rb"))
        index_reader = TermIndexReader(open(join(directory, "index"), "rb"))
        positions_reader = PositionReader(open(join(directory, "positions"), "rb"))

        dict_reader = TermDictionaryReader(info_reader, index_reader, positions_reader)

        # Field dictionary components.

        field_reader = FieldReader(open(join(directory, "fields"), "rb"))
        field_index_reader = FieldIndexReader(open(join(directory, "fields_index"), "rb"))

        field_dict_reader = FieldDictionaryReader(field_reader, field_index_reader)

        self.reader = IndexReader(dict_reader, field_dict_reader)
        return self.reader

    def close(self):

        """
        Close the most recently obtained reader and then the most recently
        obtained writer, where present, and forget them.
        """

        reader, self.reader = self.reader, None
        if reader is not None:
            reader.close()

        writer, self.writer = self.writer, None
        if writer is not None:
            writer.close()
paul@6	926
paul@0	927	# vim: tabstop=4 expandtab shiftwidth=4