iixr (annotate iixr.py in fe7ed6b96612)

iixr

Annotated iixr.py

9:fe7ed6b96612

2009-08-29

Paul Boddie

Added field dictionary and field index readers and writers. Renamed various internal methods. Added document number deltas to field collections in order to support scanning for documents.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	A simple (and sane) text indexing library.
paul@1	5
paul@1	6	Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
paul@1	7
paul@1	8	This program is free software; you can redistribute it and/or modify it under
paul@1	9	the terms of the GNU General Public License as published by the Free Software
paul@1	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1	11	version.
paul@1	12
paul@1	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@1	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@1	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@1	16
paul@1	17	You should have received a copy of the GNU General Public License along
paul@1	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19	"""
paul@0	20
paul@7	21	from os import mkdir # to determine whether to create indexes
paul@7	22	from os.path import exists, join
paul@2	23	from os.path import commonprefix # to find common string prefixes
paul@3	24	from bisect import bisect_right # to find terms in the dictionary index
paul@8	25	import bz2 # for field compression
paul@2	26
# Constants.

# Default indexing interval: Index.get_writer uses this as the default for its
# 'interval' parameter, which the term dictionary writer uses to decide how
# often a term is also recorded in the term index (every INTERVAL-th term).

INTERVAL = 100
paul@7	30
paul@0	31	# Foundation classes.
paul@0	32
class File:

    """
    A thin wrapper around an open file object, providing the set-up and
    tear-down behaviour shared by the reader and writer classes.
    """

    def __init__(self, f):

        "Wrap the file object 'f' and initialise any per-file state."

        self.f = f
        self.reset()

    def reset(self):

        "Reset any state kept for the file. Nothing is kept at this level."

        return None

    def close(self):

        "Close the wrapped file, doing nothing if it was already closed."

        if self.f is None:
            return

        self.f.close()
        self.f = None
paul@0	48
class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write the non-negative integer 'number' to the file using a variable
        length encoding: seven bits per byte, least significant group first,
        with the top bit of a byte set when further bytes follow.

        Raise ValueError if 'number' is negative.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Special case: one byte containing zero.

        if number == 0:
            self.f.write(chr(0))
            return

        # Write the number from least to most significant digits.

        digits = []

        while number != 0:
            lsd = number & 127
            number >>= 7

            # Flag every byte except the last one as having a successor.

            if number != 0:
                lsd |= 128
            digits.append(chr(lsd))

        self.f.write("".join(digits))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file, preceded by the length of the stored data.
        Unicode objects are encoded as UTF-8 first, and the data is compressed
        using bz2 if 'compress' is set to a true value (in which case the
        recorded length is that of the compressed data).
        """

        # Convert Unicode objects to strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        # Compress the string if requested.

        if compress:
            s = bz2.compress(s)

        # Write the length of the data before the data itself.

        self.write_number(len(s))
        self.f.write(s)
paul@2	104
class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a number stored using the variable length encoding: seven bits
        per byte, least significant group first, with the top bit of a byte
        indicating that further bytes follow.

        Raise EOFError if the file ends before the number is complete.
        """

        shift = 0
        number = 0

        while 1:
            byte = self.f.read(1)
            if not byte:
                raise EOFError

            csd = ord(byte)

            # The low seven bits carry data; the top bit is the "more" flag.

            number += (csd & 127) << shift
            shift += 7

            if not (csd & 128):
                return number

    def read_string(self, decompress=0):

        """
        Read a length-prefixed string from the file, decompressing the stored
        data using bz2 if 'decompress' is set to a true value. The data is
        decoded as UTF-8 and returned as a Unicode object.

        Raise EOFError if the file ends before the recorded number of bytes
        has been read.
        """

        length = self.read_number()
        s = self.f.read(length)

        # A short read means that the file ended inside the string data; treat
        # it in the same way as a truncated number instead of returning
        # partial data.

        if len(s) < length:
            raise EOFError

        # Decompress the data if requested.

        if decompress:
            s = bz2.decompress(s)

        # Convert strings to Unicode objects.

        return unicode(s, "utf-8")
paul@2	151
paul@9	152	# Specific classes for storing term and position information.
paul@0	153
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'. The record
        consists of the document number delta, the number of positions, and
        the deltas between the positions in ascending order.

        Raise ValueError if 'docnum' is less than the previously written
        document number. The 'positions' collection is not modified.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Write the document number delta.

        self.write_number(docnum - self.last_docnum)

        # Write the number of positions.

        self.write_number(len(positions))

        # Write the position deltas, working on a sorted copy so that the
        # caller's collection is left untouched.

        last = 0
        for position in sorted(positions):
            self.write_number(position - last)
            last = position

        self.last_docnum = docnum

    def write_all_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file, returning the offset at which they
        were stored.
        """

        # Reset the writer and record the current file offset.

        self.reset()
        offset = self.f.tell()

        # Write the number of documents.

        self.write_number(len(doc_positions))

        # Write the positions.

        for docnum, positions in doc_positions:
            self.write_positions(docnum, positions)

        return offset
paul@0	213
class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def read_positions(self):

        """
        Read one position record, returning a tuple of the document number and
        the list of absolute positions.
        """

        # The document number is stored as a delta from the previous record.

        self.last_docnum += self.read_number()

        # The position count is followed by that many position deltas, each
        # relative to the preceding position.

        positions = []
        current = 0

        for _ in range(self.read_number()):
            current += self.read_number()
            positions.append(current)

        return self.last_docnum, positions

    def read_all_positions(self, offset):

        """
        Seek to 'offset' in the file and read all position records stored
        there, returning a list of (document number, positions) tuples.
        """

        self.reset()
        self.f.seek(offset)

        # The number of documents precedes the records themselves.

        ndocuments = self.read_number()

        return [self.read_positions() for _ in range(ndocuments)]
paul@0	271
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        "Forget the last term and offset so that deltas start afresh."

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        information file. The term is stored as the length of the prefix it
        shares with the previously written term followed by the remaining
        suffix; the offset is stored as a delta from the previous offset.

        Return the file offset after the term information was written. Raise
        ValueError if 'term' is longer than 255 characters.
        """

        # Too long terms are not currently supported.

        if len(term) > 255:
            raise ValueError("Term %r is too long." % term)

        # Write the prefix length and term suffix.

        common = len(commonprefix([self.last_term, term]))

        self.write_number(common)
        self.write_string(term[common:])

        # Write the offset delta.

        self.write_number(offset - self.last_offset)

        self.last_term = term
        self.last_offset = offset

        return self.f.tell()
paul@3	309
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        "Forget the last term and offset so that deltas start afresh."

        self.last_offset = 0
        self.last_term = ""

    def read_term(self):

        """
        Read the next term record, returning a tuple of the term and its
        position file offset.
        """

        # The term is the retained prefix of the previous term plus the
        # stored suffix.

        prefix = self.last_term[:self.read_number()]
        self.last_term = prefix + self.read_string()

        # The offset is stored as a delta from the previous offset.

        self.last_offset += self.read_number()

        return self.last_term, self.last_offset

    def go_to_term(self, term, offset, info_offset):

        """
        Position the reader at 'info_offset' in the file, adopting 'term' and
        'offset' as the previously read values so that subsequent records can
        be decoded relative to them.
        """

        self.f.seek(info_offset)
        self.last_term, self.last_offset = term, offset
paul@3	347
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Reset the term state along with the last information file offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, info_offset):

        """
        Write an index record for 'term': the term and its position file
        'offset' as written by TermWriter, followed by the 'info_offset' in
        the term information file stored as a delta from the previous one.
        """

        TermWriter.write_term(self, term, offset)

        # Append the information file offset delta to the record.

        delta = info_offset - self.last_info_offset
        self.write_number(delta)

        self.last_info_offset = info_offset
paul@3	370
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Reset the term state along with the last information file offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read the next index record, returning a tuple of the term, its
        position file offset, and its term information file offset.
        """

        entry = TermReader.read_term(self)

        # The information file offset follows as a delta.

        self.last_info_offset += self.read_number()

        return entry[0], entry[1], self.last_info_offset
paul@3	393
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_writer, interval):

        """
        Initialise the writer with the given 'info_writer', 'index_writer' and
        'position_writer', recording every 'interval'-th term in the index.
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_writer = position_writer
        self.interval = interval
        self.entry = 0

    def _write_term(self, term, offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        information file and optionally to the index, making a dictionary entry.
        """

        info_offset = self.info_writer.write_term(term, offset)

        # Only every 'interval'-th entry is also recorded in the index.

        if self.entry % self.interval == 0:
            self.index_writer.write_term(term, offset, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        offset = self.position_writer.write_all_positions(doc_positions)
        self._write_term(term, offset)

    def close(self):

        """
        Close all of the underlying writers. Each writer is closed even if
        closing an earlier one raises an exception.
        """

        try:
            self.info_writer.close()
        finally:
            try:
                self.index_writer.close()
            finally:
                self.position_writer.close()
paul@3	433
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader, position_reader):

        """
        Initialise the reader with the given 'info_reader', 'index_reader' and
        'position_reader', loading all entries from the index reader until it
        signals the end of the file with EOFError. An empty index is
        permitted, in which case no term will ever be found.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_reader = position_reader

        self.terms = []
        try:
            while 1:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # Large numbers for ordering purposes: these make a search tuple for
        # a term sort at or after any index entry for the same term.

        if self.terms:
            self.max_offset = self.terms[-1][1]
            self.max_info_offset = self.terms[-1][2]
        else:
            self.max_offset = 0
            self.max_info_offset = 0

    def _find_term(self, term):

        """
        Find the position file offset of 'term' from the term dictionary,
        returning None if the term is not present.
        """

        i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1

        # Get the entry position providing the term or one preceding it. No
        # such entry means the term precedes everything in the index.

        if i == -1:
            return None

        found_term, offset, info_offset = self.terms[i]

        # Where the term is found immediately, return the offset.

        if term == found_term:
            return offset

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset = self.info_reader.read_term()
        except EOFError:
            pass

        # If the term is found, return the offset.

        if term == found_term:
            return offset

        return None

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or None if the term is not in the dictionary.
        """

        offset = self._find_term(term)
        if offset is None:
            return None

        return self.position_reader.read_all_positions(offset)

    def close(self):

        "Close all of the underlying readers."

        self.info_reader.close()
        self.index_reader.close()
        self.position_reader.close()
paul@3	505
paul@9	506	# Specific classes for storing document information.
paul@9	507
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write a record for the document 'docnum' holding the given 'fields'
        (a list of strings representing field values), returning the file
        offset at which the record starts.
        """

        start = self.f.tell()

        # The record begins with the document number delta and field count.

        delta = docnum - self.last_docnum
        self.write_number(delta)
        self.write_number(len(fields))

        # The field values follow, stored uncompressed.

        for value in fields:
            self.write_string(value, compress=0)

        self.last_docnum = docnum
        return start
paul@8	539
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read the next field record, returning a tuple of the document number
        and the list of field values.
        """

        # The document number is stored as a delta from the previous record.

        self.last_docnum += self.read_number()

        # The field count is followed by that many uncompressed strings.

        count = self.read_number()
        fields = [self.read_string(decompress=0) for _ in range(count)]

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Seek to 'offset' and read the field record found there, treating it
        as belonging to 'docnum'. The reader is left positioned after the
        record with 'docnum' as the last document number, so that later
        records can be read relative to it.
        """

        self.f.seek(offset)

        # The document number computed from the stored delta is meaningless
        # after a seek, so it is discarded in favour of the given number.

        fields = self.read_fields()[1]
        self.last_docnum = docnum

        return docnum, fields
paul@9	585
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def write_document(self, docnum, offset):

        """
        Write an index record for the document 'docnum': the document number
        as a delta from the previous record, followed by the 'offset' at which
        the document's fields are stored in the fields file.
        """

        delta = docnum - self.last_docnum

        self.write_number(delta)
        self.write_number(offset)

        self.last_docnum = docnum
paul@9	606
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def read_document(self):

        """
        Read the next index record, returning a tuple of the document number
        and the fields file offset.
        """

        # The document number is stored as a delta; the offset is stored
        # as it is.

        self.last_docnum += self.read_number()

        return self.last_docnum, self.read_number()
paul@9	624
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' and
        'field_index_writer', recording every 'interval'-th document in the
        field index.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Only every 'interval'-th document is also recorded in the index.

        if self.entry % self.interval == 0:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        """
        Close both underlying writers. The index writer is closed even if
        closing the field writer raises an exception.
        """

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()
paul@9	649
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader', loading all entries from the index reader until
        it signals the end of the file with EOFError. An empty index is
        permitted, in which case no document will ever be found.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        self.docs = []
        try:
            while 1:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes: this makes a search tuple for
        # a document sort at or after any index entry for the same document.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = 0

    def read_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning
        None if no such document is stored.
        """

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # Get the entry position providing the document or one preceding it.
        # No such entry means the document precedes everything in the index.

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields

        return None

    def close(self):

        "Close both underlying readers."

        self.field_reader.close()
        self.field_index_reader.close()
paul@8	704
paul@8	705	# High-level classes.
paul@8	706
class IndexWriter:

    "Building term information and writing it to the term dictionary."

    def __init__(self, dict_writer):

        "Initialise the writer with the given 'dict_writer'."

        self.dict_writer = dict_writer

        # Mapping of the form: term -> {docnum -> [position, ...]}

        self.terms = {}

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.
        """

        doc_positions = self.terms.setdefault(term, {})
        doc_positions.setdefault(docnum, []).append(position)

    def close(self):

        """
        Write all collected terms, in sorted order, to the dictionary writer
        and then close it. Closing an already closed writer does nothing. The
        dictionary writer is closed even if writing a term fails.
        """

        if self.dict_writer is None:
            return

        # Detach the writer first so that a repeated close is a no-op.

        dict_writer = self.dict_writer
        self.dict_writer = None

        try:

            # Write the terms in order, each with its documents in order.

            for term, doc_positions in sorted(self.terms.items()):
                dict_writer.write_term_positions(term, sorted(doc_positions.items()))

        finally:
            dict_writer.close()
paul@7	750
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index, using the directory 'pathname' for its files."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def _open_files(self, names, mode):

        """
        Open the files having the given 'names' in the index directory using
        'mode', returning the file objects in order. If any file cannot be
        opened, those already opened are closed before the error is re-raised.
        """

        opened = []
        try:
            for name in names:
                opened.append(open(join(self.pathname, name), mode))
        except Exception:
            for f in opened:
                f.close()
            raise

        return opened

    def get_writer(self, interval=INTERVAL):

        "Return a writer, optionally using the given indexing 'interval'."

        if not exists(self.pathname):
            mkdir(self.pathname)

        tdf, tdif, tpf = self._open_files(("terms", "index", "positions"), "wb")

        info_writer = TermWriter(tdf)
        index_writer = TermIndexWriter(tdif)
        positions_writer = PositionWriter(tpf)

        dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)

        self.writer = IndexWriter(dict_writer)
        return self.writer

    def get_reader(self):

        """
        Return a reader for the index. Raise OSError if the index directory
        does not exist.
        """

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        files = self._open_files(("terms", "index", "positions"), "rb")

        # The dictionary reader loads the index upon construction; release
        # the files if that fails.

        try:
            tdf, tdif, tpf = files
            self.reader = TermDictionaryReader(TermReader(tdf), TermIndexReader(tdif), PositionReader(tpf))
        except Exception:
            for f in files:
                f.close()
            raise

        return self.reader

    def close(self):

        "Close any reader and writer obtained from this index."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@6	807
paul@0	808	# vim: tabstop=4 expandtab shiftwidth=4