iixr (annotate iixr/fields.py in b75bd39cf61f)

iixr

Annotated iixr/fields.py

95:b75bd39cf61f

2011-02-12

Paul Boddie

Changed cache slicing to record pointer updating.

paul@44	1	#!/usr/bin/env python
paul@44	2
paul@44	3	"""
paul@44	4	Specific classes for storing document information.
paul@44	5
paul@89	6	Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
paul@44	7
paul@44	8	This program is free software; you can redistribute it and/or modify it under
paul@44	9	the terms of the GNU General Public License as published by the Free Software
paul@44	10	Foundation; either version 3 of the License, or (at your option) any later
paul@44	11	version.
paul@44	12
paul@44	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@44	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@44	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@44	16
paul@44	17	You should have received a copy of the GNU General Public License along
paul@44	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@44	19	"""
paul@44	20
paul@89	21	from iixr.data import *
paul@44	22	from iixr.files import *
paul@44	23	from bisect import bisect_right # to find terms in the dictionary index
paul@44	24
# Number of cached documents at which FieldDictionaryReader.get_fields removes an entry before adding another.
paul@67	25	DOCUMENT_CACHE_LIMIT = 10000
paul@67	26
class FieldWriter(FileWriter):

    "Writing field data to files."

    def begin(self, docnum_size):

        """
        Write the given 'docnum_size', end the record containing it, and
        remember both the size and the position at which the data starts.
        """

        self.write_number(docnum_size)
        self.end_record()
        self.docnum_size = docnum_size
        self.data_start = self.tell()

    def reset(self):

        """
        End the current record and discard the document number state, so that
        the next document number is written as given instead of as a
        difference from the previous one.
        """

        self.end_record()
        self.last_docnum = None
        self.subtractor = None

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        """

        # The first document number after a reset is written unchanged and
        # selects the subtractor; later ones are written relative to the
        # previously written document number.

        if self.last_docnum is None:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum
        else:
            docnum_seq = self.subtractor(docnum, self.last_docnum)

        self.write_sequence_value(docnum_seq, self.docnum_size)

        # The field count precedes the (identifier, value) pairs.

        self.write_number(len(fields))

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
paul@89	72
class FieldReader(FileReader):

    "Reading field data from files."

    def begin(self):

        """
        Begin a record and read the document number size from it, using zero
        if the end of the file is reached instead. The position afterwards is
        remembered as the start of the data.
        """

        self.begin_record()

        try:
            size = self.read_number()
        except EOFError:
            size = 0 # NOTE: No fields!

        self.docnum_size = size
        self.data_start = self.tell()

    def reset(self):

        """
        Discard the document number state and begin a new record, so that the
        next document number read is taken as it is stored.
        """

        self.last_docnum = None
        self.adder = None
        self.begin_record()

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        value = self.read_sequence_value(self.docnum_size)

        # The first value after a reset is the document number itself and
        # selects the adder; later values are combined with the previous
        # document number.

        if self.last_docnum is None:
            self.adder = get_adder(value)
            self.last_docnum = value
        else:
            self.last_docnum = self.adder(value, self.last_docnum)

        # The field count precedes the (identifier, value) pairs; values are
        # read with decompression enabled.

        count = self.read_number()
        fields = [(self.read_number(), self.read_string(1)) for _ in range(count)]

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.
        """

        self.seek(offset)

        # The document number produced by this read is ignored: the caller's
        # 'docnum' is recorded instead, becoming the basis for later reads.

        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
paul@44	136
class FieldIndexWriter(FieldWriter):

    "Writing field index details to files."

    def reset(self):

        "Reset the inherited writer state and start offsets again from zero."

        FieldWriter.reset(self)
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # The first document number after a reset is written unchanged and
        # selects the subtractor; later ones are written relative to the
        # previously written document number.

        if self.last_docnum is None:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum
        else:
            docnum_seq = self.subtractor(docnum, self.last_docnum)

        self.write_sequence_value(docnum_seq, self.docnum_size)

        # Offsets are written as differences from the previous offset.

        offset_delta = offset - self.last_offset
        self.write_number(offset_delta)

        self.last_docnum = docnum
        self.last_offset = offset
paul@44	170
class FieldIndexReader(FieldReader):

    "Reading field index details from files."

    def reset(self):

        "Reset the inherited reader state and start offsets again from zero."

        FieldReader.reset(self)
        self.last_offset = 0

    def read_document(self):

        "Read a document number and field file offset."

        value = self.read_sequence_value(self.docnum_size)

        # The first value after a reset is the document number itself and
        # selects the adder; later values are combined with the previous
        # document number.

        if self.last_docnum is None:
            self.adder = get_adder(value)
            self.last_docnum = value
        else:
            self.last_docnum = self.adder(value, self.last_docnum)

        # Offsets are stored as differences and accumulated here.

        offset_delta = self.read_number()
        self.last_offset += offset_delta

        return self.last_docnum, self.last_offset
paul@44	198
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' and
        'field_index_writer'. An index entry is written for every document
        whose position in the sequence of written documents is a multiple of
        'interval'.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the given 'docnum' and 'fields'."

        # The first document determines the document number size recorded by
        # both writers.

        if self.entry == 0:
            docnum_size = sizeof(docnum)
            self.field_writer.begin(docnum_size)
            self.field_index_writer.begin(docnum_size)
            self.field_index_writer.reset()

        # At each interval boundary, reset the field writer and note where
        # the document's fields start so that the index can refer to them.

        indexed = self.entry % self.interval == 0

        if indexed:
            self.field_writer.reset()
            offset = self.field_writer.tell()

        self.field_writer.write_fields(docnum, fields)

        if indexed:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        """
        Close both writers. The index writer is closed even if closing the
        field writer raises an exception, which is then propagated.
        """

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()
paul@44	232
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader'. Both readers are reset, and every (document
        number, offset) entry is read from the index reader and retained for
        use by the access methods.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        self.field_reader.reset()
        self.field_index_reader.reset()

        # Fields already retrieved by get_fields, keyed by document number.

        self.cache = {}

        # The position in 'docs' of the record being read sequentially.

        self.entry = 0
        self.docs = []
        try:
            while 1:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind and return this reader as an iterator over documents."

        self.rewind()
        return self

    def next(self):

        """
        Return the next document number and fields, raising StopIteration when
        no more documents can be read.
        """

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Provide the iterator method name used by Python 3 as well.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        """
        Rewind the field reader and restart sequential reading at the first
        index entry.
        """

        self.field_reader.rewind()

        # Without this, a second pass would resume counting from wherever the
        # previous pass stopped and would end prematurely.

        self.entry = 0

    def read_fields(self):

        """
        Return the next document number and fields, raising EOFError when
        reading has moved beyond the last index entry.
        """

        try:
            return self.field_reader.read_fields()
        except EOFError:

            # Move on to the next index entry; if there is none, there is
            # nothing more to read.

            self.entry += 1
            if self.entry >= len(self.docs):
                raise EOFError

            self.field_reader.reset()
            return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning
        None if no such document is found. Found fields are cached. Note that
        this repositions the field reader also used by the sequential access
        methods.
        """

        if docnum in self.cache:
            return self.cache[docnum]

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # Get the entry position providing the document or one preceding it.

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        if docnum != found_docnum:
            return None

        # Store the fields in the cache, removing an entry if the limit has
        # been reached. Only a single key is fetched instead of building a
        # list of all keys.

        if len(self.cache) >= DOCUMENT_CACHE_LIMIT:
            del self.cache[next(iter(self.cache))]

        self.cache[docnum] = fields
        return fields

    def close(self):

        """
        Close both readers. The index reader is closed even if closing the
        field reader raises an exception, which is then propagated.
        """

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()
paul@44	344
paul@44	345	# vim: tabstop=4 expandtab shiftwidth=4