iixr (annotate iixr/fields.py in fc0e9882717b)

iixr

Annotated iixr/fields.py

90:fc0e9882717b

2011-02-08

Paul Boddie

Moved the record handling into reset methods in order to have records encompass entire "pages" of stored data, rather than individual entries. Changed the term dictionary index to refer to the start of each "page" of term dictionary entries rather than the second entry. This is done so that the entire "page" or record can be loaded when such a "page" is requested, although it effectively prohibits direct traversal of the term dictionary without having to refer to the term dictionary index. Introduced a test for array exhaustion when reading variable-length integers from a particular starting position.

paul@44	1	#!/usr/bin/env python
paul@44	2
paul@44	3	"""
paul@44	4	Specific classes for storing document information.
paul@44	5
paul@89	6	Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
paul@44	7
paul@44	8	This program is free software; you can redistribute it and/or modify it under
paul@44	9	the terms of the GNU General Public License as published by the Free Software
paul@44	10	Foundation; either version 3 of the License, or (at your option) any later
paul@44	11	version.
paul@44	12
paul@44	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@44	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@44	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@44	16
paul@44	17	You should have received a copy of the GNU General Public License along
paul@44	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@44	19	"""
paul@44	20
paul@89	21	from iixr.data import *
paul@44	22	from iixr.files import *
paul@44	23	from bisect import bisect_right # to find terms in the dictionary index
paul@44	24
# Number of cached documents at which FieldDictionaryReader.get_fields discards
# an existing cache entry before storing the fields of another document.
DOCUMENT_CACHE_LIMIT = 10000
paul@67	26
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        """
        End the current record and forget the previous document number, so that
        the next document number is written as given rather than relative to an
        earlier one.
        """

        self.end_record()
        self.last_docnum = self.subtractor = None

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).

        The first document number written after a reset is stored as given and
        selects the subtractor (via get_subtractor); each later number is passed
        through that subtractor together with the previously written number.
        """

        # Work out the value to store for the document number.

        if self.last_docnum is None:
            self.subtractor = get_subtractor(docnum)
            stored_docnum = docnum
        else:
            stored_docnum = self.subtractor(docnum, self.last_docnum)

        self.write_sequence_value(stored_docnum)

        # Write the field count followed by each (identifier, value) pair.

        self.write_number(len(fields))

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        # Remember this number for the next call.

        self.last_docnum = docnum
paul@89	66
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        """
        Forget the previous document number and begin reading a new record.
        """

        self.last_docnum = self.adder = None
        self.begin_record()

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.

        The first document number read after a reset is taken as stored and
        selects the adder (via get_adder); each later stored value is combined
        with the previous document number using that adder.
        """

        # Read and reconstruct the document number.

        stored_docnum = self.read_sequence_value()

        if self.last_docnum is None:
            self.adder = get_adder(stored_docnum)
            self.last_docnum = stored_docnum
        else:
            self.last_docnum = self.adder(stored_docnum, self.last_docnum)

        # Read the number of fields, then each identifier followed by its
        # (decompressed) value.

        nfields = self.read_number()
        fields = [(self.read_number(), self.read_string(1)) for _ in range(nfields)]

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.
        """

        self.seek(offset)

        # The number produced by read_fields may have been combined with
        # whatever document number was current before seeking, so it is
        # discarded in favour of the number supplied by the caller.

        _, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields
paul@44	122
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        """
        End the current record and clear the remembered document number and
        offset.
        """

        self.end_record()
        self.last_docnum = self.subtractor = None
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.

        The offset is written as the difference from the previously written
        offset (zero after a reset).
        """

        # Work out the value to store for the document number: the number
        # itself for the first entry, otherwise the subtractor's result.

        if self.last_docnum is None:
            self.subtractor = get_subtractor(docnum)
            stored_docnum = docnum
        else:
            stored_docnum = self.subtractor(docnum, self.last_docnum)

        self.write_sequence_value(stored_docnum)

        # Write the offset delta.

        offset_delta = offset - self.last_offset
        self.write_number(offset_delta)

        # Remember both values for the next entry.

        self.last_docnum, self.last_offset = docnum, offset
paul@44	158
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        """
        Clear the remembered document number and offset, then begin reading a
        new record.
        """

        self.last_docnum = self.adder = None
        self.last_offset = 0
        self.begin_record()

    def read_document(self):

        """
        Read a document number and field file offset, returning them as a
        (document number, offset) tuple.
        """

        # Read and reconstruct the document number: the first value is taken
        # as stored, later values are combined with the previous number.

        stored_docnum = self.read_sequence_value()

        if self.last_docnum is None:
            self.adder = get_adder(stored_docnum)
            self.last_docnum = stored_docnum
        else:
            self.last_docnum = self.adder(stored_docnum, self.last_docnum)

        # Offsets are stored as deltas, so accumulate them.

        self.last_offset += self.read_number()

        return self.last_docnum, self.last_offset
paul@44	188
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with a 'field_writer' for the field data, a
        'field_index_writer' for the index, and the 'interval' (in documents)
        at which index entries are written.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        # Every 'interval' documents, the field writer is reset and the
        # position at which the document's fields start is recorded in the
        # index.

        indexed = self.entry % self.interval == 0

        if indexed:
            self.field_writer.reset()
            offset = self.field_writer.tell()

        self.field_writer.write_fields(docnum, fields)

        if indexed:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close the field writer and then the field index writer."

        self.field_writer.close()
        self.field_index_writer.close()
paul@44	216
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with a 'field_reader' for the field data and a
        'field_index_reader' from which all index entries are read immediately.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Position within 'docs' of the index entry for the part of the field
        # data currently being read sequentially.

        self.entry = 0

        # Fields of recently retrieved documents, keyed by document number.

        self.cache = {}

        # Load every (document number, offset) index entry.

        self.docs = []
        try:
            while 1:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes: used as the second element of
        # the search key so that bisect_right places the key after any index
        # entry having the same document number.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):
        self.rewind()
        return self

    def next(self):

        "Return the next document number and fields, ending iteration at EOF."

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the iterator protocol of later Python versions as well.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Return to the start of the field data for sequential reading."

        self.field_reader.rewind()

        # The index position must follow the reader back to the start,
        # otherwise a later pass would stop after the first exhausted record.

        self.entry = 0

    def read_fields(self):

        """
        Return the next document number and fields. EOFError is raised when no
        further index entries remain.

        NOTE: get_fields repositions the same field reader without updating the
        sequential position kept here; presumably rewind should be called before
        resuming sequential access after random access.
        """

        try:
            return self.field_reader.read_fields()
        except EOFError:

            # The field reader reported the end of its current data: continue
            # only if the index has a further entry.

            if self.entry + 1 >= len(self.docs):
                raise EOFError

            self.entry += 1
            self.field_reader.reset()
            return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning the
        list of fields or None if the document is not found.
        """

        if docnum in self.cache:
            return self.cache[docnum]

        # Get the entry position providing the document or one preceding it.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary. Running out of data leaves
        # 'found_docnum' below 'docnum', which is reported as not found.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        if docnum != found_docnum:
            return None

        # Store the fields in the cache, removing an entry if the limit has
        # been reached. The entry removed is the first one yielded when
        # iterating over the cache.

        if len(self.cache) >= DOCUMENT_CACHE_LIMIT:
            del self.cache[next(iter(self.cache))]

        self.cache[docnum] = fields
        return fields

    def close(self):

        "Close the field reader and then the field index reader."

        self.field_reader.close()
        self.field_index_reader.close()
paul@44	324
paul@44	325	# vim: tabstop=4 expandtab shiftwidth=4