iixr (annotate iixr/fields.py in 4c35f0aa339c)

iixr

Annotated iixr/fields.py

88:4c35f0aa339c

2011-02-03

Paul Boddie

Changed the files to have an internal array for reading and writing data.

paul@44	1	#!/usr/bin/env python
paul@44	2
paul@44	3	"""
paul@44	4	Specific classes for storing document information.
paul@44	5
paul@69	6	Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
paul@44	7
paul@44	8	This program is free software; you can redistribute it and/or modify it under
paul@44	9	the terms of the GNU General Public License as published by the Free Software
paul@44	10	Foundation; either version 3 of the License, or (at your option) any later
paul@44	11	version.
paul@44	12
paul@44	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@44	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@44	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@44	16
paul@44	17	You should have received a copy of the GNU General Public License along
paul@44	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@44	19	"""
paul@44	20
paul@44	21	from iixr.files import *
paul@44	22	from bisect import bisect_right # to find terms in the dictionary index
paul@44	23
paul@67	24	DOCUMENT_CACHE_LIMIT = 10000
paul@67	25
class FieldWriter(FileWriter):

    """
    A writer of per-document field records.

    Each record holds the document number value size, the document number
    (emitted through 'write_sequence' relative to the previously written
    number), the number of fields, and then every field as an identifier
    followed by its string value.
    """

    def reset(self):

        "Forget the last written document number and the value size in use."

        self.last_docnum = None
        self.docnum_size = None

    def write_fields(self, docnum, fields):

        """
        Write a record for 'docnum' containing 'fields', an iterable of
        (identifier, value) pairs where each identifier is a number and each
        value is a string.
        """

        # The first document written decides the value size and the starting
        # point for subsequent document number deltas.

        if self.docnum_size is None:
            size = self.get_value_size(docnum)
            self.docnum_size = size
            self.last_docnum = self.get_initial_value(size)

        # Record header: value size, then the document number itself.

        self.write_number(self.docnum_size)
        previous = self.last_docnum
        self.last_docnum = self.write_sequence(
            docnum, previous, self.docnum_size, monotonic=0
            )

        # Record body: a field count followed by each field.

        self.write_number(len(fields))

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # the flag requests compression
paul@44	62
class FieldReader(FileReader):

    """
    A reader of per-document field records, each holding a document number
    and a collection of (identifier, value) fields.
    """

    def reset(self):

        "Forget the last document number read."

        self.last_docnum = None

    def read_fields(self):

        """
        Read the next record from the file.

        Return a tuple of the form (document number, fields) where fields is a
        list of (identifier, value) pairs in the order they were stored.
        """

        # Each record starts with the number of values per document number.

        size = self.read_number()

        # Without a previous document number, start from the initial value.

        if self.last_docnum is None:
            self.last_docnum = self.get_initial_value(size)

        # The stored document number is a delta from the previous one.

        self.last_docnum = self.read_sequence(self.last_docnum, size, monotonic=0)

        # A field count precedes the fields themselves.

        remaining = self.read_number()
        collected = []

        while remaining > 0:
            identifier = self.read_number()
            collected.append((identifier, self.read_string(1))) # decompress
            remaining -= 1

        return self.last_docnum, collected

    def read_document_fields(self, docnum, offset):

        """
        Seek to 'offset' and read the record found there, which the caller
        asserts belongs to 'docnum'. Return (docnum, fields).

        Afterwards, 'docnum' is remembered as the last document number so that
        later records can be scanned using 'read_fields'.
        """

        self.seek(offset)

        # The document number computed by read_fields is discarded here: it is
        # derived from whichever number was last read before seeking, so the
        # caller-supplied 'docnum' is used in its place.

        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
paul@44	117
class FieldIndexWriter(FileWriter):

    """
    A writer of field index entries, each associating a document number with
    the offset of that document's record in the fields file.
    """

    def reset(self):

        "Forget the last document number, its value size, and the last offset."

        self.last_docnum = None
        self.docnum_size = None
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write an index entry stating that the fields for 'docnum' are stored
        at 'offset' in the fields file.
        """

        # The first document written decides the value size and the starting
        # point for subsequent document number deltas.

        if self.docnum_size is None:
            size = self.get_value_size(docnum)
            self.docnum_size = size
            self.last_docnum = self.get_initial_value(size)

        # Entry header: value size, then the document number itself.

        self.write_number(self.docnum_size)
        previous = self.last_docnum
        self.last_docnum = self.write_sequence(
            docnum, previous, self.docnum_size, monotonic=0
            )

        # Offsets are stored as differences from the preceding entry's offset.

        delta = offset - self.last_offset
        self.write_number(delta)
        self.last_offset = offset
paul@44	150
class FieldIndexReader(FileReader):

    """
    A reader of field index entries, each associating a document number with
    an offset into the fields file.
    """

    def reset(self):

        "Forget the last document number and start offsets again from zero."

        self.last_docnum = None
        self.last_offset = 0

    def read_document(self):

        """
        Read the next index entry, returning a tuple of the form
        (document number, offset into the fields file).
        """

        # Each entry starts with the number of values per document number.

        size = self.read_number()

        # Without a previous document number, start from the initial value.

        if self.last_docnum is None:
            self.last_docnum = self.get_initial_value(size)

        # The stored document number is a delta from the previous one.

        self.last_docnum = self.read_sequence(self.last_docnum, size, monotonic=0)

        # The stored offset is a delta too: accumulate it.

        self.last_offset = self.last_offset + self.read_number()

        return self.last_docnum, self.last_offset
paul@44	179
class FieldDictionaryWriter:

    """
    Writing field dictionary details.

    Every document is written to the fields file, but only every
    'interval'-th document (starting with the first) also receives an entry
    in the index file recording where its record begins.
    """

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with a 'field_writer' receiving the records, a
        'field_index_writer' receiving the index entries, and the 'interval'
        (counted in documents) between successive index entries.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        indexed = self.entry % self.interval == 0

        # The offset must be sampled before the record is written so that the
        # index entry points at the start of the record.

        if indexed:
            offset = self.field_writer.tell()

        self.field_writer.write_fields(docnum, fields)

        if indexed:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        """
        Close both underlying writers. The index writer is closed even if
        closing the field writer raises an exception, which then propagates.
        """

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()
paul@44	206
class FieldDictionaryReader:

    """
    Reading field dictionary details.

    Documents can be read sequentially (by iterating over the reader or by
    calling 'read_fields') or retrieved individually with 'get_fields', which
    uses the in-memory index to find a starting point in the fields file and
    keeps a bounded cache of the results.
    """

    def __init__(self, field_reader, field_index_reader, cache_limit=None):

        """
        Initialise the reader with a 'field_reader' providing the records and
        a 'field_index_reader' providing (document number, offset) entries.
        The whole index is read into memory immediately.

        The optional 'cache_limit' bounds the number of documents whose fields
        are cached; if omitted, DOCUMENT_CACHE_LIMIT applies.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader
        self.cache_limit = cache_limit

        self.cache = {}
        self.docs = []

        # The index reader signals the end of the index by raising EOFError.

        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes: pairing a document number with
        # the last indexed offset yields a search key that sorts at or after
        # the index entry for that document number.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind the fields file and return this reader as an iterator."

        self.rewind()
        return self

    def next(self):

        "Return the next (document number, fields) tuple, ending at EOF."

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the Python 3 iterator protocol as well.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Rewind the underlying field reader."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning
        them as given by the field reader, or None if the document is not
        found.
        """

        if docnum in self.cache:
            return self.cache[docnum]

        # Get the entry position providing the document or one preceding it.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary. Reaching the end of the file
        # merely means that the document is absent.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        if docnum != found_docnum:
            return None

        self._store_in_cache(docnum, fields)
        return fields

    def _store_in_cache(self, docnum, fields):

        "Cache 'fields' for 'docnum', evicting an entry if the cache is full."

        limit = self.cache_limit
        if limit is None:
            limit = DOCUMENT_CACHE_LIMIT

        # Remove one existing entry when the limit has been reached. Taking
        # the first key from an iterator avoids building a list of all keys.

        if self.cache and len(self.cache) >= limit:
            del self.cache[next(iter(self.cache))]

        self.cache[docnum] = fields

    def close(self):

        """
        Close both underlying readers. The index reader is closed even if
        closing the field reader raises an exception, which then propagates.
        """

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()
paul@44	303
paul@44	304	# vim: tabstop=4 expandtab shiftwidth=4