1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/iixr/fields.py Tue Sep 15 00:15:11 2009 +0200
1.3 @@ -0,0 +1,256 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Specific classes for storing document information.
1.8 +
1.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 +
1.20 +You should have received a copy of the GNU General Public License along
1.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 +"""
1.23 +
1.24 +from iixr.files import *
1.25 +from bisect import bisect_right # to find terms in the dictionary index
1.26 +
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Forget the previously written document number (delta base of zero)."

        # NOTE(review): presumably called by the FileWriter base class when the
        # file is opened - confirm in iixr.files.

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Store a record for document 'docnum' holding the given 'fields', a
        sequence of (identifier, value) pairs where each identifier is an
        integer and each value is a string.

        The record consists of the document number expressed as a delta from
        the previously written document number, the number of fields, and then
        each identifier followed by its value.

        Return the file position at which the record starts.
        """

        # Note where this record begins before anything is written.

        start = self.tell()

        # Record header: document number delta, then the field count.

        delta = docnum - self.last_docnum
        self.write_number(delta)
        self.write_number(len(fields))

        # Record body: each identifier followed by its value. The second
        # argument to write_string requests compression of the value.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1)

        # The next record's delta is relative to this document.

        self.last_docnum = docnum
        return start
1.60 +
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Forget the previously read document number (delta base of zero)."

        # NOTE(review): presumably called by the FileReader base class when the
        # file is opened or rewound - confirm in iixr.files.

        self.last_docnum = 0

    def read_fields(self):

        """
        Read the record at the current file position and return a tuple of the
        form (document number, fields) where the fields are a list of
        (identifier, value) pairs.

        The document number is obtained by adding the stored delta to the
        number of the previously read document, so the result is only
        meaningful when records are read in sequence.
        """

        # Record header: document number delta, then the field count.

        delta = self.read_number()
        self.last_docnum = self.last_docnum + delta
        nfields = self.read_number()

        # Record body: each identifier followed by its value. The argument to
        # read_string requests decompression of the value.

        fields = []

        for _ in range(nfields):
            identifier = self.read_number()
            fields.append((identifier, self.read_string(1)))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read the record stored at 'offset', which the caller asserts belongs to
        the document numbered 'docnum', returning (docnum, fields).

        Afterwards, the reader is positioned on the following record with
        'docnum' as the delta base, so that read_fields can be used to scan
        for later documents.
        """

        self.seek(offset)

        # The stored delta is relative to the preceding record in the file and
        # not to whatever was read before seeking, so the decoded document
        # number is unreliable and is discarded in favour of 'docnum'.

        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
1.108 +
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Forget the previously written entry (delta bases of zero)."

        # NOTE(review): presumably called by the FileWriter base class when the
        # file is opened - confirm in iixr.files.

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Add an index entry stating that the fields of the document numbered
        'docnum' are found at 'offset' in the fields file.

        Both values are stored as deltas from the previously written entry.
        """

        docnum_delta = docnum - self.last_docnum
        offset_delta = offset - self.last_offset

        self.write_number(docnum_delta)
        self.write_number(offset_delta)

        # The next entry's deltas are relative to this one.

        self.last_docnum, self.last_offset = docnum, offset
1.131 +
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Forget the previously read entry (delta bases of zero)."

        # NOTE(review): presumably called by the FileReader base class when the
        # file is opened or rewound - confirm in iixr.files.

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read the next index entry, returning a (document number, offset) tuple
        where the offset is a position in the fields file.

        Each stored value is a delta which is added to the corresponding value
        of the previously read entry.
        """

        # The document number is updated before the offset is read, exactly as
        # the two deltas are laid out in the file.

        docnum = self.last_docnum = self.last_docnum + self.read_number()
        offset = self.last_offset = self.last_offset + self.read_number()

        return docnum, offset
1.150 +
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with a 'field_writer' receiving every document's
        fields, a 'field_index_writer' receiving index entries, and an
        'interval' giving the number of documents between index entries.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # The number of documents written so far.

        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Index only every 'interval' documents, starting with the first one
        # written.

        if self.entry % self.interval == 0:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        """
        Close both underlying writers. The index writer is closed even if
        closing the field writer raises an exception, which is then propagated.
        """

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()
1.175 +
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with a 'field_reader' providing document fields
        and a 'field_index_reader' providing index entries. All index entries
        are read immediately and retained in memory.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Load (document number, offset) entries until the index is exhausted.

        self.docs = []
        try:
            while 1:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes: pairing a document number with
        # the last offset in the index gives a key that is never ordered before
        # the index entry for that document number.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):

        "Restart sequential reading and return this object as the iterator."

        self.rewind()
        return self

    def next(self):

        """
        Return the next (document number, fields) tuple, raising StopIteration
        when the fields file is exhausted.
        """

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the Python 3 iterator protocol in addition to the Python 2 one.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Return to the start of the fields file."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning a
        list of (identifier, value) pairs, or None if no such document is
        stored.
        """

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # Get the entry position providing the document or one preceding it.
        # Where no such entry exists, the document precedes everything indexed
        # (or the index is empty).

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary. Reaching the end of the file
        # leaves the last document read, which cannot be the one requested.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields
        else:
            return None

    def close(self):

        """
        Close both underlying readers. The index reader is closed even if
        closing the field reader raises an exception, which is then propagated.
        """

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()
1.258 +
1.259 +# vim: tabstop=4 expandtab shiftwidth=4