1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/iixr/terms.py Tue Sep 15 00:15:11 2009 +0200
1.3 @@ -0,0 +1,395 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Specific classes for storing term information.
1.8 +
1.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 +
1.20 +You should have received a copy of the GNU General Public License along
1.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 +"""
1.23 +
1.24 +from iixr.files import *
1.25 +from os.path import commonprefix # to find common string prefixes
1.26 +from bisect import bisect_right # to find terms in the dictionary index
1.27 +
1.28 +class TermWriter(FileWriter):
1.29 +
1.30 + "Writing term information to files."
1.31 +
1.32 + def reset(self):
1.33 + self.last_term = ""
1.34 + self.last_offset = 0
1.35 +
1.36 + def write_term(self, term, offset, frequency, doc_frequency):
1.37 +
1.38 + """
1.39 + Write the given 'term', its position file 'offset', its 'frequency' and
1.40 + its 'doc_frequency' (number of documents in which it appears) to the
1.41 + term information file. Return the offset after the term information was
1.42 + written to the file.
1.43 + """
1.44 +
1.45 + # Write the prefix length and term suffix.
1.46 +
1.47 + common = len(commonprefix([self.last_term, term]))
1.48 + suffix = term[common:]
1.49 +
1.50 + self.write_number(common)
1.51 + self.write_string(suffix)
1.52 +
1.53 + # Write the offset delta.
1.54 +
1.55 + self.write_number(offset - self.last_offset)
1.56 +
1.57 + # Write the frequency.
1.58 +
1.59 + self.write_number(frequency)
1.60 +
1.61 + # Write the document frequency.
1.62 +
1.63 + self.write_number(doc_frequency)
1.64 +
1.65 + self.last_term = term
1.66 + self.last_offset = offset
1.67 +
1.68 + return self.tell()
1.69 +
1.70 +class TermReader(FileReader):
1.71 +
1.72 + "Reading term information from files."
1.73 +
1.74 + def reset(self):
1.75 + self.last_term = ""
1.76 + self.last_offset = 0
1.77 +
1.78 + def read_term(self):
1.79 +
1.80 + """
1.81 + Read a term, its position file offset, its frequency and its document
1.82 + frequency from the term information file.
1.83 + """
1.84 +
1.85 + # Read the prefix length and term suffix.
1.86 +
1.87 + common = self.read_number()
1.88 + suffix = self.read_string()
1.89 +
1.90 + self.last_term = self.last_term[:common] + suffix
1.91 +
1.92 + # Read the offset delta.
1.93 +
1.94 + self.last_offset += self.read_number()
1.95 +
1.96 + # Read the frequency.
1.97 +
1.98 + frequency = self.read_number()
1.99 +
1.100 + # Read the document frequency.
1.101 +
1.102 + doc_frequency = self.read_number()
1.103 +
1.104 + return self.last_term, self.last_offset, frequency, doc_frequency
1.105 +
1.106 + def go_to_term(self, term, offset, info_offset):
1.107 +
1.108 + """
1.109 + Seek past the entry for 'term' having 'offset' to 'info_offset'. This
1.110 + permits the scanning for later terms from the specified term.
1.111 + """
1.112 +
1.113 + self.seek(info_offset)
1.114 + self.last_term = term
1.115 + self.last_offset = offset
1.116 +
1.117 +class TermIndexWriter(TermWriter):
1.118 +
1.119 + "Writing term dictionary index details to files."
1.120 +
1.121 + def reset(self):
1.122 + TermWriter.reset(self)
1.123 + self.last_info_offset = 0
1.124 +
1.125 + def write_term(self, term, offset, frequency, doc_frequency, info_offset):
1.126 +
1.127 + """
1.128 + Write the given 'term', its position file 'offset', its 'frequency' and
1.129 + its 'doc_frequency' to the term dictionary index file, along with the
1.130 + 'info_offset' in the term information file.
1.131 + """
1.132 +
1.133 + TermWriter.write_term(self, term, offset, frequency, doc_frequency)
1.134 +
1.135 + # Write the information file offset delta.
1.136 +
1.137 + self.write_number(info_offset - self.last_info_offset)
1.138 + self.last_info_offset = info_offset
1.139 +
1.140 +class TermIndexReader(TermReader):
1.141 +
1.142 + "Reading term dictionary index details from files."
1.143 +
1.144 + def reset(self):
1.145 + TermReader.reset(self)
1.146 + self.last_info_offset = 0
1.147 +
1.148 + def read_term(self):
1.149 +
1.150 + """
1.151 + Read a term, its position file offset, its frequency, its document
1.152 + frequency and a term information file offset from the term dictionary
1.153 + index file.
1.154 + """
1.155 +
1.156 + term, offset, frequency, doc_frequency = TermReader.read_term(self)
1.157 +
1.158 + # Read the offset delta.
1.159 +
1.160 + self.last_info_offset += self.read_number()
1.161 +
1.162 + return term, offset, frequency, doc_frequency, self.last_info_offset
1.163 +
1.164 +class TermDictionaryWriter:
1.165 +
1.166 + "Writing term dictionaries."
1.167 +
1.168 + def __init__(self, info_writer, index_writer, position_dict_writer, interval):
1.169 + self.info_writer = info_writer
1.170 + self.index_writer = index_writer
1.171 + self.position_dict_writer = position_dict_writer
1.172 + self.interval = interval
1.173 + self.entry = 0
1.174 +
1.175 + def _write_term(self, term, offset, frequency, doc_frequency):
1.176 +
1.177 + """
1.178 + Write the given 'term', its position file 'offset', its 'frequency' and
1.179 + its 'doc_frequency' (number of documents in which it appears) to the
1.180 + term information file. Return the offset after the term information was
1.181 + written to the file.
1.182 + """
1.183 +
1.184 + info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)
1.185 +
1.186 + if self.entry % self.interval == 0:
1.187 + self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
1.188 +
1.189 + self.entry += 1
1.190 +
1.191 + def write_term_positions(self, term, doc_positions):
1.192 +
1.193 + """
1.194 + Write the given 'term' and the 'doc_positions' recording the documents
1.195 + and positions at which the term is found.
1.196 + """
1.197 +
1.198 + offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
1.199 + self._write_term(term, offset, frequency, doc_frequency)
1.200 +
1.201 + def close(self):
1.202 + self.info_writer.close()
1.203 + self.index_writer.close()
1.204 + self.position_dict_writer.close()
1.205 +
1.206 +class TermDictionaryReader:
1.207 +
1.208 + "Reading term dictionaries."
1.209 +
1.210 + def __init__(self, info_reader, index_reader, position_dict_reader):
1.211 + self.info_reader = info_reader
1.212 + self.index_reader = index_reader
1.213 + self.position_dict_reader = position_dict_reader
1.214 +
1.215 + self.terms = []
1.216 + try:
1.217 + while 1:
1.218 + self.terms.append(self.index_reader.read_term())
1.219 + except EOFError:
1.220 + pass
1.221 +
1.222 + # Large numbers for ordering purposes.
1.223 +
1.224 + if self.terms:
1.225 + self.max_offset = self.terms[-1][1] + 1
1.226 + else:
1.227 + self.max_offset = None
1.228 +
1.229 + def _find_closest_entry(self, term):
1.230 +
1.231 + """
1.232 + Find the offsets and frequencies of 'term' from the term dictionary or
1.233 + the closest term starting with the value of 'term'.
1.234 +
1.235 + Return the closest index entry consisting of a term, the position file
1.236 + offset, the term frequency, the document frequency, and the term details
1.237 + file offset.
1.238 + """
1.239 +
1.240 + i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
1.241 +
1.242 + # Get the entry position providing the term or one preceding it.
1.243 + # If no entry precedes the requested term, return the very first entry
1.244 + # as the closest.
1.245 +
1.246 + if i == -1:
1.247 + return self.terms[0]
1.248 + else:
1.249 + return self.terms[i]
1.250 +
1.251 + def _find_closest_term(self, term):
1.252 +
1.253 + """
1.254 + Find the offsets and frequencies of 'term' from the term dictionary or
1.255 + the closest term starting with the value of 'term'.
1.256 +
1.257 + Return the closest term (or the term itself), the position file offset,
1.258 + the term frequency, the document frequency, and the term details file
1.259 + offset (or None if the reader is already positioned).
1.260 + """
1.261 +
1.262 + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
1.263 +
1.264 + # Where the term is found immediately, return the offset and
1.265 + # frequencies. If the term does not appear, return the details of the
1.266 + # closest entry.
1.267 +
1.268 + if term <= found_term:
1.269 + return found_term, offset, frequency, doc_frequency, info_offset
1.270 +
1.271 + # Otherwise, seek past the index term's entry in the information file
1.272 + # and scan for the desired term.
1.273 +
1.274 + else:
1.275 + self.info_reader.go_to_term(found_term, offset, info_offset)
1.276 + try:
1.277 + while term > found_term:
1.278 + found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.279 + except EOFError:
1.280 + pass
1.281 +
1.282 + return found_term, offset, frequency, doc_frequency, None
1.283 +
1.284 + def _find_term(self, term):
1.285 +
1.286 + """
1.287 + Find the position file offset and frequency of 'term' from the term
1.288 + dictionary.
1.289 + """
1.290 +
1.291 + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
1.292 +
1.293 + # If the term is found, return the offset and frequencies.
1.294 +
1.295 + if term == found_term:
1.296 + return offset, frequency, doc_frequency
1.297 + else:
1.298 + return None
1.299 +
1.300 + def _get_positions(self, offset, doc_frequency):
1.301 + return self.position_dict_reader.read_term_positions(offset, doc_frequency)
1.302 +
1.303 + # Iterator convenience methods.
1.304 +
1.305 + def __iter__(self):
1.306 + self.rewind()
1.307 + return self
1.308 +
1.309 + def next(self):
1.310 + try:
1.311 + return self.read_term()
1.312 + except EOFError:
1.313 + raise StopIteration
1.314 +
1.315 + # Sequential access methods.
1.316 +
1.317 + def rewind(self):
1.318 + self.info_reader.rewind()
1.319 +
1.320 + def read_term(self):
1.321 +
1.322 + """
1.323 + Return the next term, its frequency, its document frequency, and the
1.324 + documents and positions at which the term is found.
1.325 + """
1.326 +
1.327 + term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.328 + positions = self._get_positions(offset, doc_frequency)
1.329 + return term, frequency, doc_frequency, positions
1.330 +
1.331 + # Query methods.
1.332 +
1.333 + def find_terms(self, term):
1.334 +
1.335 + "Return all terms whose values start with the value of 'term'."
1.336 +
1.337 + terms = []
1.338 +
1.339 + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
1.340 +
1.341 + # Position the reader, if necessary.
1.342 +
1.343 + if info_offset is not None:
1.344 + self.info_reader.go_to_term(found_term, offset, info_offset)
1.345 +
1.346 + # Read and record terms.
1.347 +
1.348 + try:
1.349 + # Add the found term if it starts with the specified term.
1.350 +
1.351 + while found_term.startswith(term):
1.352 + terms.append(found_term)
1.353 + found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.354 +
1.355 + except EOFError:
1.356 + pass
1.357 +
1.358 + return terms
1.359 +
1.360 + def find_positions(self, term):
1.361 +
1.362 + "Return the documents and positions at which the given 'term' is found."
1.363 +
1.364 + t = self._find_term(term)
1.365 + if t is None:
1.366 + return None
1.367 + else:
1.368 + offset, frequency, doc_frequency = t
1.369 + return self._get_positions(offset, doc_frequency)
1.370 +
1.371 + def get_frequency(self, term):
1.372 +
1.373 + "Return the frequency of the given 'term'."
1.374 +
1.375 + t = self._find_term(term)
1.376 + if t is None:
1.377 + return None
1.378 + else:
1.379 + offset, frequency, doc_frequency = t
1.380 + return frequency
1.381 +
1.382 + def get_document_frequency(self, term):
1.383 +
1.384 + "Return the document frequency of the given 'term'."
1.385 +
1.386 + t = self._find_term(term)
1.387 + if t is None:
1.388 + return None
1.389 + else:
1.390 + offset, frequency, doc_frequency = t
1.391 + return doc_frequency
1.392 +
1.393 + def close(self):
1.394 + self.info_reader.close()
1.395 + self.index_reader.close()
1.396 + self.position_dict_reader.close()
1.397 +
1.398 +# vim: tabstop=4 expandtab shiftwidth=4