1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/iixr/terms.py	Tue Sep 15 00:15:11 2009 +0200
     1.3 @@ -0,0 +1,395 @@
     1.4 +#!/usr/bin/env python
     1.5 +
     1.6 +"""
     1.7 +Specific classes for storing term information.
     1.8 +
     1.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
    1.10 +
    1.11 +This program is free software; you can redistribute it and/or modify it under
    1.12 +the terms of the GNU General Public License as published by the Free Software
    1.13 +Foundation; either version 3 of the License, or (at your option) any later
    1.14 +version.
    1.15 +
    1.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
    1.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    1.18 +PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    1.19 +
    1.20 +You should have received a copy of the GNU General Public License along
    1.21 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    1.22 +"""
    1.23 +
    1.24 +from iixr.files import *
    1.25 +from os.path import commonprefix # to find common string prefixes
    1.26 +from bisect import bisect_right  # to find terms in the dictionary index
    1.27 +
    1.28 +class TermWriter(FileWriter):
    1.29 +
    1.30 +    "Writing term information to files."
    1.31 +
    1.32 +    def reset(self):
    1.33 +        self.last_term = ""
    1.34 +        self.last_offset = 0
    1.35 +
    1.36 +    def write_term(self, term, offset, frequency, doc_frequency):
    1.37 +
    1.38 +        """
    1.39 +        Write the given 'term', its position file 'offset', its 'frequency' and
    1.40 +        its 'doc_frequency' (number of documents in which it appears) to the
    1.41 +        term information file. Return the offset after the term information was
    1.42 +        written to the file.
    1.43 +        """
    1.44 +
    1.45 +        # Write the prefix length and term suffix.
    1.46 +
    1.47 +        common = len(commonprefix([self.last_term, term]))
    1.48 +        suffix = term[common:]
    1.49 +
    1.50 +        self.write_number(common)
    1.51 +        self.write_string(suffix)
    1.52 +
    1.53 +        # Write the offset delta.
    1.54 +
    1.55 +        self.write_number(offset - self.last_offset)
    1.56 +
    1.57 +        # Write the frequency.
    1.58 +
    1.59 +        self.write_number(frequency)
    1.60 +
    1.61 +        # Write the document frequency.
    1.62 +
    1.63 +        self.write_number(doc_frequency)
    1.64 +
    1.65 +        self.last_term = term
    1.66 +        self.last_offset = offset
    1.67 +
    1.68 +        return self.tell()
    1.69 +
    1.70 +class TermReader(FileReader):
    1.71 +
    1.72 +    "Reading term information from files."
    1.73 +
    1.74 +    def reset(self):
    1.75 +        self.last_term = ""
    1.76 +        self.last_offset = 0
    1.77 +
    1.78 +    def read_term(self):
    1.79 +
    1.80 +        """
    1.81 +        Read a term, its position file offset, its frequency and its document
    1.82 +        frequency from the term information file.
    1.83 +        """
    1.84 +
    1.85 +        # Read the prefix length and term suffix.
    1.86 +
    1.87 +        common = self.read_number()
    1.88 +        suffix = self.read_string()
    1.89 +
    1.90 +        self.last_term = self.last_term[:common] + suffix
    1.91 +
    1.92 +        # Read the offset delta.
    1.93 +
    1.94 +        self.last_offset += self.read_number()
    1.95 +
    1.96 +        # Read the frequency.
    1.97 +
    1.98 +        frequency = self.read_number()
    1.99 +
   1.100 +        # Read the document frequency.
   1.101 +
   1.102 +        doc_frequency = self.read_number()
   1.103 +
   1.104 +        return self.last_term, self.last_offset, frequency, doc_frequency
   1.105 +
   1.106 +    def go_to_term(self, term, offset, info_offset):
   1.107 +
   1.108 +        """
   1.109 +        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
   1.110 +        permits the scanning for later terms from the specified term.
   1.111 +        """
   1.112 +
   1.113 +        self.seek(info_offset)
   1.114 +        self.last_term = term
   1.115 +        self.last_offset = offset
   1.116 +
   1.117 +class TermIndexWriter(TermWriter):
   1.118 +
   1.119 +    "Writing term dictionary index details to files."
   1.120 +
   1.121 +    def reset(self):
   1.122 +        TermWriter.reset(self)
   1.123 +        self.last_info_offset = 0
   1.124 +
   1.125 +    def write_term(self, term, offset, frequency, doc_frequency, info_offset):
   1.126 +
   1.127 +        """
   1.128 +        Write the given 'term', its position file 'offset', its 'frequency' and
   1.129 +        its 'doc_frequency' to the term dictionary index file, along with the
   1.130 +        'info_offset' in the term information file.
   1.131 +        """
   1.132 +
   1.133 +        TermWriter.write_term(self, term, offset, frequency, doc_frequency)
   1.134 +
   1.135 +        # Write the information file offset delta.
   1.136 +
   1.137 +        self.write_number(info_offset - self.last_info_offset)
   1.138 +        self.last_info_offset = info_offset
   1.139 +
   1.140 +class TermIndexReader(TermReader):
   1.141 +
   1.142 +    "Reading term dictionary index details from files."
   1.143 +
   1.144 +    def reset(self):
   1.145 +        TermReader.reset(self)
   1.146 +        self.last_info_offset = 0
   1.147 +
   1.148 +    def read_term(self):
   1.149 +
   1.150 +        """
   1.151 +        Read a term, its position file offset, its frequency, its document
   1.152 +        frequency and a term information file offset from the term dictionary
   1.153 +        index file.
   1.154 +        """
   1.155 +
   1.156 +        term, offset, frequency, doc_frequency = TermReader.read_term(self)
   1.157 +
   1.158 +        # Read the offset delta.
   1.159 +
   1.160 +        self.last_info_offset += self.read_number()
   1.161 +
   1.162 +        return term, offset, frequency, doc_frequency, self.last_info_offset
   1.163 +
   1.164 +class TermDictionaryWriter:
   1.165 +
   1.166 +    "Writing term dictionaries."
   1.167 +
   1.168 +    def __init__(self, info_writer, index_writer, position_dict_writer, interval):
   1.169 +        self.info_writer = info_writer
   1.170 +        self.index_writer = index_writer
   1.171 +        self.position_dict_writer = position_dict_writer
   1.172 +        self.interval = interval
   1.173 +        self.entry = 0
   1.174 +
   1.175 +    def _write_term(self, term, offset, frequency, doc_frequency):
   1.176 +
   1.177 +        """
   1.178 +        Write the given 'term', its position file 'offset', its 'frequency' and
   1.179 +        its 'doc_frequency' (number of documents in which it appears) to the
   1.180 +        term information file. Return the offset after the term information was
   1.181 +        written to the file.
   1.182 +        """
   1.183 +
   1.184 +        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)
   1.185 +
   1.186 +        if self.entry % self.interval == 0:
   1.187 +            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
   1.188 +
   1.189 +        self.entry += 1
   1.190 +
   1.191 +    def write_term_positions(self, term, doc_positions):
   1.192 +
   1.193 +        """
   1.194 +        Write the given 'term' and the 'doc_positions' recording the documents
   1.195 +        and positions at which the term is found.
   1.196 +        """
   1.197 +
   1.198 +        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
   1.199 +        self._write_term(term, offset, frequency, doc_frequency)
   1.200 +
   1.201 +    def close(self):
   1.202 +        self.info_writer.close()
   1.203 +        self.index_writer.close()
   1.204 +        self.position_dict_writer.close()
   1.205 +
   1.206 +class TermDictionaryReader:
   1.207 +
   1.208 +    "Reading term dictionaries."
   1.209 +
   1.210 +    def __init__(self, info_reader, index_reader, position_dict_reader):
   1.211 +        self.info_reader = info_reader
   1.212 +        self.index_reader = index_reader
   1.213 +        self.position_dict_reader = position_dict_reader
   1.214 +
   1.215 +        self.terms = []
   1.216 +        try:
   1.217 +            while 1:
   1.218 +                self.terms.append(self.index_reader.read_term())
   1.219 +        except EOFError:
   1.220 +            pass
   1.221 +
   1.222 +        # Large numbers for ordering purposes.
   1.223 +
   1.224 +        if self.terms:
   1.225 +            self.max_offset = self.terms[-1][1] + 1
   1.226 +        else:
   1.227 +            self.max_offset = None
   1.228 +
   1.229 +    def _find_closest_entry(self, term):
   1.230 +
   1.231 +        """
   1.232 +        Find the offsets and frequencies of 'term' from the term dictionary or
   1.233 +        the closest term starting with the value of 'term'.
   1.234 +
   1.235 +        Return the closest index entry consisting of a term, the position file
   1.236 +        offset, the term frequency, the document frequency, and the term details
   1.237 +        file offset.
   1.238 +        """
   1.239 +
   1.240 +        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
   1.241 +
   1.242 +        # Get the entry position providing the term or one preceding it.
   1.243 +        # If no entry precedes the requested term, return the very first entry
   1.244 +        # as the closest.
   1.245 +
   1.246 +        if i == -1:
   1.247 +            return self.terms[0]
   1.248 +        else:
   1.249 +            return self.terms[i]
   1.250 +
   1.251 +    def _find_closest_term(self, term):
   1.252 +
   1.253 +        """
   1.254 +        Find the offsets and frequencies of 'term' from the term dictionary or
   1.255 +        the closest term starting with the value of 'term'.
   1.256 +
   1.257 +        Return the closest term (or the term itself), the position file offset,
   1.258 +        the term frequency, the document frequency, and the term details file
   1.259 +        offset (or None if the reader is already positioned).
   1.260 +        """
   1.261 +
   1.262 +        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
   1.263 +
   1.264 +        # Where the term is found immediately, return the offset and
   1.265 +        # frequencies. If the term does not appear, return the details of the
   1.266 +        # closest entry.
   1.267 +
   1.268 +        if term <= found_term:
   1.269 +            return found_term, offset, frequency, doc_frequency, info_offset
   1.270 +
   1.271 +        # Otherwise, seek past the index term's entry in the information file
   1.272 +        # and scan for the desired term.
   1.273 +
   1.274 +        else:
   1.275 +            self.info_reader.go_to_term(found_term, offset, info_offset)
   1.276 +            try:
   1.277 +                while term > found_term:
   1.278 +                    found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
   1.279 +            except EOFError:
   1.280 +                pass
   1.281 +
   1.282 +            return found_term, offset, frequency, doc_frequency, None
   1.283 +
   1.284 +    def _find_term(self, term):
   1.285 +
   1.286 +        """
   1.287 +        Find the position file offset and frequency of 'term' from the term
   1.288 +        dictionary.
   1.289 +        """
   1.290 +
   1.291 +        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
   1.292 +
   1.293 +        # If the term is found, return the offset and frequencies.
   1.294 +
   1.295 +        if term == found_term:
   1.296 +            return offset, frequency, doc_frequency
   1.297 +        else:
   1.298 +            return None
   1.299 +
   1.300 +    def _get_positions(self, offset, doc_frequency):
   1.301 +        return self.position_dict_reader.read_term_positions(offset, doc_frequency)
   1.302 +
   1.303 +    # Iterator convenience methods.
   1.304 +
   1.305 +    def __iter__(self):
   1.306 +        self.rewind()
   1.307 +        return self
   1.308 +
   1.309 +    def next(self):
   1.310 +        try:
   1.311 +            return self.read_term()
   1.312 +        except EOFError:
   1.313 +            raise StopIteration
   1.314 +
   1.315 +    # Sequential access methods.
   1.316 +
   1.317 +    def rewind(self):
   1.318 +        self.info_reader.rewind()
   1.319 +
   1.320 +    def read_term(self):
   1.321 +
   1.322 +        """
   1.323 +        Return the next term, its frequency, its document frequency, and the
   1.324 +        documents and positions at which the term is found.
   1.325 +        """
   1.326 +
   1.327 +        term, offset, frequency, doc_frequency = self.info_reader.read_term()
   1.328 +        positions = self._get_positions(offset, doc_frequency)
   1.329 +        return term, frequency, doc_frequency, positions
   1.330 +
   1.331 +    # Query methods.
   1.332 +
   1.333 +    def find_terms(self, term):
   1.334 +
   1.335 +        "Return all terms whose values start with the value of 'term'."
   1.336 +
   1.337 +        terms = []
   1.338 +
   1.339 +        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
   1.340 +
   1.341 +        # Position the reader, if necessary.
   1.342 +
   1.343 +        if info_offset is not None:
   1.344 +            self.info_reader.go_to_term(found_term, offset, info_offset)
   1.345 +
   1.346 +        # Read and record terms.
   1.347 +
   1.348 +        try:
   1.349 +            # Add the found term if it starts with the specified term.
   1.350 +
   1.351 +            while found_term.startswith(term):
   1.352 +                terms.append(found_term)
   1.353 +                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
   1.354 +
   1.355 +        except EOFError:
   1.356 +            pass
   1.357 +
   1.358 +        return terms
   1.359 +
   1.360 +    def find_positions(self, term):
   1.361 +
   1.362 +        "Return the documents and positions at which the given 'term' is found."
   1.363 +
   1.364 +        t = self._find_term(term)
   1.365 +        if t is None:
   1.366 +            return None
   1.367 +        else:
   1.368 +            offset, frequency, doc_frequency = t
   1.369 +            return self._get_positions(offset, doc_frequency)
   1.370 +
   1.371 +    def get_frequency(self, term):
   1.372 +
   1.373 +        "Return the frequency of the given 'term'."
   1.374 +
   1.375 +        t = self._find_term(term)
   1.376 +        if t is None:
   1.377 +            return None
   1.378 +        else:
   1.379 +            offset, frequency, doc_frequency = t
   1.380 +            return frequency
   1.381 +
   1.382 +    def get_document_frequency(self, term):
   1.383 +
   1.384 +        "Return the document frequency of the given 'term'."
   1.385 +
   1.386 +        t = self._find_term(term)
   1.387 +        if t is None:
   1.388 +            return None
   1.389 +        else:
   1.390 +            offset, frequency, doc_frequency = t
   1.391 +            return doc_frequency
   1.392 +
   1.393 +    def close(self):
   1.394 +        self.info_reader.close()
   1.395 +        self.index_reader.close()
   1.396 +        self.position_dict_reader.close()
   1.397 +
   1.398 +# vim: tabstop=4 expandtab shiftwidth=4