paul@44 | 1 | #!/usr/bin/env python |
paul@44 | 2 | |
paul@44 | 3 | """ |
paul@44 | 4 | Specific classes for storing term information. |
paul@44 | 5 | |
paul@89 | 6 | Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk> |
paul@44 | 7 | |
paul@44 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@44 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@44 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@44 | 11 | version. |
paul@44 | 12 | |
paul@44 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT ANY |
paul@44 | 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A |
paul@44 | 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. |
paul@44 | 16 | |
paul@44 | 17 | You should have received a copy of the GNU General Public License along |
paul@44 | 18 | with this program. If not, see <http://www.gnu.org/licenses/>. |
paul@44 | 19 | """ |
paul@44 | 20 | |
paul@44 | 21 | from iixr.files import * |
paul@47 | 22 | from iixr.positions import * |
paul@60 | 23 | from iixr.phrases import PhraseIterator |
paul@44 | 24 | from os.path import commonprefix # to find common string prefixes |
paul@44 | 25 | from bisect import bisect_right # to find terms in the dictionary index |
paul@44 | 26 | |
paul@44 | 27 | class TermWriter(FileWriter): |
paul@44 | 28 | |
paul@44 | 29 | "Writing term information to files." |
paul@44 | 30 | |
paul@44 | 31 | def reset(self): |
paul@90 | 32 | self.end_record() |
paul@44 | 33 | self.last_term = "" |
paul@44 | 34 | self.last_offset = 0 |
paul@44 | 35 | |
paul@44 | 36 | def write_term(self, term, offset, frequency, doc_frequency): |
paul@44 | 37 | |
paul@44 | 38 | """ |
paul@44 | 39 | Write the given 'term', its position file 'offset', its 'frequency' and |
paul@44 | 40 | its 'doc_frequency' (number of documents in which it appears) to the |
paul@55 | 41 | term information file. |
paul@44 | 42 | """ |
paul@44 | 43 | |
paul@75 | 44 | if term <= self.last_term: |
paul@75 | 45 | raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) |
paul@75 | 46 | |
paul@44 | 47 | # Write the prefix length and term suffix. |
paul@44 | 48 | |
paul@44 | 49 | common = len(commonprefix([self.last_term, term])) |
paul@44 | 50 | suffix = term[common:] |
paul@44 | 51 | |
paul@44 | 52 | self.write_number(common) |
paul@44 | 53 | self.write_string(suffix) |
paul@44 | 54 | |
paul@44 | 55 | # Write the offset delta. |
paul@44 | 56 | # Write the frequency. |
paul@44 | 57 | # Write the document frequency. |
paul@44 | 58 | |
paul@86 | 59 | self.write_numbers(( |
paul@86 | 60 | offset - self.last_offset, |
paul@86 | 61 | frequency, |
paul@86 | 62 | doc_frequency |
paul@86 | 63 | )) |
paul@44 | 64 | |
paul@44 | 65 | self.last_term = term |
paul@44 | 66 | self.last_offset = offset |
paul@44 | 67 | |
paul@44 | 68 | class TermReader(FileReader): |
paul@44 | 69 | |
paul@44 | 70 | "Reading term information from files." |
paul@44 | 71 | |
paul@44 | 72 | def reset(self): |
paul@44 | 73 | self.last_term = "" |
paul@44 | 74 | self.last_offset = 0 |
paul@90 | 75 | self.begin_record() |
paul@44 | 76 | |
paul@44 | 77 | def read_term(self): |
paul@44 | 78 | |
paul@44 | 79 | """ |
paul@44 | 80 | Read a term, its position file offset, its frequency and its document |
paul@44 | 81 | frequency from the term information file. |
paul@44 | 82 | """ |
paul@44 | 83 | |
paul@44 | 84 | # Read the prefix length and term suffix. |
paul@44 | 85 | |
paul@44 | 86 | common = self.read_number() |
paul@44 | 87 | suffix = self.read_string() |
paul@44 | 88 | |
paul@44 | 89 | self.last_term = self.last_term[:common] + suffix |
paul@44 | 90 | |
paul@44 | 91 | # Read the offset delta. |
paul@44 | 92 | |
paul@44 | 93 | self.last_offset += self.read_number() |
paul@44 | 94 | |
paul@44 | 95 | # Read the frequency. |
paul@44 | 96 | |
paul@44 | 97 | frequency = self.read_number() |
paul@44 | 98 | |
paul@44 | 99 | # Read the document frequency. |
paul@44 | 100 | |
paul@44 | 101 | doc_frequency = self.read_number() |
paul@44 | 102 | |
paul@44 | 103 | return self.last_term, self.last_offset, frequency, doc_frequency |
paul@44 | 104 | |
paul@44 | 105 | def go_to_term(self, term, offset, info_offset): |
paul@44 | 106 | |
paul@44 | 107 | """ |
paul@44 | 108 | Seek past the entry for 'term' having 'offset' to 'info_offset'. This |
paul@44 | 109 | permits the scanning for later terms from the specified term. |
paul@44 | 110 | """ |
paul@44 | 111 | |
paul@69 | 112 | self.seek(info_offset) |
paul@44 | 113 | self.last_term = term |
paul@44 | 114 | self.last_offset = offset |
paul@44 | 115 | |
paul@44 | 116 | class TermIndexWriter(TermWriter): |
paul@44 | 117 | |
paul@44 | 118 | "Writing term dictionary index details to files." |
paul@44 | 119 | |
paul@44 | 120 | def reset(self): |
paul@44 | 121 | TermWriter.reset(self) |
paul@44 | 122 | self.last_info_offset = 0 |
paul@44 | 123 | |
paul@44 | 124 | def write_term(self, term, offset, frequency, doc_frequency, info_offset): |
paul@44 | 125 | |
paul@44 | 126 | """ |
paul@44 | 127 | Write the given 'term', its position file 'offset', its 'frequency' and |
paul@44 | 128 | its 'doc_frequency' to the term dictionary index file, along with the |
paul@44 | 129 | 'info_offset' in the term information file. |
paul@44 | 130 | """ |
paul@44 | 131 | |
paul@90 | 132 | TermWriter.write_term(self, term, offset, frequency, doc_frequency) |
paul@44 | 133 | |
paul@44 | 134 | # Write the information file offset delta. |
paul@44 | 135 | |
paul@44 | 136 | self.write_number(info_offset - self.last_info_offset) |
paul@89 | 137 | |
paul@44 | 138 | self.last_info_offset = info_offset |
paul@44 | 139 | |
paul@44 | 140 | class TermIndexReader(TermReader): |
paul@44 | 141 | |
paul@44 | 142 | "Reading term dictionary index details from files." |
paul@44 | 143 | |
paul@44 | 144 | def reset(self): |
paul@44 | 145 | TermReader.reset(self) |
paul@44 | 146 | self.last_info_offset = 0 |
paul@44 | 147 | |
paul@44 | 148 | def read_term(self): |
paul@44 | 149 | |
paul@44 | 150 | """ |
paul@44 | 151 | Read a term, its position file offset, its frequency, its document |
paul@44 | 152 | frequency and a term information file offset from the term dictionary |
paul@44 | 153 | index file. |
paul@44 | 154 | """ |
paul@44 | 155 | |
paul@90 | 156 | term, offset, frequency, doc_frequency = TermReader.read_term(self) |
paul@44 | 157 | |
paul@44 | 158 | # Read the offset delta. |
paul@44 | 159 | |
paul@44 | 160 | self.last_info_offset += self.read_number() |
paul@44 | 161 | |
paul@44 | 162 | return term, offset, frequency, doc_frequency, self.last_info_offset |
paul@44 | 163 | |
paul@44 | 164 | class TermDictionaryWriter: |
paul@44 | 165 | |
paul@44 | 166 | "Writing term dictionaries." |
paul@44 | 167 | |
paul@44 | 168 | def __init__(self, info_writer, index_writer, position_dict_writer, interval): |
paul@44 | 169 | self.info_writer = info_writer |
paul@44 | 170 | self.index_writer = index_writer |
paul@44 | 171 | self.position_dict_writer = position_dict_writer |
paul@44 | 172 | self.interval = interval |
paul@44 | 173 | self.entry = 0 |
paul@44 | 174 | |
paul@91 | 175 | self.index_writer.reset() |
paul@91 | 176 | |
paul@44 | 177 | def _write_term(self, term, offset, frequency, doc_frequency): |
paul@44 | 178 | |
paul@44 | 179 | """ |
paul@44 | 180 | Write the given 'term', its position file 'offset', its 'frequency' and |
paul@44 | 181 | its 'doc_frequency' (number of documents in which it appears) to the |
paul@90 | 182 | term information file. Return the offset before the term information was |
paul@44 | 183 | written to the file. |
paul@44 | 184 | """ |
paul@44 | 185 | |
paul@44 | 186 | if self.entry % self.interval == 0: |
paul@90 | 187 | self.info_writer.reset() |
paul@88 | 188 | info_offset = self.info_writer.tell() |
paul@44 | 189 | self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) |
paul@44 | 190 | |
paul@90 | 191 | self.info_writer.write_term(term, offset, frequency, doc_frequency) |
paul@44 | 192 | self.entry += 1 |
paul@44 | 193 | |
paul@44 | 194 | def write_term_positions(self, term, doc_positions): |
paul@44 | 195 | |
paul@44 | 196 | """ |
paul@44 | 197 | Write the given 'term' and the 'doc_positions' recording the documents |
paul@44 | 198 | and positions at which the term is found. |
paul@44 | 199 | """ |
paul@44 | 200 | |
paul@44 | 201 | offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions) |
paul@75 | 202 | |
paul@75 | 203 | if not frequency or not doc_frequency: |
paul@75 | 204 | raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions) |
paul@75 | 205 | |
paul@44 | 206 | self._write_term(term, offset, frequency, doc_frequency) |
paul@44 | 207 | |
paul@44 | 208 | def close(self): |
paul@44 | 209 | self.info_writer.close() |
paul@44 | 210 | self.index_writer.close() |
paul@44 | 211 | self.position_dict_writer.close() |
paul@44 | 212 | |
paul@44 | 213 | class TermDictionaryReader: |
paul@44 | 214 | |
paul@44 | 215 | "Reading term dictionaries." |
paul@44 | 216 | |
paul@44 | 217 | def __init__(self, info_reader, index_reader, position_dict_reader): |
paul@44 | 218 | self.info_reader = info_reader |
paul@44 | 219 | self.index_reader = index_reader |
paul@44 | 220 | self.position_dict_reader = position_dict_reader |
paul@91 | 221 | |
paul@91 | 222 | self.info_reader.reset() |
paul@91 | 223 | self.index_reader.reset() |
paul@91 | 224 | |
paul@90 | 225 | self.entry = 0 |
paul@44 | 226 | self.terms = [] |
paul@44 | 227 | try: |
paul@44 | 228 | while 1: |
paul@44 | 229 | self.terms.append(self.index_reader.read_term()) |
paul@44 | 230 | except EOFError: |
paul@44 | 231 | pass |
paul@44 | 232 | |
paul@44 | 233 | # Large numbers for ordering purposes. |
paul@44 | 234 | |
paul@44 | 235 | if self.terms: |
paul@44 | 236 | self.max_offset = self.terms[-1][1] + 1 |
paul@44 | 237 | else: |
paul@44 | 238 | self.max_offset = None |
paul@44 | 239 | |
paul@44 | 240 | def _find_closest_entry(self, term): |
paul@44 | 241 | |
paul@44 | 242 | """ |
paul@44 | 243 | Find the offsets and frequencies of 'term' from the term dictionary or |
paul@44 | 244 | the closest term starting with the value of 'term'. |
paul@44 | 245 | |
paul@44 | 246 | Return the closest index entry consisting of a term, the position file |
paul@44 | 247 | offset, the term frequency, the document frequency, and the term details |
paul@44 | 248 | file offset. |
paul@44 | 249 | """ |
paul@44 | 250 | |
paul@44 | 251 | i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1 |
paul@44 | 252 | |
paul@44 | 253 | # Get the entry position providing the term or one preceding it. |
paul@44 | 254 | # If no entry precedes the requested term, return the very first entry |
paul@44 | 255 | # as the closest. |
paul@44 | 256 | |
paul@44 | 257 | if i == -1: |
paul@90 | 258 | self.entry = 0 |
paul@44 | 259 | return self.terms[0] |
paul@44 | 260 | else: |
paul@90 | 261 | self.entry = i |
paul@44 | 262 | return self.terms[i] |
paul@44 | 263 | |
paul@44 | 264 | def _find_closest_term(self, term): |
paul@44 | 265 | |
paul@44 | 266 | """ |
paul@44 | 267 | Find the offsets and frequencies of 'term' from the term dictionary or |
paul@44 | 268 | the closest term starting with the value of 'term'. |
paul@44 | 269 | |
paul@44 | 270 | Return the closest term (or the term itself), the position file offset, |
paul@44 | 271 | the term frequency, the document frequency, and the term details file |
paul@44 | 272 | offset (or None if the reader is already positioned). |
paul@44 | 273 | """ |
paul@44 | 274 | |
paul@44 | 275 | found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term) |
paul@44 | 276 | |
paul@44 | 277 | # Where the term is found immediately, return the offset and |
paul@44 | 278 | # frequencies. If the term does not appear, return the details of the |
paul@44 | 279 | # closest entry. |
paul@44 | 280 | |
paul@44 | 281 | if term <= found_term: |
paul@44 | 282 | return found_term, offset, frequency, doc_frequency, info_offset |
paul@44 | 283 | |
paul@44 | 284 | # Otherwise, seek past the index term's entry in the information file |
paul@44 | 285 | # and scan for the desired term. |
paul@44 | 286 | |
paul@44 | 287 | else: |
paul@90 | 288 | # Reset the term and offset for the new page. |
paul@90 | 289 | self.info_reader.go_to_term("", 0, info_offset) |
paul@44 | 290 | try: |
paul@44 | 291 | while term > found_term: |
paul@90 | 292 | found_term, offset, frequency, doc_frequency = self._read_term() |
paul@44 | 293 | except EOFError: |
paul@44 | 294 | pass |
paul@44 | 295 | |
paul@44 | 296 | return found_term, offset, frequency, doc_frequency, None |
paul@44 | 297 | |
paul@44 | 298 | def _find_term(self, term): |
paul@44 | 299 | |
paul@44 | 300 | """ |
paul@44 | 301 | Find the position file offset and frequency of 'term' from the term |
paul@44 | 302 | dictionary. |
paul@44 | 303 | """ |
paul@44 | 304 | |
paul@44 | 305 | found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) |
paul@44 | 306 | |
paul@44 | 307 | # If the term is found, return the offset and frequencies. |
paul@44 | 308 | |
paul@44 | 309 | if term == found_term: |
paul@44 | 310 | return offset, frequency, doc_frequency |
paul@44 | 311 | else: |
paul@44 | 312 | return None |
paul@44 | 313 | |
paul@81 | 314 | def _get_term_and_positions(self, term, offset, frequency, doc_frequency): |
paul@81 | 315 | |
paul@81 | 316 | """ |
paul@81 | 317 | Return the term plus positions details using the given 'term', 'offset', |
paul@81 | 318 | 'frequency' and 'doc_frequency'. |
paul@81 | 319 | """ |
paul@81 | 320 | |
paul@81 | 321 | return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency) |
paul@81 | 322 | |
paul@44 | 323 | def _get_positions(self, offset, doc_frequency): |
paul@58 | 324 | |
paul@58 | 325 | """ |
paul@58 | 326 | Obtain positions from the position index 'offset' expecting a number of |
paul@58 | 327 | documents equal to the given 'doc_frequency'. |
paul@58 | 328 | """ |
paul@58 | 329 | |
paul@69 | 330 | return self.position_dict_reader.read_term_positions(offset, doc_frequency) |
paul@44 | 331 | |
paul@44 | 332 | # Iterator convenience methods. |
paul@44 | 333 | |
paul@44 | 334 | def __iter__(self): |
paul@44 | 335 | self.rewind() |
paul@44 | 336 | return self |
paul@44 | 337 | |
paul@44 | 338 | def next(self): |
paul@44 | 339 | try: |
paul@44 | 340 | return self.read_term() |
paul@44 | 341 | except EOFError: |
paul@44 | 342 | raise StopIteration |
paul@44 | 343 | |
paul@44 | 344 | # Sequential access methods. |
paul@44 | 345 | |
paul@44 | 346 | def rewind(self): |
paul@90 | 347 | self.entry = 0 |
paul@44 | 348 | self.info_reader.rewind() |
paul@44 | 349 | |
paul@44 | 350 | def read_term(self): |
paul@44 | 351 | |
paul@44 | 352 | """ |
paul@44 | 353 | Return the next term, its frequency, its document frequency, and the |
paul@44 | 354 | documents and positions at which the term is found. |
paul@44 | 355 | """ |
paul@44 | 356 | |
paul@90 | 357 | return self._get_term_and_positions(*self._read_term()) |
paul@90 | 358 | |
paul@90 | 359 | def _read_term(self): |
paul@90 | 360 | |
paul@90 | 361 | try: |
paul@90 | 362 | term, offset, frequency, doc_frequency = self.info_reader.read_term() |
paul@90 | 363 | except EOFError: |
paul@90 | 364 | self.entry += 1 |
paul@90 | 365 | try: |
paul@90 | 366 | term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry] |
paul@90 | 367 | except IndexError: |
paul@90 | 368 | raise EOFError |
paul@90 | 369 | else: |
paul@90 | 370 | # Reset the term and offset for the new page. |
paul@90 | 371 | |
paul@90 | 372 | self.info_reader.go_to_term("", 0, info_offset) |
paul@90 | 373 | |
paul@90 | 374 | # Skip the term in the information file. |
paul@90 | 375 | |
paul@90 | 376 | self.info_reader.read_term() |
paul@90 | 377 | |
paul@90 | 378 | return term, offset, frequency, doc_frequency |
paul@81 | 379 | |
paul@81 | 380 | def go_to_term(self, term): |
paul@81 | 381 | |
paul@81 | 382 | """ |
paul@81 | 383 | Navigate to 'term' in the dictionary, returning the details from its |
paul@81 | 384 | entry. The returned details can be augmented with position information |
paul@81 | 385 | when presented to the _get_term_and_positions method. |
paul@81 | 386 | """ |
paul@81 | 387 | |
paul@81 | 388 | found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) |
paul@81 | 389 | |
paul@81 | 390 | # Position the reader, if necessary. |
paul@81 | 391 | |
paul@81 | 392 | if info_offset is not None: |
paul@90 | 393 | |
paul@90 | 394 | # Reset the term and offset for the new page. |
paul@90 | 395 | |
paul@90 | 396 | self.info_reader.go_to_term("", 0, info_offset) |
paul@90 | 397 | |
paul@90 | 398 | # Skip the term in the information file. |
paul@90 | 399 | |
paul@90 | 400 | self.info_reader.read_term() |
paul@81 | 401 | |
paul@81 | 402 | return found_term, offset, frequency, doc_frequency |
paul@44 | 403 | |
paul@44 | 404 | # Query methods. |
paul@44 | 405 | |
paul@72 | 406 | def get_terms(self): |
paul@72 | 407 | |
paul@72 | 408 | "Return a list of all terms." |
paul@72 | 409 | |
paul@74 | 410 | return iter(self) |
paul@72 | 411 | |
paul@44 | 412 | def find_terms(self, term): |
paul@44 | 413 | |
paul@44 | 414 | "Return all terms whose values start with the value of 'term'." |
paul@44 | 415 | |
paul@44 | 416 | terms = [] |
paul@44 | 417 | |
paul@81 | 418 | found_term, offset, frequency, doc_frequency = self.go_to_term(term) |
paul@44 | 419 | |
paul@44 | 420 | # Read and record terms. |
paul@44 | 421 | |
paul@44 | 422 | try: |
paul@44 | 423 | # Add the found term if it starts with the specified term. |
paul@44 | 424 | |
paul@44 | 425 | while found_term.startswith(term): |
paul@44 | 426 | terms.append(found_term) |
paul@90 | 427 | found_term, offset, frequency, doc_frequency = self._read_term() |
paul@44 | 428 | |
paul@44 | 429 | except EOFError: |
paul@44 | 430 | pass |
paul@44 | 431 | |
paul@44 | 432 | return terms |
paul@44 | 433 | |
paul@44 | 434 | def find_positions(self, term): |
paul@44 | 435 | |
paul@44 | 436 | "Return the documents and positions at which the given 'term' is found." |
paul@44 | 437 | |
paul@44 | 438 | t = self._find_term(term) |
paul@44 | 439 | if t is None: |
paul@61 | 440 | return [] |
paul@44 | 441 | else: |
paul@44 | 442 | offset, frequency, doc_frequency = t |
paul@44 | 443 | return self._get_positions(offset, doc_frequency) |
paul@44 | 444 | |
paul@60 | 445 | def find_common_positions(self, terms): |
paul@60 | 446 | |
paul@60 | 447 | """ |
paul@60 | 448 | Return the documents and positions at which all the given 'terms' are |
paul@60 | 449 | found, where only common documents are returned. |
paul@60 | 450 | """ |
paul@60 | 451 | |
paul@60 | 452 | return PhraseIterator([self.find_positions(term) for term in terms]) |
paul@60 | 453 | |
paul@44 | 454 | def get_frequency(self, term): |
paul@44 | 455 | |
paul@44 | 456 | "Return the frequency of the given 'term'." |
paul@44 | 457 | |
paul@44 | 458 | t = self._find_term(term) |
paul@44 | 459 | if t is None: |
paul@44 | 460 | return None |
paul@44 | 461 | else: |
paul@44 | 462 | offset, frequency, doc_frequency = t |
paul@44 | 463 | return frequency |
paul@44 | 464 | |
paul@44 | 465 | def get_document_frequency(self, term): |
paul@44 | 466 | |
paul@44 | 467 | "Return the document frequency of the given 'term'." |
paul@44 | 468 | |
paul@44 | 469 | t = self._find_term(term) |
paul@44 | 470 | if t is None: |
paul@44 | 471 | return None |
paul@44 | 472 | else: |
paul@44 | 473 | offset, frequency, doc_frequency = t |
paul@44 | 474 | return doc_frequency |
paul@44 | 475 | |
paul@44 | 476 | def close(self): |
paul@44 | 477 | self.info_reader.close() |
paul@44 | 478 | self.index_reader.close() |
paul@44 | 479 | self.position_dict_reader.close() |
paul@44 | 480 | |
paul@44 | 481 | # vim: tabstop=4 expandtab shiftwidth=4 |