1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing term information. 5 6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.positions import * 23 from iixr.phrases import PhraseIterator 24 from os.path import commonprefix # to find common string prefixes 25 from bisect import bisect_right # to find terms in the dictionary index 26 27 class TermWriter(FileWriter): 28 29 "Writing term information to files." 30 31 def reset(self): 32 self.last_term = "" 33 self.last_offset = 0 34 35 def write_term(self, term, offset, frequency, doc_frequency): 36 37 """ 38 Write the given 'term', its position file 'offset', its 'frequency' and 39 its 'doc_frequency' (number of documents in which it appears) to the 40 term information file. 41 """ 42 43 if term <= self.last_term: 44 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) 45 46 # Write the prefix length and term suffix. 47 48 common = len(commonprefix([self.last_term, term])) 49 suffix = term[common:] 50 51 self.write_number(common) 52 self.write_string(suffix) 53 54 # Write the offset delta. 55 56 self.write_number(offset - self.last_offset) 57 58 # Write the frequency. 59 60 self.write_number(frequency) 61 62 # Write the document frequency. 63 64 self.write_number(doc_frequency) 65 66 self.last_term = term 67 self.last_offset = offset 68 69 class TermReader(FileReader): 70 71 "Reading term information from files." 72 73 def reset(self): 74 self.last_term = "" 75 self.last_offset = 0 76 77 def read_term(self): 78 79 """ 80 Read a term, its position file offset, its frequency and its document 81 frequency from the term information file. 82 """ 83 84 # Read the prefix length and term suffix. 85 86 common = self.read_number() 87 suffix = self.read_string() 88 89 self.last_term = self.last_term[:common] + suffix 90 91 # Read the offset delta. 92 93 self.last_offset += self.read_number() 94 95 # Read the frequency. 96 97 frequency = self.read_number() 98 99 # Read the document frequency. 100 101 doc_frequency = self.read_number() 102 103 return self.last_term, self.last_offset, frequency, doc_frequency 104 105 def go_to_term(self, term, offset, info_offset): 106 107 """ 108 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 109 permits the scanning for later terms from the specified term. 110 """ 111 112 self.seek(info_offset) 113 self.last_term = term 114 self.last_offset = offset 115 116 class TermIndexWriter(TermWriter): 117 118 "Writing term dictionary index details to files." 119 120 def reset(self): 121 TermWriter.reset(self) 122 self.last_info_offset = 0 123 124 def write_term(self, term, offset, frequency, doc_frequency, info_offset): 125 126 """ 127 Write the given 'term', its position file 'offset', its 'frequency' and 128 its 'doc_frequency' to the term dictionary index file, along with the 129 'info_offset' in the term information file. 130 """ 131 132 TermWriter.write_term(self, term, offset, frequency, doc_frequency) 133 134 # Write the information file offset delta. 135 136 self.write_number(info_offset - self.last_info_offset) 137 self.last_info_offset = info_offset 138 139 class TermIndexReader(TermReader): 140 141 "Reading term dictionary index details from files." 142 143 def reset(self): 144 TermReader.reset(self) 145 self.last_info_offset = 0 146 147 def read_term(self): 148 149 """ 150 Read a term, its position file offset, its frequency, its document 151 frequency and a term information file offset from the term dictionary 152 index file. 153 """ 154 155 term, offset, frequency, doc_frequency = TermReader.read_term(self) 156 157 # Read the offset delta. 158 159 self.last_info_offset += self.read_number() 160 161 return term, offset, frequency, doc_frequency, self.last_info_offset 162 163 class TermDictionaryWriter: 164 165 "Writing term dictionaries." 166 167 def __init__(self, info_writer, index_writer, position_dict_writer, interval): 168 self.info_writer = info_writer 169 self.index_writer = index_writer 170 self.position_dict_writer = position_dict_writer 171 self.interval = interval 172 self.entry = 0 173 174 def _write_term(self, term, offset, frequency, doc_frequency): 175 176 """ 177 Write the given 'term', its position file 'offset', its 'frequency' and 178 its 'doc_frequency' (number of documents in which it appears) to the 179 term information file. Return the offset after the term information was 180 written to the file. 181 """ 182 183 self.info_writer.write_term(term, offset, frequency, doc_frequency) 184 185 if self.entry % self.interval == 0: 186 info_offset = self.info_writer.f.tell() 187 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) 188 189 self.entry += 1 190 191 def write_term_positions(self, term, doc_positions): 192 193 """ 194 Write the given 'term' and the 'doc_positions' recording the documents 195 and positions at which the term is found. 196 """ 197 198 offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions) 199 200 if not frequency or not doc_frequency: 201 raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions) 202 203 self._write_term(term, offset, frequency, doc_frequency) 204 205 def close(self): 206 self.info_writer.close() 207 self.index_writer.close() 208 self.position_dict_writer.close() 209 210 class TermDictionaryReader: 211 212 "Reading term dictionaries." 213 214 def __init__(self, info_reader, index_reader, position_dict_reader): 215 self.info_reader = info_reader 216 self.index_reader = index_reader 217 self.position_dict_reader = position_dict_reader 218 219 self.terms = [] 220 try: 221 while 1: 222 self.terms.append(self.index_reader.read_term()) 223 except EOFError: 224 pass 225 226 # Large numbers for ordering purposes. 227 228 if self.terms: 229 self.max_offset = self.terms[-1][1] + 1 230 else: 231 self.max_offset = None 232 233 def _find_closest_entry(self, term): 234 235 """ 236 Find the offsets and frequencies of 'term' from the term dictionary or 237 the closest term starting with the value of 'term'. 238 239 Return the closest index entry consisting of a term, the position file 240 offset, the term frequency, the document frequency, and the term details 241 file offset. 242 """ 243 244 i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1 245 246 # Get the entry position providing the term or one preceding it. 247 # If no entry precedes the requested term, return the very first entry 248 # as the closest. 249 250 if i == -1: 251 return self.terms[0] 252 else: 253 return self.terms[i] 254 255 def _find_closest_term(self, term): 256 257 """ 258 Find the offsets and frequencies of 'term' from the term dictionary or 259 the closest term starting with the value of 'term'. 260 261 Return the closest term (or the term itself), the position file offset, 262 the term frequency, the document frequency, and the term details file 263 offset (or None if the reader is already positioned). 264 """ 265 266 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term) 267 268 # Where the term is found immediately, return the offset and 269 # frequencies. If the term does not appear, return the details of the 270 # closest entry. 271 272 if term <= found_term: 273 return found_term, offset, frequency, doc_frequency, info_offset 274 275 # Otherwise, seek past the index term's entry in the information file 276 # and scan for the desired term. 277 278 else: 279 self.info_reader.go_to_term(found_term, offset, info_offset) 280 try: 281 while term > found_term: 282 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 283 except EOFError: 284 pass 285 286 return found_term, offset, frequency, doc_frequency, None 287 288 def _find_term(self, term): 289 290 """ 291 Find the position file offset and frequency of 'term' from the term 292 dictionary. 293 """ 294 295 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 296 297 # If the term is found, return the offset and frequencies. 298 299 if term == found_term: 300 return offset, frequency, doc_frequency 301 else: 302 return None 303 304 def _get_term_and_positions(self, term, offset, frequency, doc_frequency): 305 306 """ 307 Return the term plus positions details using the given 'term', 'offset', 308 'frequency' and 'doc_frequency'. 309 """ 310 311 return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency) 312 313 def _get_positions(self, offset, doc_frequency): 314 315 """ 316 Obtain positions from the position index 'offset' expecting a number of 317 documents equal to the given 'doc_frequency'. 318 """ 319 320 return self.position_dict_reader.read_term_positions(offset, doc_frequency) 321 322 # Iterator convenience methods. 323 324 def __iter__(self): 325 self.rewind() 326 return self 327 328 def next(self): 329 try: 330 return self.read_term() 331 except EOFError: 332 raise StopIteration 333 334 # Sequential access methods. 335 336 def rewind(self): 337 self.info_reader.rewind() 338 339 def read_term(self): 340 341 """ 342 Return the next term, its frequency, its document frequency, and the 343 documents and positions at which the term is found. 344 """ 345 346 term, offset, frequency, doc_frequency = self.info_reader.read_term() 347 return self._get_term_and_positions(term, offset, frequency, doc_frequency) 348 349 def go_to_term(self, term): 350 351 """ 352 Navigate to 'term' in the dictionary, returning the details from its 353 entry. The returned details can be augmented with position information 354 when presented to the _get_term_and_positions method. 355 """ 356 357 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 358 359 # Position the reader, if necessary. 360 361 if info_offset is not None: 362 self.info_reader.go_to_term(found_term, offset, info_offset) 363 364 return found_term, offset, frequency, doc_frequency 365 366 # Query methods. 367 368 def get_terms(self): 369 370 "Return a list of all terms." 371 372 return iter(self) 373 374 def find_terms(self, term): 375 376 "Return all terms whose values start with the value of 'term'." 377 378 terms = [] 379 380 found_term, offset, frequency, doc_frequency = self.go_to_term(term) 381 382 # Read and record terms. 383 384 try: 385 # Add the found term if it starts with the specified term. 386 387 while found_term.startswith(term): 388 terms.append(found_term) 389 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 390 391 except EOFError: 392 pass 393 394 return terms 395 396 def find_positions(self, term): 397 398 "Return the documents and positions at which the given 'term' is found." 399 400 t = self._find_term(term) 401 if t is None: 402 return [] 403 else: 404 offset, frequency, doc_frequency = t 405 return self._get_positions(offset, doc_frequency) 406 407 def find_common_positions(self, terms): 408 409 """ 410 Return the documents and positions at which all the given 'terms' are 411 found, where only common documents are returned. 412 """ 413 414 return PhraseIterator([self.find_positions(term) for term in terms]) 415 416 def get_frequency(self, term): 417 418 "Return the frequency of the given 'term'." 419 420 t = self._find_term(term) 421 if t is None: 422 return None 423 else: 424 offset, frequency, doc_frequency = t 425 return frequency 426 427 def get_document_frequency(self, term): 428 429 "Return the document frequency of the given 'term'." 430 431 t = self._find_term(term) 432 if t is None: 433 return None 434 else: 435 offset, frequency, doc_frequency = t 436 return doc_frequency 437 438 def close(self): 439 self.info_reader.close() 440 self.index_reader.close() 441 self.position_dict_reader.close() 442 443 # vim: tabstop=4 expandtab shiftwidth=4