1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint, vint_to_array 23 from array import array 24 25 class PositionWriter(FileWriter): 26 27 "Writing position information to files." 28 29 def reset(self): 30 self.last_docnum = 0 31 32 def write_positions(self, docnum, positions): 33 34 """ 35 Write for the document 'docnum' the given 'positions'. 36 """ 37 38 if docnum < self.last_docnum: 39 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 40 41 # Make sure that the positions are sorted. 42 43 positions.sort() 44 45 # Write the document number delta. 46 # Write the number of positions. 47 48 output = array('B') 49 vint_to_array(docnum - self.last_docnum, output) 50 vint_to_array(len(positions), output) 51 52 # Write the position deltas. 53 54 last = 0 55 56 for position in positions: 57 vint_to_array(position - last, output) 58 last = position 59 60 output.tofile(self.f) 61 62 self.last_docnum = docnum 63 64 class PositionOpener(FileOpener): 65 66 "Reading position information from files." 67 68 def read_term_positions(self, offset, count): 69 70 """ 71 Read all positions from 'offset', seeking to that position in the file 72 before reading. The number of documents available for reading is limited 73 to 'count'. 74 """ 75 76 f = self.open("rb") 77 return PositionIterator(f, offset, count) 78 79 class PositionIndexWriter(FileWriter): 80 81 "Writing position index information to files." 82 83 def reset(self): 84 self.last_docnum = 0 85 self.last_pos_offset = 0 86 87 def write_positions(self, docnum, pos_offset, count): 88 89 """ 90 Write the given 'docnum, 'pos_offset' and document 'count' to the 91 position index file. 92 """ 93 94 # Write the document number delta. 95 # Write the position file offset delta. 96 # Write the document count. 97 98 output = array('B') 99 vint_to_array(docnum - self.last_docnum, output) 100 vint_to_array(pos_offset - self.last_pos_offset, output) 101 vint_to_array(count, output) 102 103 # Actually write the data. 104 105 output.tofile(self.f) 106 107 self.last_pos_offset = pos_offset 108 self.last_docnum = docnum 109 110 class PositionIndexOpener(FileOpener): 111 112 "Reading position index information from files." 113 114 def read_term_positions(self, offset, doc_frequency): 115 116 """ 117 Read all positions from 'offset', seeking to that position in the file 118 before reading. The number of documents available for reading is limited 119 to 'doc_frequency'. 120 """ 121 122 f = self.open("rb") 123 return PositionIndexIterator(f, offset, doc_frequency) 124 125 # Iterators for position-related files. 126 127 class IteratorBase: 128 129 def __init__(self, count): 130 self.replenish(count) 131 132 def replenish(self, count): 133 self.count = count 134 self.read_documents = 0 135 136 def __len__(self): 137 return self.count 138 139 def sort(self): 140 pass # Stored document positions are already sorted. 141 142 def __iter__(self): 143 return self 144 145 class PositionIterator(FileReader, IteratorBase): 146 147 "Iterating over document positions." 148 149 def __init__(self, f, offset, count): 150 FileReader.__init__(self, f) 151 IteratorBase.__init__(self, count) 152 self.f.seek(offset) 153 154 def reset(self): 155 self.last_docnum = 0 156 157 def read_positions(self): 158 159 "Read positions, returning a document number and a list of positions." 160 161 # Read the document number delta and add it to the last number. 162 163 self.last_docnum += self.read_number() 164 165 # Read the number of positions. 166 167 npositions = self.read_number() 168 169 # Read the position deltas, adding each previous position to get the 170 # appropriate collection of absolute positions. 171 172 i = 0 173 last = 0 174 positions = [] 175 176 while i < npositions: 177 last += self.read_number() 178 positions.append(last) 179 i += 1 180 181 return self.last_docnum, positions 182 183 def next(self): 184 185 "Read positions for a single document." 186 187 if self.read_documents < self.count: 188 self.read_documents += 1 189 return self.read_positions() 190 else: 191 raise StopIteration 192 193 class PositionIndexIterator(FileReader, IteratorBase): 194 195 "Iterating over document positions." 196 197 def __init__(self, f, offset, count): 198 FileReader.__init__(self, f) 199 IteratorBase.__init__(self, count) 200 self.f.seek(offset) 201 202 def reset(self): 203 self.last_docnum = 0 204 self.last_pos_offset = 0 205 self.section_count = 0 206 207 def read_positions(self): 208 209 """ 210 Read a document number, a position file offset for the position index 211 file, and the number of documents in a section of that file. 212 """ 213 214 # Read the document number delta. 215 216 self.last_docnum += self.read_number() 217 218 # Read the offset delta. 219 220 self.last_pos_offset += self.read_number() 221 222 # Read the document count. 223 224 count = self.read_number() 225 226 return self.last_docnum, self.last_pos_offset, count 227 228 def next(self): 229 230 "Read positions for a single document." 231 232 self.read_documents += self.section_count 233 if self.read_documents < self.count: 234 docnum, pos_offset, self.section_count = t = self.read_positions() 235 return t 236 else: 237 #assert self.read_documents == self.count # not upheld by from_document 238 raise StopIteration 239 240 class PositionDictionaryWriter: 241 242 "Writing position dictionaries." 243 244 def __init__(self, position_writer, position_index_writer, interval): 245 self.position_writer = position_writer 246 self.position_index_writer = position_index_writer 247 self.interval = interval 248 249 def write_term_positions(self, doc_positions): 250 251 """ 252 Write all 'doc_positions' - a collection of tuples of the form (document 253 number, position list) - to the file. 254 255 Add some records to the index, making dictionary entries. 256 257 Return a tuple containing the offset of the written data, the frequency 258 (number of positions), and document frequency (number of documents) for 259 the term involved. 260 """ 261 262 # Reset the writers. 263 264 self.position_writer.reset() 265 self.position_index_writer.reset() 266 267 # Remember the first index entry offset. 268 269 index_offset = self.position_index_writer.f.tell() 270 271 # Write the positions. 272 273 frequency = 0 274 count = 0 275 276 if doc_positions: 277 278 # Retain the first record offset for a subsequent index entry. 279 280 first_offset = self.position_writer.f.tell() 281 first_docnum = None 282 283 doc_positions.sort() 284 285 for docnum, positions in doc_positions: 286 if first_docnum is None: 287 first_docnum = docnum 288 289 self.position_writer.write_positions(docnum, positions) 290 291 frequency += len(positions) 292 count += 1 293 294 # Every {interval} entries, write an index entry. 295 296 if count % self.interval == 0: 297 298 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 299 300 first_offset = self.position_writer.f.tell() 301 first_docnum = None 302 303 # Reset the position writer so that position readers accessing 304 # a section start with the correct document number. 305 306 self.position_writer.reset() 307 308 # Finish writing an index entry for the remaining documents. 309 310 else: 311 if first_docnum is not None: 312 self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 313 314 return index_offset, frequency, count 315 316 def close(self): 317 self.position_writer.close() 318 self.position_index_writer.close() 319 320 class PositionDictionaryReader: 321 322 "Reading position dictionaries." 323 324 def __init__(self, position_opener, position_index_opener): 325 self.position_opener = position_opener 326 self.position_index_opener = position_index_opener 327 self.position_dict_iterators = [] 328 329 def read_term_positions(self, offset, doc_frequency): 330 331 """ 332 Return an iterator for dictionary entries starting at 'offset' with the 333 given 'doc_frequency'. 334 """ 335 336 it = PositionDictionaryIterator(self.position_opener, 337 self.position_index_opener, offset, doc_frequency) 338 self.position_dict_iterators.append(it) 339 return it 340 341 def close(self): 342 for it in self.position_dict_iterators: 343 it.close() 344 self.position_dict_iterators = [] 345 346 class PositionDictionaryIterator: 347 348 "Iteration over position dictionary entries." 349 350 def __init__(self, position_opener, position_index_opener, offset, doc_frequency): 351 self.position_opener = position_opener 352 self.position_index_opener = position_index_opener 353 self.doc_frequency = doc_frequency 354 355 self.index_iterator = None 356 self.iterator = None 357 358 # Initialise the iterators. 359 360 self.reset(offset, doc_frequency) 361 362 def reset(self, offset, doc_frequency): 363 364 # Remember the last values. 365 366 self.found_docnum, self.found_positions = None, None 367 368 # Attempt to reuse the index iterator. 369 370 if self.index_iterator is not None: 371 ii = self.index_iterator 372 ii.replenish(doc_frequency) 373 ii.f.seek(offset) 374 ii.reset() 375 376 # Or make a new index iterator. 377 378 else: 379 self.index_iterator = self.position_index_opener.read_term_positions(offset, doc_frequency) 380 381 # Maintain state for the next index entry, if read. 382 383 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 384 385 # Initialise the current index entry and current position file iterator. 386 387 self._next_section() 388 self._init_section() 389 390 # Sequence methods. 391 392 def __len__(self): 393 return self.doc_frequency 394 395 def sort(self): 396 pass 397 398 # Iterator methods. 399 400 def __iter__(self): 401 return self 402 403 def next(self): 404 405 """ 406 Attempt to get the next document record from the section in the 407 positions file. 408 """ 409 410 # Return any visited but unrequested record. 411 412 if self.found_docnum is not None: 413 t = self.found_docnum, self.found_positions 414 self.found_docnum, self.found_positions = None, None 415 return t 416 417 # Or search for the next record. 418 419 while 1: 420 421 # Either return the next record. 422 423 try: 424 return self.iterator.next() 425 426 # Or, where a section is finished, get the next section and try again. 427 428 except StopIteration: 429 430 # Where a section follows, update the index iterator, but keep 431 # reading using the same file iterator (since the data should 432 # just follow on from the last section). 433 434 self._next_section() 435 self.iterator.replenish(self.section_count) 436 437 # Reset the state of the iterator to make sure that document 438 # numbers are correct. 439 440 self.iterator.reset() 441 442 def from_document(self, docnum): 443 444 """ 445 Attempt to navigate to a positions entry for the given 'docnum', 446 returning the positions for 'docnum', or None otherwise. 447 """ 448 449 # Return any unrequested document positions. 450 451 if docnum == self.found_docnum: 452 return self.found_positions 453 454 # Read ahead in the index until the next entry refers to a document 455 # later than the desired document. 456 457 try: 458 if self.next_docnum is None: 459 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 460 461 # Read until the next entry is after the desired document number, 462 # or until the end of the results. 463 464 while self.next_docnum <= docnum: 465 self._next_read_section() 466 if self.docnum < docnum: 467 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 468 else: 469 break 470 471 except StopIteration: 472 pass 473 474 # Navigate in the position file to the document. 475 476 self._init_section() 477 478 try: 479 while 1: 480 found_docnum, found_positions = self.iterator.next() 481 482 # Return the desired document positions or None (retaining the 483 # positions for the document immediately after). 484 485 if docnum == found_docnum: 486 return found_positions 487 elif docnum < found_docnum: 488 self.found_docnum, self.found_positions = found_docnum, found_positions 489 return None 490 491 except StopIteration: 492 return None 493 494 # Internal methods. 495 496 def _next_section(self): 497 498 "Attempt to get the next section in the index." 499 500 if self.next_docnum is None: 501 self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() 502 else: 503 self._next_read_section() 504 505 def _next_read_section(self): 506 507 """ 508 Make the next index entry the current one without reading from the 509 index. 510 """ 511 512 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 513 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 514 515 def _init_section(self): 516 517 "Initialise the iterator for the section in the position file." 518 519 # Attempt to reuse any correctly positioned iterator. 520 521 if self.iterator is not None: 522 i = self.iterator 523 i.replenish(self.section_count) 524 i.f.seek(self.pos_offset) 525 i.reset() 526 527 # Otherwise, obtain a new iterator. 528 529 else: 530 self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count) 531 532 def close(self): 533 if self.iterator is not None: 534 self.iterator.close() 535 self.iterator = None 536 if self.index_iterator is not None: 537 self.index_iterator.close() 538 self.index_iterator = None 539 540 class ResetPositionDictionaryIterator: 541 542 """ 543 A helper class which permits the reuse of iterators without modifying their 544 state. 545 """ 546 547 def __init__(self, iterator, offset, doc_frequency): 548 self.iterator = iterator 549 self.offset = offset 550 self.doc_frequency = doc_frequency 551 552 def __iter__(self): 553 self.iterator.reset(self.offset, self.doc_frequency) 554 return iter(self.iterator) 555 556 # vim: tabstop=4 expandtab shiftwidth=4