1 #!/usr/bin/env python 2 3 """ 4 A simple (and sane) text indexing library. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from os import mkdir # to determine whether to create indexes 22 from os.path import exists, join 23 from os.path import commonprefix # to find common string prefixes 24 from bisect import bisect_right # to find terms in the dictionary index 25 import bz2 # for field compression 26 27 # Constants. 28 29 INTERVAL = 100 30 31 # Foundation classes. 32 33 class File: 34 35 "A basic file abstraction." 36 37 def __init__(self, f): 38 self.f = f 39 self.reset() 40 41 def reset(self): 42 pass 43 44 def close(self): 45 if self.f is not None: 46 self.f.close() 47 self.f = None 48 49 class FileWriter(File): 50 51 "Writing basic data types to files." 52 53 def write_number(self, number): 54 55 "Write 'number' to the file using a variable length encoding." 56 57 # Negative numbers are not supported. 58 59 if number < 0: 60 raise ValueError, "Number %r is negative." % number 61 62 # Special case: one byte containing zero. 63 64 elif number == 0: 65 self.f.write(chr(0)) 66 return 67 68 # Write the number from least to most significant digits. 69 70 bytes = [] 71 72 while number != 0: 73 lsd = number & 127 74 number = number >> 7 75 if number != 0: 76 lsd |= 128 77 bytes.append(chr(lsd)) 78 79 record = "".join(bytes) 80 self.f.write(record) 81 82 def write_string(self, s, compress=0): 83 84 """ 85 Write 's' to the file, recording its length and compressing the string 86 if 'compress' is set to a true value. 87 """ 88 89 # Convert Unicode objects to strings. 90 91 if isinstance(s, unicode): 92 s = s.encode("utf-8") 93 94 # Compress the string if requested. 95 96 if compress: 97 s = bz2.compress(s) 98 99 # Write the length of the data before the data itself. 100 101 length = len(s) 102 self.write_number(length) 103 self.f.write(s) 104 105 class FileReader(File): 106 107 "Reading basic data types from files." 108 109 def read_number(self): 110 111 "Read a number from the file." 112 113 # Read each byte, adding it to the number. 114 115 shift = 0 116 number = 0 117 more = 1 118 119 while more: 120 byte = self.f.read(1) 121 if not byte: 122 raise EOFError 123 124 csd = ord(byte) 125 more = csd & 128 != 0 126 if more: 127 csd &= 127 128 number += (csd << shift) 129 shift += 7 130 131 return number 132 133 def read_string(self, decompress=0): 134 135 """ 136 Read a string from the file, decompressing the stored data if 137 'decompress' is set to a true value. 138 """ 139 140 length = self.read_number() 141 s = self.f.read(length) 142 143 # Decompress the data if requested. 144 145 if decompress: 146 s = bz2.decompress(s) 147 148 # Convert strings to Unicode objects. 149 150 return unicode(s, "utf-8") 151 152 # Specific classes for storing term and position information. 153 154 class PositionWriter(FileWriter): 155 156 "Writing position information to files." 157 158 def reset(self): 159 self.last_docnum = 0 160 161 def write_positions(self, docnum, positions): 162 163 "Write for the document 'docnum' the given 'positions'." 164 165 if docnum < self.last_docnum: 166 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 167 168 # Write the document number delta. 169 170 self.write_number(docnum - self.last_docnum) 171 172 # Write the number of positions. 173 174 self.write_number(len(positions)) 175 176 # Make sure that the positions are sorted. 177 178 positions.sort() 179 180 # Write the position deltas. 181 182 last = 0 183 for position in positions: 184 pos = position - last 185 self.write_number(pos) 186 last = position 187 188 self.last_docnum = docnum 189 190 def write_all_positions(self, doc_positions): 191 192 """ 193 Write all 'doc_positions' - a collection of tuples of the form (document 194 number, position list) - to the file, returning the offset at which they 195 were stored. 196 """ 197 198 # Reset the writer and record the current file offset. 199 200 self.reset() 201 offset = self.f.tell() 202 203 # Write the number of documents. 204 205 self.write_number(len(doc_positions)) 206 207 # Write the positions. 208 209 for docnum, positions in doc_positions: 210 self.write_positions(docnum, positions) 211 212 return offset 213 214 class PositionReader(FileReader): 215 216 "Reading position information from files." 217 218 def reset(self): 219 self.last_docnum = 0 220 221 def read_positions(self): 222 223 "Read positions, returning a document number and a list of positions." 224 225 # Read the document number delta and add it to the last number. 226 227 self.last_docnum += self.read_number() 228 229 # Read the number of positions. 230 231 npositions = self.read_number() 232 233 # Read the position deltas, adding each previous position to get the 234 # appropriate collection of absolute positions. 235 236 i = 0 237 last = 0 238 positions = [] 239 240 while i < npositions: 241 last += self.read_number() 242 positions.append(last) 243 i += 1 244 245 return self.last_docnum, positions 246 247 def read_all_positions(self, offset): 248 249 """ 250 Read all positions from 'offset', seeking to that position in the file 251 before reading. 252 """ 253 254 self.reset() 255 self.f.seek(offset) 256 257 # Read the number of documents. 258 259 ndocuments = self.read_number() 260 261 # Read all records. 262 263 i = 0 264 doc_positions = [] 265 266 while i < ndocuments: 267 doc_positions.append(self.read_positions()) 268 i += 1 269 270 return doc_positions 271 272 class TermWriter(FileWriter): 273 274 "Writing term information to files." 275 276 def reset(self): 277 self.last_term = "" 278 self.last_offset = 0 279 280 def write_term(self, term, offset): 281 282 """ 283 Write the given 'term' and its position file 'offset' to the term 284 information file. Return the offset after the term information was 285 written to the file. 286 """ 287 288 # Too long terms are not currently supported. 289 290 if len(term) > 255: 291 raise ValueError, "Term %r is too long." % term 292 293 # Write the prefix length and term suffix. 294 295 common = len(commonprefix([self.last_term, term])) 296 suffix = term[common:] 297 298 self.write_number(common) 299 self.write_string(suffix) 300 301 # Write the offset delta. 302 303 self.write_number(offset - self.last_offset) 304 305 self.last_term = term 306 self.last_offset = offset 307 308 return self.f.tell() 309 310 class TermReader(FileReader): 311 312 "Reading term information from files." 313 314 def reset(self): 315 self.last_term = "" 316 self.last_offset = 0 317 318 def read_term(self): 319 320 """ 321 Read a term and its position file offset from the term information file. 322 """ 323 324 # Read the prefix length and term suffix. 325 326 common = self.read_number() 327 suffix = self.read_string() 328 329 self.last_term = self.last_term[:common] + suffix 330 331 # Read the offset delta. 332 333 self.last_offset += self.read_number() 334 335 return self.last_term, self.last_offset 336 337 def go_to_term(self, term, offset, info_offset): 338 339 """ 340 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 341 permits the scanning for later terms from the specified term. 342 """ 343 344 self.f.seek(info_offset) 345 self.last_term = term 346 self.last_offset = offset 347 348 class TermIndexWriter(TermWriter): 349 350 "Writing term dictionary index details to files." 351 352 def reset(self): 353 TermWriter.reset(self) 354 self.last_info_offset = 0 355 356 def write_term(self, term, offset, info_offset): 357 358 """ 359 Write the given 'term' and its position file 'offset' to the term 360 dictionary index file, along with the 'info_offset' in the term 361 information file. 362 """ 363 364 TermWriter.write_term(self, term, offset) 365 366 # Write the information file offset delta. 367 368 self.write_number(info_offset - self.last_info_offset) 369 self.last_info_offset = info_offset 370 371 class TermIndexReader(TermReader): 372 373 "Reading term dictionary index details from files." 374 375 def reset(self): 376 TermReader.reset(self) 377 self.last_info_offset = 0 378 379 def read_term(self): 380 381 """ 382 Read a term, its position file offset, and its term information file 383 offset from the term dictionary index file. 384 """ 385 386 term, offset = TermReader.read_term(self) 387 388 # Read the offset delta. 389 390 self.last_info_offset += self.read_number() 391 392 return term, offset, self.last_info_offset 393 394 class TermDictionaryWriter: 395 396 "Writing term dictionaries." 397 398 def __init__(self, info_writer, index_writer, position_writer, interval): 399 self.info_writer = info_writer 400 self.index_writer = index_writer 401 self.position_writer = position_writer 402 self.interval = interval 403 self.entry = 0 404 405 def _write_term(self, term, offset): 406 407 """ 408 Write the given 'term' and its position file 'offset' to the term 409 information file and optionally to the index, making a dictionary entry. 410 """ 411 412 info_offset = self.info_writer.write_term(term, offset) 413 414 if self.entry % self.interval == 0: 415 self.index_writer.write_term(term, offset, info_offset) 416 417 self.entry += 1 418 419 def write_term_positions(self, term, doc_positions): 420 421 """ 422 Write the given 'term' and the 'doc_positions' recording the documents 423 and positions at which the term is found. 424 """ 425 426 offset = self.position_writer.write_all_positions(doc_positions) 427 self._write_term(term, offset) 428 429 def close(self): 430 self.info_writer.close() 431 self.index_writer.close() 432 self.position_writer.close() 433 434 class TermDictionaryReader: 435 436 "Reading term dictionaries." 437 438 def __init__(self, info_reader, index_reader, position_reader): 439 self.info_reader = info_reader 440 self.index_reader = index_reader 441 self.position_reader = position_reader 442 443 self.terms = [] 444 try: 445 while 1: 446 self.terms.append(self.index_reader.read_term()) 447 except EOFError: 448 pass 449 450 # Large numbers for ordering purposes. 451 452 self.max_offset = self.terms[-1][1] 453 self.max_info_offset = self.terms[-1][2] 454 455 def _find_term(self, term): 456 457 "Find the position file offset of 'term' from the term dictionary." 458 459 i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1 460 461 # Get the entry position providing the term or one preceding it. 462 463 if i == -1: 464 return None 465 466 found_term, offset, info_offset = self.terms[i] 467 468 # Where the term is found immediately, return the offset. 469 470 if term == found_term: 471 return offset 472 473 # Otherwise, seek past the index term's entry in the information file 474 # and scan for the desired term. 475 476 else: 477 self.info_reader.go_to_term(found_term, offset, info_offset) 478 try: 479 while term > found_term: 480 found_term, offset = self.info_reader.read_term() 481 except EOFError: 482 pass 483 484 # If the term is found, return the offset. 485 486 if term == found_term: 487 return offset 488 else: 489 return None 490 491 def find_positions(self, term): 492 493 "Return the documents and positions at which the given 'term' is found." 494 495 offset = self._find_term(term) 496 if offset is None: 497 return None 498 else: 499 return self.position_reader.read_all_positions(offset) 500 501 def close(self): 502 self.info_reader.close() 503 self.index_reader.close() 504 self.position_reader.close() 505 506 # Specific classes for storing document information. 507 508 class FieldWriter(FileWriter): 509 510 "Writing field data to files." 511 512 def reset(self): 513 self.last_docnum = 0 514 515 def write_fields(self, docnum, fields): 516 517 """ 518 Write for the given 'docnum', a list of 'fields' (strings representing 519 field values). Return the offset at which the fields are stored. 520 """ 521 522 offset = self.f.tell() 523 524 # Write the document number delta. 525 526 self.write_number(docnum - self.last_docnum) 527 528 # Write the number of fields. 529 530 self.write_number(len(fields)) 531 532 # Write the fields themselves. 533 534 for field in fields: 535 self.write_string(field, 0) # compress 536 537 self.last_docnum = docnum 538 return offset 539 540 class FieldReader(FileReader): 541 542 "Reading field data from files." 543 544 def reset(self): 545 self.last_docnum = 0 546 547 def read_fields(self): 548 549 """ 550 Read fields from the file, returning a tuple containing the document 551 number and a list of field values. 552 """ 553 554 # Read the document number. 555 556 self.last_docnum += self.read_number() 557 558 # Read the number of fields. 559 560 nfields = self.read_number() 561 562 # Collect the fields. 563 564 fields = [] 565 i = 0 566 567 while i < nfields: 568 fields.append(self.read_string(0)) # decompress 569 i += 1 570 571 return self.last_docnum, fields 572 573 def read_document_fields(self, docnum, offset): 574 575 """ 576 Read fields for 'docnum' at the given 'offset'. This permits the 577 retrieval of details for the specified document, as well as scanning for 578 later documents. 579 """ 580 581 self.f.seek(offset) 582 bad_docnum, fields = self.read_fields() 583 self.last_docnum = docnum 584 return docnum, fields 585 586 class FieldIndexWriter(FileWriter): 587 588 "Writing field index details to files." 589 590 def reset(self): 591 self.last_docnum = 0 592 593 def write_document(self, docnum, offset): 594 595 """ 596 Write for the given 'docnum', the 'offset' at which the fields for the 597 document are stored in the fields file. 598 """ 599 600 # Write the document number delta and offset. 601 602 self.write_number(docnum - self.last_docnum) 603 self.write_number(offset) 604 605 self.last_docnum = docnum 606 607 class FieldIndexReader(FileReader): 608 609 "Reading field index details from files." 610 611 def reset(self): 612 self.last_docnum = 0 613 614 def read_document(self): 615 616 "Read a document number and field file offset." 617 618 # Read the document number delta and offset. 619 620 self.last_docnum += self.read_number() 621 offset = self.read_number() 622 623 return self.last_docnum, offset 624 625 class FieldDictionaryWriter: 626 627 "Writing field dictionary details." 628 629 def __init__(self, field_writer, field_index_writer, interval): 630 self.field_writer = field_writer 631 self.field_index_writer = field_index_writer 632 self.interval = interval 633 self.entry = 0 634 635 def write_fields(self, docnum, fields): 636 637 "Write details of the document with the given 'docnum' and 'fields'." 638 639 offset = self.field_writer.write_fields(docnum, fields) 640 641 if self.entry % self.interval == 0: 642 self.field_index_writer.write_document(docnum, offset) 643 644 self.entry += 1 645 646 def close(self): 647 self.field_writer.close() 648 self.field_index_writer.close() 649 650 class FieldDictionaryReader: 651 652 "Reading field dictionary details." 653 654 def __init__(self, field_reader, field_index_reader): 655 self.field_reader = field_reader 656 self.field_index_reader = field_index_reader 657 658 self.docs = [] 659 try: 660 while 1: 661 self.docs.append(self.field_index_reader.read_document()) 662 except EOFError: 663 pass 664 665 # Large numbers for ordering purposes. 666 667 self.max_offset = self.docs[-1][1] 668 669 def read_fields(self, docnum): 670 671 "Read the fields of the document with the given 'docnum'." 672 673 i = bisect_right(self.docs, (docnum, self.max_offset)) - 1 674 675 # Get the entry position providing the term or one preceding it. 676 677 if i == -1: 678 return None 679 680 found_docnum, offset = self.docs[i] 681 682 # Read from the fields file. 683 684 found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset) 685 686 # Scan for the document, if necessary. 687 688 try: 689 while docnum > found_docnum: 690 found_docnum, fields = self.field_reader.read_fields() 691 except EOFError: 692 pass 693 694 # If the document is found, return the fields. 695 696 if docnum == found_docnum: 697 return fields 698 else: 699 return None 700 701 def close(self): 702 self.field_reader.close() 703 self.field_index_reader.close() 704 705 # High-level classes. 706 707 class IndexWriter: 708 709 "Building term information and writing it to the term dictionary." 710 711 def __init__(self, dict_writer): 712 self.dict_writer = dict_writer 713 self.terms = {} 714 715 def add_position(self, term, docnum, position): 716 717 """ 718 Add a position entry for the given 'term' in the document with the given 719 'docnum', indicating the given 'position'. 720 """ 721 722 if not self.terms.has_key(term): 723 doc_positions = self.terms[term] = {} 724 else: 725 doc_positions = self.terms[term] 726 727 if not doc_positions.has_key(docnum): 728 doc = doc_positions[docnum] = [] 729 else: 730 doc = doc_positions[docnum] 731 732 doc.append(position) 733 734 def close(self): 735 if self.dict_writer is None: 736 return 737 738 # Get the terms in order. 739 740 terms = self.terms.items() 741 terms.sort() 742 743 for term, doc_positions in terms: 744 doc_positions = doc_positions.items() 745 doc_positions.sort() 746 self.dict_writer.write_term_positions(term, doc_positions) 747 748 self.dict_writer.close() 749 self.dict_writer = None 750 751 class Index: 752 753 "An inverted index solution encapsulating the various components." 754 755 def __init__(self, pathname): 756 self.pathname = pathname 757 self.reader = None 758 self.writer = None 759 760 def get_writer(self, interval=INTERVAL): 761 762 "Return a writer, optionally using the given indexing 'interval'." 763 764 if not exists(self.pathname): 765 mkdir(self.pathname) 766 767 tdf = open(join(self.pathname, "terms"), "wb") 768 info_writer = TermWriter(tdf) 769 770 tdif = open(join(self.pathname, "index"), "wb") 771 index_writer = TermIndexWriter(tdif) 772 773 tpf = open(join(self.pathname, "positions"), "wb") 774 positions_writer = PositionWriter(tpf) 775 776 dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval) 777 778 self.writer = IndexWriter(dict_writer) 779 return self.writer 780 781 def get_reader(self): 782 783 "Return a reader for the index." 784 785 if not exists(self.pathname): 786 raise OSError, "Index path %r does not exist." % self.pathname 787 788 tdf = open(join(self.pathname, "terms"), "rb") 789 info_reader = TermReader(tdf) 790 791 tdif = open(join(self.pathname, "index"), "rb") 792 index_reader = TermIndexReader(tdif) 793 794 tpf = open(join(self.pathname, "positions"), "rb") 795 positions_reader = PositionReader(tpf) 796 797 self.reader = TermDictionaryReader(info_reader, index_reader, positions_reader) 798 return self.reader 799 800 def close(self): 801 if self.reader is not None: 802 self.reader.close() 803 self.reader = None 804 if self.writer is not None: 805 self.writer.close() 806 self.writer = None 807 808 # vim: tabstop=4 expandtab shiftwidth=4