1 #!/usr/bin/env python 2 3 """ 4 A simple (and sane) text indexing library. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from os import mkdir # to determine whether to create indexes 22 from os.path import exists, join 23 from os.path import commonprefix # to find common string prefixes 24 from bisect import bisect_right # to find terms in the dictionary index 25 import bz2, zlib # for field compression 26 27 # Constants. 28 29 INTERVAL = 100 30 31 compressors = [("b", bz2.compress), ("z", zlib.compress)] 32 decompressors = {"b" : bz2.decompress, "z" : zlib.decompress} 33 34 # Foundation classes. 35 36 class File: 37 38 "A basic file abstraction." 39 40 def __init__(self, f): 41 self.f = f 42 self.reset() 43 44 def reset(self): 45 pass 46 47 def close(self): 48 if self.f is not None: 49 self.f.close() 50 self.f = None 51 52 class FileWriter(File): 53 54 "Writing basic data types to files." 55 56 def write_number(self, number): 57 58 "Write 'number' to the file using a variable length encoding." 59 60 # Negative numbers are not supported. 61 62 if number < 0: 63 raise ValueError, "Number %r is negative." % number 64 65 # Special case: one byte containing zero. 66 67 elif number == 0: 68 self.f.write(chr(0)) 69 return 70 71 # Write the number from least to most significant digits. 72 73 bytes = [] 74 75 while number != 0: 76 lsd = number & 127 77 number = number >> 7 78 if number != 0: 79 lsd |= 128 80 bytes.append(chr(lsd)) 81 82 record = "".join(bytes) 83 self.f.write(record) 84 85 def write_string(self, s, compress=0): 86 87 """ 88 Write 's' to the file, recording its length and compressing the string 89 if 'compress' is set to a true value. 90 """ 91 92 # Convert Unicode objects to strings. 93 94 if isinstance(s, unicode): 95 s = s.encode("utf-8") 96 97 # Compress the string if requested. 98 99 if compress: 100 for flag, fn in compressors: 101 cs = fn(s) 102 103 # Take the first string shorter than the original. 104 105 if len(cs) < len(s): 106 s = cs 107 break 108 else: 109 flag = "-" 110 111 # Record whether compression was used. 112 113 self.f.write(flag) 114 115 # Write the length of the data before the data itself. 116 117 length = len(s) 118 self.write_number(length) 119 self.f.write(s) 120 121 class FileReader(File): 122 123 "Reading basic data types from files." 124 125 def read_number(self): 126 127 "Read a number from the file." 128 129 # Read each byte, adding it to the number. 130 131 shift = 0 132 number = 0 133 more = 1 134 135 while more: 136 byte = self.f.read(1) 137 if not byte: 138 raise EOFError 139 140 csd = ord(byte) 141 more = csd & 128 != 0 142 if more: 143 csd &= 127 144 number += (csd << shift) 145 shift += 7 146 147 return number 148 149 def read_string(self, decompress=0): 150 151 """ 152 Read a string from the file, decompressing the stored data if 153 'decompress' is set to a true value. 154 """ 155 156 # Decompress the data if requested. 157 158 if decompress: 159 flag = self.f.read(1) 160 else: 161 flag = "-" 162 163 length = self.read_number() 164 s = self.f.read(length) 165 166 # Perform decompression if applicable. 167 168 if flag != "-": 169 fn = decompressors[flag] 170 s = fn(s) 171 172 # Convert strings to Unicode objects. 173 174 return unicode(s, "utf-8") 175 176 # Specific classes for storing term and position information. 177 178 class PositionWriter(FileWriter): 179 180 "Writing position information to files." 181 182 def reset(self): 183 self.last_docnum = 0 184 185 def write_positions(self, docnum, positions): 186 187 "Write for the document 'docnum' the given 'positions'." 188 189 if docnum < self.last_docnum: 190 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 191 192 # Write the document number delta. 193 194 self.write_number(docnum - self.last_docnum) 195 196 # Write the number of positions. 197 198 self.write_number(len(positions)) 199 200 # Make sure that the positions are sorted. 201 202 positions.sort() 203 204 # Write the position deltas. 205 206 last = 0 207 for position in positions: 208 pos = position - last 209 self.write_number(pos) 210 last = position 211 212 self.last_docnum = docnum 213 214 def write_all_positions(self, doc_positions): 215 216 """ 217 Write all 'doc_positions' - a collection of tuples of the form (document 218 number, position list) - to the file, returning a tuple containing the 219 offset at which they were stored together with the frequency (number of 220 positions) for the term involved. 221 """ 222 223 # Reset the writer and record the current file offset. 224 225 self.reset() 226 offset = self.f.tell() 227 228 # Write the number of documents. 229 230 self.write_number(len(doc_positions)) 231 232 # Write the positions. 233 234 frequency = 0 235 236 for docnum, positions in doc_positions: 237 self.write_positions(docnum, positions) 238 frequency += len(positions) 239 240 return offset, frequency 241 242 class PositionReader(FileReader): 243 244 "Reading position information from files." 245 246 def reset(self): 247 self.last_docnum = 0 248 249 def read_positions(self): 250 251 "Read positions, returning a document number and a list of positions." 252 253 # Read the document number delta and add it to the last number. 254 255 self.last_docnum += self.read_number() 256 257 # Read the number of positions. 258 259 npositions = self.read_number() 260 261 # Read the position deltas, adding each previous position to get the 262 # appropriate collection of absolute positions. 263 264 i = 0 265 last = 0 266 positions = [] 267 268 while i < npositions: 269 last += self.read_number() 270 positions.append(last) 271 i += 1 272 273 return self.last_docnum, positions 274 275 def read_all_positions(self, offset): 276 277 """ 278 Read all positions from 'offset', seeking to that position in the file 279 before reading. 280 """ 281 282 self.reset() 283 self.f.seek(offset) 284 285 # Read the number of documents. 286 287 ndocuments = self.read_number() 288 289 # Read all records. 290 291 i = 0 292 doc_positions = [] 293 294 while i < ndocuments: 295 doc_positions.append(self.read_positions()) 296 i += 1 297 298 return doc_positions 299 300 class TermWriter(FileWriter): 301 302 "Writing term information to files." 303 304 def reset(self): 305 self.last_term = "" 306 self.last_offset = 0 307 308 def write_term(self, term, offset, frequency): 309 310 """ 311 Write the given 'term', its position file 'offset', and its 'frequency' 312 to the term information file. Return the offset after the term 313 information was written to the file. 314 """ 315 316 # Too long terms are not currently supported. 317 318 if len(term) > 255: 319 raise ValueError, "Term %r is too long." % term 320 321 # Write the prefix length and term suffix. 322 323 common = len(commonprefix([self.last_term, term])) 324 suffix = term[common:] 325 326 self.write_number(common) 327 self.write_string(suffix) 328 329 # Write the offset delta. 330 331 self.write_number(offset - self.last_offset) 332 333 # Write the frequency. 334 335 self.write_number(frequency) 336 337 self.last_term = term 338 self.last_offset = offset 339 340 return self.f.tell() 341 342 class TermReader(FileReader): 343 344 "Reading term information from files." 345 346 def reset(self): 347 self.last_term = "" 348 self.last_offset = 0 349 350 def read_term(self): 351 352 """ 353 Read a term, its position file offset, and its frequency from the term 354 information file. 355 """ 356 357 # Read the prefix length and term suffix. 358 359 common = self.read_number() 360 suffix = self.read_string() 361 362 self.last_term = self.last_term[:common] + suffix 363 364 # Read the offset delta. 365 366 self.last_offset += self.read_number() 367 368 # Read the frequency. 369 370 frequency = self.read_number() 371 372 return self.last_term, self.last_offset, frequency 373 374 def go_to_term(self, term, offset, info_offset): 375 376 """ 377 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 378 permits the scanning for later terms from the specified term. 379 """ 380 381 self.f.seek(info_offset) 382 self.last_term = term 383 self.last_offset = offset 384 385 class TermIndexWriter(TermWriter): 386 387 "Writing term dictionary index details to files." 388 389 def reset(self): 390 TermWriter.reset(self) 391 self.last_info_offset = 0 392 393 def write_term(self, term, offset, frequency, info_offset): 394 395 """ 396 Write the given 'term', its position file 'offset', and its 'frequency' 397 to the term dictionary index file, along with the 'info_offset' in the 398 term information file. 399 """ 400 401 TermWriter.write_term(self, term, offset, frequency) 402 403 # Write the information file offset delta. 404 405 self.write_number(info_offset - self.last_info_offset) 406 self.last_info_offset = info_offset 407 408 class TermIndexReader(TermReader): 409 410 "Reading term dictionary index details from files." 411 412 def reset(self): 413 TermReader.reset(self) 414 self.last_info_offset = 0 415 416 def read_term(self): 417 418 """ 419 Read a term, its position file offset, its frequency, and its term 420 information file offset from the term dictionary index file. 421 """ 422 423 term, offset, frequency = TermReader.read_term(self) 424 425 # Read the offset delta. 426 427 self.last_info_offset += self.read_number() 428 429 return term, offset, frequency, self.last_info_offset 430 431 class TermDictionaryWriter: 432 433 "Writing term dictionaries." 434 435 def __init__(self, info_writer, index_writer, position_writer, interval): 436 self.info_writer = info_writer 437 self.index_writer = index_writer 438 self.position_writer = position_writer 439 self.interval = interval 440 self.entry = 0 441 442 def _write_term(self, term, offset, frequency): 443 444 """ 445 Write the given 'term', its position file 'offset', and its 'frequency' 446 to the term information file and optionally to the index, making a 447 dictionary entry. 448 """ 449 450 info_offset = self.info_writer.write_term(term, offset, frequency) 451 452 if self.entry % self.interval == 0: 453 self.index_writer.write_term(term, offset, frequency, info_offset) 454 455 self.entry += 1 456 457 def write_term_positions(self, term, doc_positions): 458 459 """ 460 Write the given 'term' and the 'doc_positions' recording the documents 461 and positions at which the term is found. 462 """ 463 464 offset, frequency = self.position_writer.write_all_positions(doc_positions) 465 self._write_term(term, offset, frequency) 466 467 def close(self): 468 self.info_writer.close() 469 self.index_writer.close() 470 self.position_writer.close() 471 472 class TermDictionaryReader: 473 474 "Reading term dictionaries." 475 476 def __init__(self, info_reader, index_reader, position_reader): 477 self.info_reader = info_reader 478 self.index_reader = index_reader 479 self.position_reader = position_reader 480 481 self.terms = [] 482 try: 483 while 1: 484 self.terms.append(self.index_reader.read_term()) 485 except EOFError: 486 pass 487 488 # Large numbers for ordering purposes. 489 490 self.max_offset = self.terms[-1][1] 491 self.max_info_offset = self.terms[-1][2] 492 493 def _find_term(self, term): 494 495 """ 496 Find the position file offset and frequency of 'term' from the term 497 dictionary. 498 """ 499 500 i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1 501 502 # Get the entry position providing the term or one preceding it. 503 504 if i == -1: 505 return None 506 507 found_term, offset, frequency, info_offset = self.terms[i] 508 509 # Where the term is found immediately, return the offset. 510 511 if term == found_term: 512 return offset, frequency 513 514 # Otherwise, seek past the index term's entry in the information file 515 # and scan for the desired term. 516 517 else: 518 self.info_reader.go_to_term(found_term, offset, info_offset) 519 try: 520 while term > found_term: 521 found_term, offset, frequency = self.info_reader.read_term() 522 except EOFError: 523 pass 524 525 # If the term is found, return the offset and frequency. 526 527 if term == found_term: 528 return offset, frequency 529 else: 530 return None 531 532 def find_positions(self, term): 533 534 "Return the documents and positions at which the given 'term' is found." 535 536 t = self._find_term(term) 537 if t is None: 538 return None 539 else: 540 offset, frequency = t 541 return self.position_reader.read_all_positions(offset) 542 543 def get_frequency(self, term): 544 545 "Return the frequency of the given 'term'." 546 547 t = self._find_term(term) 548 if t is None: 549 return None 550 else: 551 offset, frequency = t 552 return frequency 553 554 def close(self): 555 self.info_reader.close() 556 self.index_reader.close() 557 self.position_reader.close() 558 559 # Specific classes for storing document information. 560 561 class FieldWriter(FileWriter): 562 563 "Writing field data to files." 564 565 def reset(self): 566 self.last_docnum = 0 567 568 def write_fields(self, docnum, fields): 569 570 """ 571 Write for the given 'docnum', a list of 'fields' (strings representing 572 field values). Return the offset at which the fields are stored. 573 """ 574 575 offset = self.f.tell() 576 577 # Write the document number delta. 578 579 self.write_number(docnum - self.last_docnum) 580 581 # Write the number of fields. 582 583 self.write_number(len(fields)) 584 585 # Write the fields themselves. 586 587 for field in fields: 588 self.write_string(field, 1) # compress 589 590 self.last_docnum = docnum 591 return offset 592 593 class FieldReader(FileReader): 594 595 "Reading field data from files." 596 597 def reset(self): 598 self.last_docnum = 0 599 600 def read_fields(self): 601 602 """ 603 Read fields from the file, returning a tuple containing the document 604 number and a list of field values. 605 """ 606 607 # Read the document number. 608 609 self.last_docnum += self.read_number() 610 611 # Read the number of fields. 612 613 nfields = self.read_number() 614 615 # Collect the fields. 616 617 fields = [] 618 i = 0 619 620 while i < nfields: 621 fields.append(self.read_string(1)) # decompress 622 i += 1 623 624 return self.last_docnum, fields 625 626 def read_document_fields(self, docnum, offset): 627 628 """ 629 Read fields for 'docnum' at the given 'offset'. This permits the 630 retrieval of details for the specified document, as well as scanning for 631 later documents. 632 """ 633 634 self.f.seek(offset) 635 bad_docnum, fields = self.read_fields() 636 self.last_docnum = docnum 637 return docnum, fields 638 639 class FieldIndexWriter(FileWriter): 640 641 "Writing field index details to files." 642 643 def reset(self): 644 self.last_docnum = 0 645 self.last_offset = 0 646 647 def write_document(self, docnum, offset): 648 649 """ 650 Write for the given 'docnum', the 'offset' at which the fields for the 651 document are stored in the fields file. 652 """ 653 654 # Write the document number and offset deltas. 655 656 self.write_number(docnum - self.last_docnum) 657 self.write_number(offset - self.last_offset) 658 659 self.last_docnum = docnum 660 self.last_offset = offset 661 662 class FieldIndexReader(FileReader): 663 664 "Reading field index details from files." 665 666 def reset(self): 667 self.last_docnum = 0 668 self.last_offset = 0 669 670 def read_document(self): 671 672 "Read a document number and field file offset." 673 674 # Read the document number delta and offset. 675 676 self.last_docnum += self.read_number() 677 self.last_offset += self.read_number() 678 679 return self.last_docnum, self.last_offset 680 681 class FieldDictionaryWriter: 682 683 "Writing field dictionary details." 684 685 def __init__(self, field_writer, field_index_writer, interval): 686 self.field_writer = field_writer 687 self.field_index_writer = field_index_writer 688 self.interval = interval 689 self.entry = 0 690 691 def write_fields(self, docnum, fields): 692 693 "Write details of the document with the given 'docnum' and 'fields'." 694 695 offset = self.field_writer.write_fields(docnum, fields) 696 697 if self.entry % self.interval == 0: 698 self.field_index_writer.write_document(docnum, offset) 699 700 self.entry += 1 701 702 def close(self): 703 self.field_writer.close() 704 self.field_index_writer.close() 705 706 class FieldDictionaryReader: 707 708 "Reading field dictionary details." 709 710 def __init__(self, field_reader, field_index_reader): 711 self.field_reader = field_reader 712 self.field_index_reader = field_index_reader 713 714 self.docs = [] 715 try: 716 while 1: 717 self.docs.append(self.field_index_reader.read_document()) 718 except EOFError: 719 pass 720 721 # Large numbers for ordering purposes. 722 723 self.max_offset = self.docs[-1][1] 724 725 def read_fields(self, docnum): 726 727 "Read the fields of the document with the given 'docnum'." 728 729 i = bisect_right(self.docs, (docnum, self.max_offset)) - 1 730 731 # Get the entry position providing the term or one preceding it. 732 733 if i == -1: 734 return None 735 736 found_docnum, offset = self.docs[i] 737 738 # Read from the fields file. 739 740 found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset) 741 742 # Scan for the document, if necessary. 743 744 try: 745 while docnum > found_docnum: 746 found_docnum, fields = self.field_reader.read_fields() 747 except EOFError: 748 pass 749 750 # If the document is found, return the fields. 751 752 if docnum == found_docnum: 753 return fields 754 else: 755 return None 756 757 def close(self): 758 self.field_reader.close() 759 self.field_index_reader.close() 760 761 # High-level classes. 762 763 class IndexWriter: 764 765 """ 766 Building term information and writing it to the term and field dictionaries. 767 """ 768 769 def __init__(self, dict_writer, field_dict_writer): 770 self.dict_writer = dict_writer 771 self.field_dict_writer = field_dict_writer 772 self.terms = {} 773 self.docs = {} 774 775 def add_position(self, term, docnum, position): 776 777 """ 778 Add a position entry for the given 'term' in the document with the given 779 'docnum', indicating the given 'position'. 780 """ 781 782 if not self.terms.has_key(term): 783 doc_positions = self.terms[term] = {} 784 else: 785 doc_positions = self.terms[term] 786 787 if not doc_positions.has_key(docnum): 788 doc = doc_positions[docnum] = [] 789 else: 790 doc = doc_positions[docnum] 791 792 doc.append(position) 793 794 def add_fields(self, docnum, fields): 795 796 "Add for the document with the given 'docnum' a list of 'fields'." 797 798 if not self.docs.has_key(docnum): 799 doc_fields = self.docs[docnum] = fields 800 else: 801 self.docs[docnum] += fields 802 803 def close(self): 804 if self.dict_writer is None: 805 return 806 807 # Get the terms in order. 808 809 terms = self.terms.items() 810 terms.sort() 811 812 for term, doc_positions in terms: 813 doc_positions = doc_positions.items() 814 doc_positions.sort() 815 self.dict_writer.write_term_positions(term, doc_positions) 816 817 self.dict_writer.close() 818 self.dict_writer = None 819 820 # Get the documents in order. 821 822 docs = self.docs.items() 823 docs.sort() 824 825 for docnum, fields in docs: 826 self.field_dict_writer.write_fields(docnum, fields) 827 828 self.field_dict_writer.close() 829 self.field_dict_writer = None 830 831 class IndexReader: 832 833 "Accessing the term and field dictionaries." 834 835 def __init__(self, dict_reader, field_dict_reader): 836 self.dict_reader = dict_reader 837 self.field_dict_reader = field_dict_reader 838 839 def find_positions(self, term): 840 return self.dict_reader.find_positions(term) 841 842 def get_frequency(self, term): 843 return self.dict_reader.get_frequency(term) 844 845 def get_fields(self, docnum): 846 return self.field_dict_reader.read_fields(docnum) 847 848 def close(self): 849 self.dict_reader.close() 850 self.field_dict_reader.close() 851 852 class Index: 853 854 "An inverted index solution encapsulating the various components." 855 856 def __init__(self, pathname): 857 self.pathname = pathname 858 self.reader = None 859 self.writer = None 860 861 def get_writer(self, interval=INTERVAL): 862 863 "Return a writer, optionally using the given indexing 'interval'." 864 865 if not exists(self.pathname): 866 mkdir(self.pathname) 867 868 tdf = open(join(self.pathname, "terms"), "wb") 869 info_writer = TermWriter(tdf) 870 871 tdif = open(join(self.pathname, "index"), "wb") 872 index_writer = TermIndexWriter(tdif) 873 874 tpf = open(join(self.pathname, "positions"), "wb") 875 positions_writer = PositionWriter(tpf) 876 877 dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval) 878 879 ff = open(join(self.pathname, "fields"), "wb") 880 field_writer = FieldWriter(ff) 881 882 fif = open(join(self.pathname, "fields_index"), "wb") 883 field_index_writer = FieldIndexWriter(fif) 884 885 field_dict_writer = FieldDictionaryWriter(field_writer, field_index_writer, interval) 886 887 self.writer = IndexWriter(dict_writer, field_dict_writer) 888 return self.writer 889 890 def get_reader(self): 891 892 "Return a reader for the index." 893 894 if not exists(self.pathname): 895 raise OSError, "Index path %r does not exist." % self.pathname 896 897 tdf = open(join(self.pathname, "terms"), "rb") 898 info_reader = TermReader(tdf) 899 900 tdif = open(join(self.pathname, "index"), "rb") 901 index_reader = TermIndexReader(tdif) 902 903 tpf = open(join(self.pathname, "positions"), "rb") 904 positions_reader = PositionReader(tpf) 905 906 dict_reader = TermDictionaryReader(info_reader, index_reader, positions_reader) 907 908 ff = open(join(self.pathname, "fields"), "rb") 909 field_reader = FieldReader(ff) 910 911 fif = open(join(self.pathname, "fields_index"), "rb") 912 field_index_reader = FieldIndexReader(fif) 913 914 field_dict_reader = FieldDictionaryReader(field_reader, field_index_reader) 915 916 self.reader = IndexReader(dict_reader, field_dict_reader) 917 return self.reader 918 919 def close(self): 920 if self.reader is not None: 921 self.reader.close() 922 self.reader = None 923 if self.writer is not None: 924 self.writer.close() 925 self.writer = None 926 927 # vim: tabstop=4 expandtab shiftwidth=4