1 #!/usr/bin/env python 2 3 """ 4 A simple (and sane) text indexing library. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from os import listdir, mkdir # index and partition discovery 22 from os.path import exists, join 23 from os.path import commonprefix # to find common string prefixes 24 from bisect import bisect_right # to find terms in the dictionary index 25 from bisect import insort_right # to maintain a sorted list of data for merging 26 import bz2, zlib # for field compression 27 28 # Constants. 29 30 INTERVAL = 100 31 FLUSH_INTERVAL = 1000000 32 33 compressors = [("b", bz2.compress), ("z", zlib.compress)] 34 decompressors = {"b" : bz2.decompress, "z" : zlib.decompress} 35 36 # Foundation classes. 37 38 class File: 39 40 "A basic file abstraction." 41 42 def __init__(self, f): 43 self.f = f 44 self.reset() 45 46 def reset(self): 47 48 "To be used to reset the state of the reader or writer between records." 49 50 pass 51 52 def rewind(self): 53 self.f.seek(0) 54 55 def close(self): 56 if self.f is not None: 57 self.f.close() 58 self.f = None 59 60 class FileWriter(File): 61 62 "Writing basic data types to files." 63 64 def write_number(self, number): 65 66 "Write 'number' to the file using a variable length encoding." 67 68 # Negative numbers are not supported. 69 70 if number < 0: 71 raise ValueError, "Number %r is negative." % number 72 73 # Special case: one byte containing zero. 74 75 elif number == 0: 76 self.f.write(chr(0)) 77 return 78 79 # Write the number from least to most significant digits. 80 81 bytes = [] 82 83 while number != 0: 84 lsd = number & 127 85 number = number >> 7 86 if number != 0: 87 lsd |= 128 88 bytes.append(chr(lsd)) 89 90 record = "".join(bytes) 91 self.f.write(record) 92 93 def write_string(self, s, compress=0): 94 95 """ 96 Write 's' to the file, recording its length and compressing the string 97 if 'compress' is set to a true value. 98 """ 99 100 # Convert Unicode objects to strings. 101 102 if isinstance(s, unicode): 103 s = s.encode("utf-8") 104 105 # Compress the string if requested. 106 107 if compress: 108 for flag, fn in compressors: 109 cs = fn(s) 110 111 # Take the first string shorter than the original. 112 113 if len(cs) < len(s): 114 s = cs 115 break 116 else: 117 flag = "-" 118 119 # Record whether compression was used. 120 121 self.f.write(flag) 122 123 # Write the length of the data before the data itself. 124 125 length = len(s) 126 self.write_number(length) 127 self.f.write(s) 128 129 class FileReader(File): 130 131 "Reading basic data types from files." 132 133 def read_number(self): 134 135 "Read a number from the file." 136 137 # Read each byte, adding it to the number. 138 139 shift = 0 140 number = 0 141 more = 1 142 143 while more: 144 byte = self.f.read(1) 145 if not byte: 146 raise EOFError 147 148 csd = ord(byte) 149 more = csd & 128 != 0 150 if more: 151 csd &= 127 152 number += (csd << shift) 153 shift += 7 154 155 return number 156 157 def read_string(self, decompress=0): 158 159 """ 160 Read a string from the file, decompressing the stored data if 161 'decompress' is set to a true value. 162 """ 163 164 # Decompress the data if requested. 165 166 if decompress: 167 flag = self.f.read(1) 168 else: 169 flag = "-" 170 171 length = self.read_number() 172 s = self.f.read(length) 173 174 # Perform decompression if applicable. 175 176 if flag != "-": 177 fn = decompressors[flag] 178 s = fn(s) 179 180 # Convert strings to Unicode objects. 181 182 return unicode(s, "utf-8") 183 184 # Specific classes for storing term and position information. 185 186 class PositionWriter(FileWriter): 187 188 "Writing position information to files." 189 190 def reset(self): 191 self.last_docnum = 0 192 193 def write_positions(self, docnum, positions): 194 195 "Write for the document 'docnum' the given 'positions'." 196 197 if docnum < self.last_docnum: 198 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 199 200 # Write the document number delta. 201 202 self.write_number(docnum - self.last_docnum) 203 204 # Write the number of positions. 205 206 self.write_number(len(positions)) 207 208 # Make sure that the positions are sorted. 209 210 positions.sort() 211 212 # Write the position deltas. 213 214 last = 0 215 for position in positions: 216 pos = position - last 217 self.write_number(pos) 218 last = position 219 220 self.last_docnum = docnum 221 222 def write_term_positions(self, doc_positions): 223 224 """ 225 Write all 'doc_positions' - a collection of tuples of the form (document 226 number, position list) - to the file, returning a tuple containing the 227 offset at which they were stored together with the frequency (number of 228 positions) for the term involved. 229 """ 230 231 # Reset the writer and record the current file offset. 232 233 self.reset() 234 offset = self.f.tell() 235 236 # Write the number of documents. 237 238 self.write_number(len(doc_positions)) 239 240 # Write the positions. 241 242 frequency = 0 243 244 for docnum, positions in doc_positions: 245 self.write_positions(docnum, positions) 246 frequency += len(positions) 247 248 return offset, frequency 249 250 class PositionReader(FileReader): 251 252 "Reading position information from files." 253 254 def reset(self): 255 self.last_docnum = 0 256 257 def read_positions(self): 258 259 "Read positions, returning a document number and a list of positions." 260 261 # Read the document number delta and add it to the last number. 262 263 self.last_docnum += self.read_number() 264 265 # Read the number of positions. 266 267 npositions = self.read_number() 268 269 # Read the position deltas, adding each previous position to get the 270 # appropriate collection of absolute positions. 271 272 i = 0 273 last = 0 274 positions = [] 275 276 while i < npositions: 277 last += self.read_number() 278 positions.append(last) 279 i += 1 280 281 return self.last_docnum, positions 282 283 def read_term_positions(self, offset): 284 285 """ 286 Read all positions from 'offset', seeking to that position in the file 287 before reading. 288 """ 289 290 self.reset() 291 self.f.seek(offset) 292 293 # Read the number of documents. 294 295 ndocuments = self.read_number() 296 297 # Read all records. 298 299 i = 0 300 doc_positions = [] 301 302 while i < ndocuments: 303 doc_positions.append(self.read_positions()) 304 i += 1 305 306 return doc_positions 307 308 class TermWriter(FileWriter): 309 310 "Writing term information to files." 311 312 def reset(self): 313 self.last_term = "" 314 self.last_offset = 0 315 316 def write_term(self, term, offset, frequency): 317 318 """ 319 Write the given 'term', its position file 'offset', and its 'frequency' 320 to the term information file. Return the offset after the term 321 information was written to the file. 322 """ 323 324 # Too long terms are not currently supported. 325 326 if len(term) > 255: 327 raise ValueError, "Term %r is too long." % term 328 329 # Write the prefix length and term suffix. 330 331 common = len(commonprefix([self.last_term, term])) 332 suffix = term[common:] 333 334 self.write_number(common) 335 self.write_string(suffix) 336 337 # Write the offset delta. 338 339 self.write_number(offset - self.last_offset) 340 341 # Write the frequency. 342 343 self.write_number(frequency) 344 345 self.last_term = term 346 self.last_offset = offset 347 348 return self.f.tell() 349 350 class TermReader(FileReader): 351 352 "Reading term information from files." 353 354 def reset(self): 355 self.last_term = "" 356 self.last_offset = 0 357 358 def rewind(self): 359 self.reset() 360 FileReader.rewind(self) 361 362 def read_term(self): 363 364 """ 365 Read a term, its position file offset, and its frequency from the term 366 information file. 367 """ 368 369 # Read the prefix length and term suffix. 370 371 common = self.read_number() 372 suffix = self.read_string() 373 374 self.last_term = self.last_term[:common] + suffix 375 376 # Read the offset delta. 377 378 self.last_offset += self.read_number() 379 380 # Read the frequency. 381 382 frequency = self.read_number() 383 384 return self.last_term, self.last_offset, frequency 385 386 def go_to_term(self, term, offset, info_offset): 387 388 """ 389 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 390 permits the scanning for later terms from the specified term. 391 """ 392 393 self.f.seek(info_offset) 394 self.last_term = term 395 self.last_offset = offset 396 397 class TermIndexWriter(TermWriter): 398 399 "Writing term dictionary index details to files." 400 401 def reset(self): 402 TermWriter.reset(self) 403 self.last_info_offset = 0 404 405 def write_term(self, term, offset, frequency, info_offset): 406 407 """ 408 Write the given 'term', its position file 'offset', and its 'frequency' 409 to the term dictionary index file, along with the 'info_offset' in the 410 term information file. 411 """ 412 413 TermWriter.write_term(self, term, offset, frequency) 414 415 # Write the information file offset delta. 416 417 self.write_number(info_offset - self.last_info_offset) 418 self.last_info_offset = info_offset 419 420 class TermIndexReader(TermReader): 421 422 "Reading term dictionary index details from files." 423 424 def reset(self): 425 TermReader.reset(self) 426 self.last_info_offset = 0 427 428 def read_term(self): 429 430 """ 431 Read a term, its position file offset, its frequency, and its term 432 information file offset from the term dictionary index file. 433 """ 434 435 term, offset, frequency = TermReader.read_term(self) 436 437 # Read the offset delta. 438 439 self.last_info_offset += self.read_number() 440 441 return term, offset, frequency, self.last_info_offset 442 443 class TermDictionaryWriter: 444 445 "Writing term dictionaries." 446 447 def __init__(self, info_writer, index_writer, position_writer, interval): 448 self.info_writer = info_writer 449 self.index_writer = index_writer 450 self.position_writer = position_writer 451 self.interval = interval 452 self.entry = 0 453 454 def _write_term(self, term, offset, frequency): 455 456 """ 457 Write the given 'term', its position file 'offset', and its 'frequency' 458 to the term information file and optionally to the index, making a 459 dictionary entry. 460 """ 461 462 info_offset = self.info_writer.write_term(term, offset, frequency) 463 464 if self.entry % self.interval == 0: 465 self.index_writer.write_term(term, offset, frequency, info_offset) 466 467 self.entry += 1 468 469 def write_term_positions(self, term, doc_positions): 470 471 """ 472 Write the given 'term' and the 'doc_positions' recording the documents 473 and positions at which the term is found. 474 """ 475 476 offset, frequency = self.position_writer.write_term_positions(doc_positions) 477 self._write_term(term, offset, frequency) 478 479 def close(self): 480 self.info_writer.close() 481 self.index_writer.close() 482 self.position_writer.close() 483 484 class TermDictionaryReader: 485 486 "Reading term dictionaries." 487 488 def __init__(self, info_reader, index_reader, position_reader): 489 self.info_reader = info_reader 490 self.index_reader = index_reader 491 self.position_reader = position_reader 492 493 self.terms = [] 494 try: 495 while 1: 496 self.terms.append(self.index_reader.read_term()) 497 except EOFError: 498 pass 499 500 # Large numbers for ordering purposes. 501 502 self.max_offset = self.terms[-1][1] 503 self.max_info_offset = self.terms[-1][2] 504 505 def _find_term(self, term): 506 507 """ 508 Find the position file offset and frequency of 'term' from the term 509 dictionary. 510 """ 511 512 i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1 513 514 # Get the entry position providing the term or one preceding it. 515 516 if i == -1: 517 return None 518 519 found_term, offset, frequency, info_offset = self.terms[i] 520 521 # Where the term is found immediately, return the offset. 522 523 if term == found_term: 524 return offset, frequency 525 526 # Otherwise, seek past the index term's entry in the information file 527 # and scan for the desired term. 528 529 else: 530 self.info_reader.go_to_term(found_term, offset, info_offset) 531 try: 532 while term > found_term: 533 found_term, offset, frequency = self.info_reader.read_term() 534 except EOFError: 535 pass 536 537 # If the term is found, return the offset and frequency. 538 539 if term == found_term: 540 return offset, frequency 541 else: 542 return None 543 544 def rewind(self): 545 self.info_reader.rewind() 546 547 def _get_positions(self, offset): 548 return self.position_reader.read_term_positions(offset) 549 550 def read_term(self): 551 552 """ 553 Return the next term, its frequency and the documents and positions at 554 which the term is found. 555 """ 556 557 term, offset, frequency = self.info_reader.read_term() 558 positions = self._get_positions(offset) 559 return term, frequency, positions 560 561 def find_positions(self, term): 562 563 "Return the documents and positions at which the given 'term' is found." 564 565 t = self._find_term(term) 566 if t is None: 567 return None 568 else: 569 offset, frequency = t 570 return self._get_positions(offset) 571 572 def get_frequency(self, term): 573 574 "Return the frequency of the given 'term'." 575 576 t = self._find_term(term) 577 if t is None: 578 return None 579 else: 580 offset, frequency = t 581 return frequency 582 583 def close(self): 584 self.info_reader.close() 585 self.index_reader.close() 586 self.position_reader.close() 587 588 # Specific classes for storing document information. 589 590 class FieldWriter(FileWriter): 591 592 "Writing field data to files." 593 594 def reset(self): 595 self.last_docnum = 0 596 597 def write_fields(self, docnum, fields): 598 599 """ 600 Write for the given 'docnum', a list of 'fields' (strings representing 601 field values). Return the offset at which the fields are stored. 602 """ 603 604 offset = self.f.tell() 605 606 # Write the document number delta. 607 608 self.write_number(docnum - self.last_docnum) 609 610 # Write the number of fields. 611 612 self.write_number(len(fields)) 613 614 # Write the fields themselves. 615 616 for field in fields: 617 self.write_string(field, 1) # compress 618 619 self.last_docnum = docnum 620 return offset 621 622 class FieldReader(FileReader): 623 624 "Reading field data from files." 625 626 def reset(self): 627 self.last_docnum = 0 628 629 def read_fields(self): 630 631 """ 632 Read fields from the file, returning a tuple containing the document 633 number and a list of field values. 634 """ 635 636 # Read the document number. 637 638 self.last_docnum += self.read_number() 639 640 # Read the number of fields. 641 642 nfields = self.read_number() 643 644 # Collect the fields. 645 646 fields = [] 647 i = 0 648 649 while i < nfields: 650 fields.append(self.read_string(1)) # decompress 651 i += 1 652 653 return self.last_docnum, fields 654 655 def read_document_fields(self, docnum, offset): 656 657 """ 658 Read fields for 'docnum' at the given 'offset'. This permits the 659 retrieval of details for the specified document, as well as scanning for 660 later documents. 661 """ 662 663 self.f.seek(offset) 664 bad_docnum, fields = self.read_fields() 665 self.last_docnum = docnum 666 return docnum, fields 667 668 class FieldIndexWriter(FileWriter): 669 670 "Writing field index details to files." 671 672 def reset(self): 673 self.last_docnum = 0 674 self.last_offset = 0 675 676 def write_document(self, docnum, offset): 677 678 """ 679 Write for the given 'docnum', the 'offset' at which the fields for the 680 document are stored in the fields file. 681 """ 682 683 # Write the document number and offset deltas. 684 685 self.write_number(docnum - self.last_docnum) 686 self.write_number(offset - self.last_offset) 687 688 self.last_docnum = docnum 689 self.last_offset = offset 690 691 class FieldIndexReader(FileReader): 692 693 "Reading field index details from files." 694 695 def reset(self): 696 self.last_docnum = 0 697 self.last_offset = 0 698 699 def read_document(self): 700 701 "Read a document number and field file offset." 702 703 # Read the document number delta and offset. 704 705 self.last_docnum += self.read_number() 706 self.last_offset += self.read_number() 707 708 return self.last_docnum, self.last_offset 709 710 class FieldDictionaryWriter: 711 712 "Writing field dictionary details." 713 714 def __init__(self, field_writer, field_index_writer, interval): 715 self.field_writer = field_writer 716 self.field_index_writer = field_index_writer 717 self.interval = interval 718 self.entry = 0 719 720 def write_fields(self, docnum, fields): 721 722 "Write details of the document with the given 'docnum' and 'fields'." 723 724 offset = self.field_writer.write_fields(docnum, fields) 725 726 if self.entry % self.interval == 0: 727 self.field_index_writer.write_document(docnum, offset) 728 729 self.entry += 1 730 731 def close(self): 732 self.field_writer.close() 733 self.field_index_writer.close() 734 735 class FieldDictionaryReader: 736 737 "Reading field dictionary details." 738 739 def __init__(self, field_reader, field_index_reader): 740 self.field_reader = field_reader 741 self.field_index_reader = field_index_reader 742 743 self.docs = [] 744 try: 745 while 1: 746 self.docs.append(self.field_index_reader.read_document()) 747 except EOFError: 748 pass 749 750 # Large numbers for ordering purposes. 751 752 self.max_offset = self.docs[-1][1] 753 754 def read_fields(self, docnum): 755 756 "Read the fields of the document with the given 'docnum'." 757 758 i = bisect_right(self.docs, (docnum, self.max_offset)) - 1 759 760 # Get the entry position providing the term or one preceding it. 761 762 if i == -1: 763 return None 764 765 found_docnum, offset = self.docs[i] 766 767 # Read from the fields file. 768 769 found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset) 770 771 # Scan for the document, if necessary. 772 773 try: 774 while docnum > found_docnum: 775 found_docnum, fields = self.field_reader.read_fields() 776 except EOFError: 777 pass 778 779 # If the document is found, return the fields. 780 781 if docnum == found_docnum: 782 return fields 783 else: 784 return None 785 786 def close(self): 787 self.field_reader.close() 788 self.field_index_reader.close() 789 790 # Dictionary merging classes. 791 792 class TermDictionaryMerger: 793 794 "Merge position files." 795 796 def __init__(self, writer, readers): 797 self.writer = writer 798 self.readers = readers 799 800 def merge(self): 801 entries = [] 802 803 # Get the first entries from the readers. 804 805 for partition, reader in enumerate(self.readers): 806 reader.rewind() 807 808 try: 809 term, frequency, positions = reader.read_term() 810 insort_right(entries, (term, positions, partition)) 811 except EOFError: 812 pass 813 814 # While entries are available, write them out in order, merging where 815 # appropriate. 816 817 while entries: 818 term, doc_positions, partition = entries[0] 819 to_update = [partition] 820 821 nentries = len(entries) 822 i = 1 823 824 # Find other entries for the term. 825 826 while i < nentries: 827 other_term, other_doc_positions, other_partition = entries[i] 828 829 # For such entries, merge the positions. 830 831 if other_term == term: 832 doc_positions += other_doc_positions 833 to_update.append(other_partition) 834 i += 1 835 else: 836 break 837 838 # Write the combined term details. 839 840 doc_positions.sort() 841 self.writer.write_term_positions(term, doc_positions) 842 843 # Update the entries from the affected readers. 844 845 del entries[:i] 846 847 for partition in to_update: 848 try: 849 term, frequency, positions = self_readers[partition].read_term() 850 insort_right(entries, (term, positions, partition)) 851 except EOFError: 852 pass 853 854 # High-level classes. 855 856 class IndexWriter: 857 858 """ 859 Building term information and writing it to the term and field dictionaries. 860 """ 861 862 def __init__(self, pathname, interval, flush_interval): 863 self.pathname = pathname 864 self.interval = interval 865 self.flush_interval = flush_interval 866 867 self.dict_partition = 0 868 self.field_dict_partition = 0 869 870 self.terms = {} 871 self.docs = {} 872 873 self.position_counter = 0 874 self.field_counter = 0 875 876 def add_position(self, term, docnum, position): 877 878 """ 879 Add a position entry for the given 'term' in the document with the given 880 'docnum', indicating the given 'position'. 881 """ 882 883 if not self.terms.has_key(term): 884 doc_positions = self.terms[term] = {} 885 else: 886 doc_positions = self.terms[term] 887 888 if not doc_positions.has_key(docnum): 889 doc = doc_positions[docnum] = [] 890 else: 891 doc = doc_positions[docnum] 892 893 doc.append(position) 894 895 self.position_counter += 1 896 if self.flush_threshold and self.position_counter >= self.flush_threshold: 897 self.flush_terms() 898 899 def add_fields(self, docnum, fields): 900 901 "Add for the document with the given 'docnum' a list of 'fields'." 902 903 if not self.docs.has_key(docnum): 904 doc_fields = self.docs[docnum] = fields 905 else: 906 self.docs[docnum] += fields 907 908 self.field_counter += len(fields) 909 if self.flush_threshold and self.field_counter >= self.flush_threshold: 910 self.flush_fields() 911 912 def get_term_writer(self): 913 914 "Return a term dictionary writer for the current partition." 915 916 tdf = open(join(self.pathname, "terms-%d" % self.dict_partition), "wb") 917 info_writer = TermWriter(tdf) 918 919 tdif = open(join(self.pathname, "index-%d" % self.dict_partition), "wb") 920 index_writer = TermIndexWriter(tdif) 921 922 tpf = open(join(self.pathname, "positions-%d" % self.dict_partition), "wb") 923 positions_writer = PositionWriter(tpf) 924 925 return TermDictionaryWriter(info_writer, index_writer, positions_writer, self.interval) 926 927 def get_field_writer(self): 928 929 "Return a field dictionary writer for the current partition." 930 931 ff = open(join(self.pathname, "fields-%d" % self.field_dict_partition), "wb") 932 field_writer = FieldWriter(ff) 933 934 fif = open(join(self.pathname, "fields_index-%d" % self.field_dict_partition), "wb") 935 field_index_writer = FieldIndexWriter(fif) 936 937 return FieldDictionaryWriter(field_writer, field_index_writer, self.interval) 938 939 def flush_terms(self): 940 941 "Flush terms into the current term dictionary partition." 942 943 # Get the terms in order. 944 945 terms = self.terms.items() 946 terms.sort() 947 948 dict_writer = self.get_term_writer() 949 950 for term, doc_positions in terms: 951 doc_positions = doc_positions.items() 952 doc_positions.sort() 953 dict_writer.write_term_positions(term, doc_positions) 954 955 dict_writer.close() 956 957 self.terms = {} 958 self.dict_partition += 1 959 960 def flush_fields(self): 961 962 "Flush fields into the current term dictionary partition." 963 964 # Get the documents in order. 965 966 docs = self.docs.items() 967 docs.sort() 968 969 field_dict_writer = self.get_field_writer() 970 971 for docnum, fields in docs: 972 field_dict_writer.write_fields(docnum, fields) 973 974 field_dict_writer.close() 975 976 self.docs = {} 977 self.field_dict_partition += 1 978 979 def close(self): 980 if self.terms: 981 self.flush_terms() 982 if self.docs: 983 self.flush_fields() 984 985 class IndexReader: 986 987 "Accessing the term and field dictionaries." 988 989 def __init__(self, pathname, partition=0): 990 self.pathname = pathname 991 self.dict_reader = self.get_term_reader(partition) 992 self.field_dict_reader = self.get_field_reader(partition) 993 994 def get_term_reader(self, partition): 995 tdf = open(join(self.pathname, "terms-%d" % partition), "rb") 996 info_reader = TermReader(tdf) 997 998 tdif = open(join(self.pathname, "index-%d" % partition), "rb") 999 index_reader = TermIndexReader(tdif) 1000 1001 tpf = open(join(self.pathname, "positions-%d" % partition), "rb") 1002 positions_reader = PositionReader(tpf) 1003 1004 return TermDictionaryReader(info_reader, index_reader, positions_reader) 1005 1006 def get_field_reader(self, partition): 1007 ff = open(join(self.pathname, "fields-%d" % partition), "rb") 1008 field_reader = FieldReader(ff) 1009 1010 fif = open(join(self.pathname, "fields_index-%d" % partition), "rb") 1011 field_index_reader = FieldIndexReader(fif) 1012 1013 return FieldDictionaryReader(field_reader, field_index_reader) 1014 1015 def find_positions(self, term): 1016 return self.dict_reader.find_positions(term) 1017 1018 def get_frequency(self, term): 1019 return self.dict_reader.get_frequency(term) 1020 1021 def get_fields(self, docnum): 1022 return self.field_dict_reader.read_fields(docnum) 1023 1024 def close(self): 1025 self.dict_reader.close() 1026 self.field_dict_reader.close() 1027 1028 class Index: 1029 1030 "An inverted index solution encapsulating the various components." 1031 1032 def __init__(self, pathname): 1033 self.pathname = pathname 1034 self.reader = None 1035 self.writer = None 1036 1037 def get_writer(self, interval=INTERVAL, flush_interval=FLUSH_INTERVAL): 1038 1039 """ 1040 Return a writer, optionally using the given indexing 'interval' and 1041 'flush_interval'. 1042 """ 1043 1044 if not exists(self.pathname): 1045 mkdir(self.pathname) 1046 1047 self.writer = IndexWriter(self.pathname, interval, flush_interval) 1048 return self.writer 1049 1050 def get_reader(self, partition=0): 1051 1052 "Return a reader for the index." 1053 1054 if not exists(self.pathname): 1055 raise OSError, "Index path %r does not exist." % self.pathname 1056 1057 self.reader = IndexReader(self.pathname, partition) 1058 return self.reader 1059 1060 def merge_terms(self): 1061 1062 "Merge term dictionaries." 1063 1064 readers = [] 1065 1066 for filename in os.listdir(self.pathname): 1067 if filename.startswith("terms-"): # 6 character prefix 1068 partition = int(filename[6:]) 1069 readers.append(self.get_reader(partition)) 1070 1071 # NOTE: Make a distinct new writer/index. 1072 1073 def close(self): 1074 if self.reader is not None: 1075 self.reader.close() 1076 self.reader = None 1077 if self.writer is not None: 1078 self.writer.close() 1079 self.writer = None 1080 1081 # vim: tabstop=4 expandtab shiftwidth=4