#!/usr/bin/env python

"""
A simple (and sane) text indexing library.

Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from os import listdir, mkdir       # index and partition discovery
from os.path import exists, join
from os.path import commonprefix    # to find common string prefixes
from bisect import bisect_right     # to find terms in the dictionary index
from bisect import insort_right     # to maintain a sorted list of data for merging
import bz2, zlib                    # for field compression

# Constants.

INTERVAL = 100
FLUSH_INTERVAL = 1000000

# Compression flags are single bytes stored in front of compressed strings.
# Byte literals keep the on-disk format identical while also working on
# interpreters where text and byte strings are distinct types.

compressors = [(b"b", bz2.compress), (b"z", zlib.compress)]
decompressors = {b"b" : bz2.decompress, b"z" : zlib.decompress}

# The text (Unicode) string type of the running interpreter.

try:
    _text_type = unicode
except NameError:
    _text_type = str

# Foundation classes.

class File:

    "A basic file abstraction wrapping the file object 'f'."

    def __init__(self, f):
        self.f = f
        self.reset()

    def reset(self):

        "To be used to reset the state of the reader or writer between records."

        pass

    def rewind(self):

        "Seek to the start of the file and reset the delta state."

        self.f.seek(0)
        self.reset()

    def close(self):

        "Close the underlying file; closing more than once is harmless."

        if self.f is not None:
            self.f.close()
            self.f = None

class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding: seven bits
        per byte, least significant group first, with the top bit of a byte
        set when more bytes follow. Raise ValueError for negative numbers.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Write the number from least to most significant digits. Zero is
        # naturally encoded as a single zero byte.

        encoded = bytearray()

        while 1:
            lsd = number & 127
            number >>= 7
            if number:
                encoded.append(lsd | 128)
            else:
                encoded.append(lsd)
                break

        self.f.write(bytes(encoded))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file, recording its length and compressing the string
        if 'compress' is set to a true value. Text strings are stored as UTF-8.
        """

        # Convert Unicode objects to byte strings.

        if isinstance(s, _text_type):
            s = s.encode("utf-8")

        # Compress the string if requested, taking the first result that is
        # shorter than the original, and record which method (if any) was used.

        if compress:
            flag = b"-"
            for candidate, fn in compressors:
                cs = fn(s)
                if len(cs) < len(s):
                    flag, s = candidate, cs
                    break

            self.f.write(flag)

        # Write the length of the data before the data itself.

        self.write_number(len(s))
        self.f.write(s)

class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a number written by FileWriter.write_number. Raise EOFError if the
        file ends before the number is complete.
        """

        shift = 0
        number = 0

        while 1:
            byte = self.f.read(1)
            if not byte:
                raise EOFError

            csd = ord(byte)
            number += (csd & 127) << shift
            shift += 7

            # The top bit indicates whether more bytes follow.

            if not csd & 128:
                return number

    def read_string(self, decompress=0):

        """
        Read a string from the file, decompressing the stored data if
        'decompress' is set to a true value. The result is a text string.
        Raise EOFError if the record is truncated.
        """

        # A compression flag precedes the data only for compressible strings.

        if decompress:
            flag = self.f.read(1)
        else:
            flag = b"-"

        length = self.read_number()
        s = self.f.read(length)

        if len(s) != length:
            raise EOFError

        # Perform decompression if applicable.

        if flag != b"-":
            s = decompressors[flag](s)

        # Convert byte strings to Unicode objects.

        return s.decode("utf-8")

# Specific classes for storing term and position information.

class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):
        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'. Document numbers
        are stored as deltas and so must not decrease between calls; ValueError
        is raised otherwise.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Write the document number delta and the number of positions.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(positions))

        # Write the position deltas from a sorted copy, leaving the caller's
        # collection untouched.

        last = 0
        for position in sorted(positions):
            self.write_number(position - last)
            last = position

        self.last_docnum = docnum

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file, returning a tuple containing the
        offset at which they were stored together with the frequency (number of
        positions) for the term involved.
        """

        # Reset the writer and record the current file offset.

        self.reset()
        offset = self.f.tell()

        # Write the number of documents, then each document's positions.

        self.write_number(len(doc_positions))

        frequency = 0

        for docnum, positions in doc_positions:
            self.write_positions(docnum, positions)
            frequency += len(positions)

        return offset, frequency

class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):
        self.last_docnum = 0

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        # Read the document number delta and add it to the last number.

        self.last_docnum += self.read_number()

        # Read the position deltas, accumulating absolute positions.

        npositions = self.read_number()

        last = 0
        positions = []

        for i in range(npositions):
            last += self.read_number()
            positions.append(last)

        return self.last_docnum, positions

    def read_term_positions(self, offset):

        """
        Read all positions from 'offset', seeking to that position in the file
        before reading. Return a list of (document number, positions) tuples.
        """

        self.reset()
        self.f.seek(offset)

        ndocuments = self.read_number()
        return [self.read_positions() for i in range(ndocuments)]

class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):
        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset, frequency):

        """
        Write the given 'term', its position file 'offset', and its 'frequency'
        to the term information file. Return the offset after the term
        information was written to the file.
        """

        # Work with text so that the shared prefix is counted in characters,
        # as the reader does, and never splits an encoded character.

        if isinstance(term, bytes):
            term = term.decode("utf-8")

        # Write the prefix length and term suffix.

        common = len(commonprefix([self.last_term, term]))

        self.write_number(common)
        self.write_string(term[common:])

        # Write the offset delta and the frequency.

        self.write_number(offset - self.last_offset)
        self.write_number(frequency)

        self.last_term = term
        self.last_offset = offset

        return self.f.tell()

class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):
        self.last_term = ""
        self.last_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, and its frequency from the term
        information file.
        """

        # Read the prefix length and term suffix.

        common = self.read_number()
        suffix = self.read_string()

        self.last_term = self.last_term[:common] + suffix

        # Read the offset delta and the frequency.

        self.last_offset += self.read_number()
        frequency = self.read_number()

        return self.last_term, self.last_offset, frequency

    def go_to_term(self, term, offset, info_offset):

        """
        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
        permits the scanning for later terms from the specified term.
        """

        self.f.seek(info_offset)
        self.last_term = term
        self.last_offset = offset

class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):
        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, frequency, info_offset):

        """
        Write the given 'term', its position file 'offset', and its 'frequency'
        to the term dictionary index file, along with the 'info_offset' in the
        term information file.
        """

        TermWriter.write_term(self, term, offset, frequency)

        # Write the information file offset delta.

        self.write_number(info_offset - self.last_info_offset)
        self.last_info_offset = info_offset

class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):
        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read a term, its position file offset, its frequency, and its term
        information file offset from the term dictionary index file.
        """

        term, offset, frequency = TermReader.read_term(self)

        # Read the information file offset delta.

        self.last_info_offset += self.read_number()

        return term, offset, frequency, self.last_info_offset

class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, position_writer, interval):
        self.info_writer = info_writer
        self.index_writer = index_writer
        self.position_writer = position_writer
        self.interval = interval
        self.entry = 0

    def _write_term(self, term, offset, frequency):

        """
        Write the given 'term', its position file 'offset', and its 'frequency'
        to the term information file and, for every 'interval'-th entry, to the
        index, making a dictionary entry.
        """

        info_offset = self.info_writer.write_term(term, offset, frequency)

        if self.entry % self.interval == 0:
            self.index_writer.write_term(term, offset, frequency, info_offset)

        self.entry += 1

    def write_term_positions(self, term, doc_positions):

        """
        Write the given 'term' and the 'doc_positions' recording the documents
        and positions at which the term is found.
        """

        offset, frequency = self.position_writer.write_term_positions(doc_positions)
        self._write_term(term, offset, frequency)

    def close(self):
        self.info_writer.close()
        self.index_writer.close()
        self.position_writer.close()

class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader, position_reader):
        self.info_reader = info_reader
        self.index_reader = index_reader
        self.position_reader = position_reader

        # Load the whole index: (term, offset, frequency, info_offset) entries.

        self.terms = []
        try:
            while 1:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # The indexed terms alone, used for searching the index.

        self._index_terms = [entry[0] for entry in self.terms]

        # The largest offsets in the index (zero for an empty dictionary).

        if self.terms:
            self.max_offset = self.terms[-1][1]
            self.max_info_offset = self.terms[-1][3]
        else:
            self.max_offset = 0
            self.max_info_offset = 0

    def _find_term(self, term):

        """
        Find the position file offset and frequency of 'term' from the term
        dictionary, returning None if the term is not present.
        """

        # Get the index entry providing the term or the one preceding it.

        i = bisect_right(self._index_terms, term) - 1

        if i == -1:
            return None

        found_term, offset, frequency, info_offset = self.terms[i]

        # Where the term is found immediately, return the details.

        if term == found_term:
            return offset, frequency

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset, frequency = self.info_reader.read_term()
        except EOFError:
            pass

        # If the term is found, return the offset and frequency.

        if term == found_term:
            return offset, frequency
        return None

    def rewind(self):
        self.info_reader.rewind()

    def _get_positions(self, offset):
        return self.position_reader.read_term_positions(offset)

    def read_term(self):

        """
        Return the next term, its frequency and the documents and positions at
        which the term is found. EOFError is raised when no terms remain.
        """

        term, offset, frequency = self.info_reader.read_term()
        positions = self._get_positions(offset)
        return term, frequency, positions

    def find_positions(self, term):

        """
        Return the documents and positions at which the given 'term' is found,
        or None if the term is unknown.
        """

        t = self._find_term(term)
        if t is None:
            return None
        offset, frequency = t
        return self._get_positions(offset)

    def get_frequency(self, term):

        "Return the frequency of the given 'term', or None if it is unknown."

        t = self._find_term(term)
        if t is None:
            return None
        offset, frequency = t
        return frequency

    def close(self):
        self.info_reader.close()
        self.index_reader.close()
        self.position_reader.close()

# Specific classes for storing document information.

class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):
        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        Return the offset at which the fields are stored.
        """

        offset = self.f.tell()

        # Write the document number delta and the number of fields.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(fields))

        # Write the fields themselves.

        for i, field in fields:
            self.write_number(i)
            self.write_string(field, 1) # compress

        self.last_docnum = docnum
        return offset

class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):
        self.last_docnum = 0

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # Read the document number delta and the number of fields.

        self.last_docnum += self.read_number()
        nfields = self.read_number()

        # Collect the fields.

        fields = []

        for i in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.
        """

        self.f.seek(offset)

        # The delta read here is relative to an unknown predecessor, so the
        # computed document number is discarded in favour of 'docnum'.

        bad_docnum, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields

class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):
        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # Write the document number and offset deltas.

        self.write_number(docnum - self.last_docnum)
        self.write_number(offset - self.last_offset)

        self.last_docnum = docnum
        self.last_offset = offset

class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):
        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        "Read a document number and field file offset."

        # Read the document number and offset deltas.

        self.last_docnum += self.read_number()
        self.last_offset += self.read_number()

        return self.last_docnum, self.last_offset

class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):
        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Only every 'interval'-th document is recorded in the index.

        if self.entry % self.interval == 0:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):
        self.field_writer.close()
        self.field_index_writer.close()

class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):
        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Load the whole index: (document number, offset) entries.

        self.docs = []
        try:
            while 1:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # A large offset for ordering purposes (zero for an empty index).

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = 0

    def rewind(self):
        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning None
        if there is no such document.
        """

        # Get the index entry providing the document or the one preceding it.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields
        return None

    def close(self):
        self.field_reader.close()
        self.field_index_reader.close()

# Dictionary merging classes.

class Merger:

    "Merge files."

    def __init__(self, writer, readers):
        self.writer = writer
        self.readers = readers

    def close(self):
        for reader in self.readers:
            reader.close()
        self.writer.close()

class TermDictionaryMerger(Merger):

    "Merge term and position files."

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the writer.
        """

        # Pending entries are (term, reader number, doc positions) tuples kept
        # in sorted order; each reader contributes at most one entry at a time.

        entries = []

        # Get the first entries from the readers.

        for partition, reader in enumerate(self.readers):
            reader.rewind()
            self._queue_next(entries, partition)

        # While entries are available, write them out in order, merging where
        # appropriate.

        while entries:
            term, partition, doc_positions = entries[0]
            to_update = [partition]
            i = 1

            # Find other entries for the term and merge their positions.

            while i < len(entries) and entries[i][0] == term:
                other_term, other_partition, other_doc_positions = entries[i]
                doc_positions = self.merge_positions(doc_positions, other_doc_positions)
                to_update.append(other_partition)
                i += 1

            # Write the combined term details.

            self.writer.write_term_positions(term, doc_positions)

            # Update the entries from the affected readers.

            del entries[:i]

            for partition in to_update:
                self._queue_next(entries, partition)

    def _queue_next(self, entries, partition):

        "Add to 'entries' the next term from the reader numbered 'partition'."

        try:
            term, frequency, positions = self.readers[partition].read_term()
        except EOFError:
            return
        insort_right(entries, (term, partition, positions))

    def merge_positions(self, doc_positions, other_doc_positions):

        """
        Merge 'doc_positions' with 'other_doc_positions' so that common document
        records contain positions from both collections. Return a new list of
        (document number, positions) tuples ordered by document number; the
        given collections are not modified.
        """

        merged = dict(doc_positions)

        for docnum, positions in other_doc_positions:
            if docnum in merged:
                merged[docnum] = sorted(merged[docnum] + positions)
            else:
                merged[docnum] = positions

        return sorted(merged.items())

class FieldDictionaryMerger(Merger):

    "Merge field files."

    def merge(self):

        """
        Merge fields from the readers, sending them to the writer.
        """

        # Pending entries are (document number, reader number, fields) tuples
        # kept in sorted order; each reader contributes at most one at a time.

        entries = []

        # Get the first entries from the readers.

        for partition, reader in enumerate(self.readers):
            reader.rewind()
            self._queue_next(entries, partition)

        # While entries are available, write them out in order, merging where
        # appropriate.

        while entries:
            docnum, partition, fields = entries[0]
            to_update = [partition]
            i = 1

            # Find other entries for the document and combine their fields.

            while i < len(entries) and entries[i][0] == docnum:
                other_docnum, other_partition, other_fields = entries[i]
                fields = fields + other_fields
                to_update.append(other_partition)
                i += 1

            # Write the combined document details.

            self.writer.write_fields(docnum, fields)

            # Update the entries from the affected readers.

            del entries[:i]

            for partition in to_update:
                self._queue_next(entries, partition)

    def _queue_next(self, entries, partition):

        "Add to 'entries' the next document from the reader numbered 'partition'."

        try:
            docnum, fields = self.readers[partition].read_fields()
        except EOFError:
            return
        insort_right(entries, (docnum, partition, fields))

# Utility functions.

def get_term_writer(pathname, partition, interval):

    """
    Return a term dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.
    """

    tdf = open(join(pathname, "terms-%s" % partition), "wb")
    info_writer = TermWriter(tdf)

    tdif = open(join(pathname, "index-%s" % partition), "wb")
    index_writer = TermIndexWriter(tdif)

    tpf = open(join(pathname, "positions-%s" % partition), "wb")
    positions_writer = PositionWriter(tpf)

    return TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)

def get_field_writer(pathname, partition, interval):

    """
    Return a field dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.
    """

    ff = open(join(pathname, "fields-%s" % partition), "wb")
    field_writer = FieldWriter(ff)

    fif = open(join(pathname, "fields_index-%s" % partition), "wb")
    field_index_writer = FieldIndexWriter(fif)

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)

def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.
    """

    tdf = open(join(pathname, "terms-%s" % partition), "rb")
    info_reader = TermReader(tdf)

    tdif = open(join(pathname, "index-%s" % partition), "rb")
    index_reader = TermIndexReader(tdif)

    tpf = open(join(pathname, "positions-%s" % partition), "rb")
    positions_reader = PositionReader(tpf)

    return TermDictionaryReader(info_reader, index_reader, positions_reader)

def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.
    """

    ff = open(join(pathname, "fields-%s" % partition), "rb")
    field_reader = FieldReader(ff)

    fif = open(join(pathname, "fields_index-%s" % partition), "rb")
    field_index_reader = FieldIndexReader(fif)

    return FieldDictionaryReader(field_reader, field_index_reader)

# High-level classes.

class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, pathname, interval, flush_interval):
        self.pathname = pathname
        self.interval = interval
        self.flush_interval = flush_interval

        self.dict_partition = 0
        self.field_dict_partition = 0

        self.terms = {}
        self.docs = {}

        # Numbers of positions and fields held in memory since the last flush.

        self.position_counter = 0
        self.field_counter = 0

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.
        """

        self.terms.setdefault(term, {}).setdefault(docnum, []).append(position)

        self.position_counter += 1
        if self.flush_interval and self.position_counter >= self.flush_interval:
            self.flush_terms()

    def add_field(self, docnum, identifier, value):

        """
        Add for the document with the given 'docnum' a field having the given
        'identifier' and 'value'.
        """

        self.docs.setdefault(docnum, []).append((identifier, value))

        self.field_counter += 1
        if self.flush_interval and self.field_counter >= self.flush_interval:
            self.flush_fields()

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        dict_writer = self.get_term_writer()

        # Write the terms, and the documents of each term, in order.

        for term, doc_positions in sorted(self.terms.items()):
            dict_writer.write_term_positions(term, sorted(doc_positions.items()))

        dict_writer.close()

        # Start afresh; without resetting the counter every later position
        # would trigger another flush.

        self.terms = {}
        self.position_counter = 0
        self.dict_partition += 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        field_dict_writer = self.get_field_writer()

        # Write the documents in order.

        for docnum, fields in sorted(self.docs.items()):
            field_dict_writer.write_fields(docnum, fields)

        field_dict_writer.close()

        # Start afresh; without resetting the counter every later field would
        # trigger another flush.

        self.docs = {}
        self.field_counter = 0
        self.field_dict_partition += 1

    def close(self):

        "Flush any terms and fields still held in memory."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()

class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname, partition=0):
        self.pathname = pathname
        self.dict_reader = self.get_term_reader(partition)
        self.field_dict_reader = self.get_field_reader(partition)

    def get_term_reader(self, partition):

        "Return a term dictionary reader for the given 'partition'."

        return get_term_reader(self.pathname, partition)

    def get_field_reader(self, partition):

        "Return a field dictionary reader for the given 'partition'."

        return get_field_reader(self.pathname, partition)

    def find_positions(self, term):
        return self.dict_reader.find_positions(term)

    def get_frequency(self, term):
        return self.dict_reader.get_frequency(term)

    def get_fields(self, docnum):
        return self.field_dict_reader.get_fields(docnum)

    def close(self):
        self.dict_reader.close()
        self.field_dict_reader.close()

class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):
        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval' and
        'flush_interval'. The index directory is created if necessary.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the given 'partition' of the index. Raise OSError if
        the index directory does not exist.
        """

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname, partition)
        return self.reader

    def merge_terms(self, interval=INTERVAL):

        """
        Merge the numbered term dictionary partitions into a partition labelled
        "new", using the given indexing 'interval'.
        """

        # Find the numbered partitions, ignoring other labels such as the
        # output of an earlier merge.

        partitions = []

        for filename in listdir(self.pathname):
            if filename.startswith("terms-"): # 6 character prefix
                label = filename[6:]
                if label.isdigit():
                    partitions.append(int(label))

        partitions.sort()

        # Only the term dictionaries are needed, not complete index readers.

        readers = [get_term_reader(self.pathname, partition) for partition in partitions]
        writer = get_term_writer(self.pathname, "new", interval)

        merger = TermDictionaryMerger(writer, readers)
        try:
            merger.merge()
        finally:
            merger.close()

    def close(self):
        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None

# vim: tabstop=4 expandtab shiftwidth=4