1 #!/usr/bin/env python 2 3 """ 4 A simple (and sane) text indexing library. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from os import mkdir # to determine whether to create indexes 22 from os.path import exists, join 23 from os.path import commonprefix # to find common string prefixes 24 from bisect import bisect_right # to find terms in the dictionary index 25 import bz2, zlib # for field compression 26 27 # Constants. 28 29 INTERVAL = 100 30 31 compressors = [("b", bz2.compress), ("z", zlib.compress)] 32 decompressors = {"b" : bz2.decompress, "z" : zlib.decompress} 33 34 # Foundation classes. 35 36 class File: 37 38 "A basic file abstraction." 39 40 def __init__(self, f): 41 self.f = f 42 self.reset() 43 44 def reset(self): 45 pass 46 47 def close(self): 48 if self.f is not None: 49 self.f.close() 50 self.f = None 51 52 class FileWriter(File): 53 54 "Writing basic data types to files." 55 56 def write_number(self, number): 57 58 "Write 'number' to the file using a variable length encoding." 59 60 # Negative numbers are not supported. 61 62 if number < 0: 63 raise ValueError, "Number %r is negative." % number 64 65 # Special case: one byte containing zero. 66 67 elif number == 0: 68 self.f.write(chr(0)) 69 return 70 71 # Write the number from least to most significant digits. 72 73 bytes = [] 74 75 while number != 0: 76 lsd = number & 127 77 number = number >> 7 78 if number != 0: 79 lsd |= 128 80 bytes.append(chr(lsd)) 81 82 record = "".join(bytes) 83 self.f.write(record) 84 85 def write_string(self, s, compress=0): 86 87 """ 88 Write 's' to the file, recording its length and compressing the string 89 if 'compress' is set to a true value. 90 """ 91 92 # Convert Unicode objects to strings. 93 94 if isinstance(s, unicode): 95 s = s.encode("utf-8") 96 97 # Compress the string if requested. 98 99 if compress: 100 for flag, fn in compressors: 101 cs = fn(s) 102 103 # Take the first string shorter than the original. 104 105 if len(cs) < len(s): 106 s = cs 107 break 108 else: 109 flag = "-" 110 111 # Record whether compression was used. 112 113 self.f.write(flag) 114 115 # Write the length of the data before the data itself. 116 117 length = len(s) 118 self.write_number(length) 119 self.f.write(s) 120 121 class FileReader(File): 122 123 "Reading basic data types from files." 124 125 def read_number(self): 126 127 "Read a number from the file." 128 129 # Read each byte, adding it to the number. 130 131 shift = 0 132 number = 0 133 more = 1 134 135 while more: 136 byte = self.f.read(1) 137 if not byte: 138 raise EOFError 139 140 csd = ord(byte) 141 more = csd & 128 != 0 142 if more: 143 csd &= 127 144 number += (csd << shift) 145 shift += 7 146 147 return number 148 149 def read_string(self, decompress=0): 150 151 """ 152 Read a string from the file, decompressing the stored data if 153 'decompress' is set to a true value. 154 """ 155 156 # Decompress the data if requested. 157 158 if decompress: 159 flag = self.f.read(1) 160 else: 161 flag = "-" 162 163 length = self.read_number() 164 s = self.f.read(length) 165 166 # Perform decompression if applicable. 167 168 if flag != "-": 169 fn = decompressors[flag] 170 s = fn(s) 171 172 # Convert strings to Unicode objects. 173 174 return unicode(s, "utf-8") 175 176 # Specific classes for storing term and position information. 177 178 class PositionWriter(FileWriter): 179 180 "Writing position information to files." 181 182 def reset(self): 183 self.last_docnum = 0 184 185 def write_positions(self, docnum, positions): 186 187 "Write for the document 'docnum' the given 'positions'." 188 189 if docnum < self.last_docnum: 190 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 191 192 # Write the document number delta. 193 194 self.write_number(docnum - self.last_docnum) 195 196 # Write the number of positions. 197 198 self.write_number(len(positions)) 199 200 # Make sure that the positions are sorted. 201 202 positions.sort() 203 204 # Write the position deltas. 205 206 last = 0 207 for position in positions: 208 pos = position - last 209 self.write_number(pos) 210 last = position 211 212 self.last_docnum = docnum 213 214 def write_all_positions(self, doc_positions): 215 216 """ 217 Write all 'doc_positions' - a collection of tuples of the form (document 218 number, position list) - to the file, returning the offset at which they 219 were stored. 220 """ 221 222 # Reset the writer and record the current file offset. 223 224 self.reset() 225 offset = self.f.tell() 226 227 # Write the number of documents. 228 229 self.write_number(len(doc_positions)) 230 231 # Write the positions. 232 233 for docnum, positions in doc_positions: 234 self.write_positions(docnum, positions) 235 236 return offset 237 238 class PositionReader(FileReader): 239 240 "Reading position information from files." 241 242 def reset(self): 243 self.last_docnum = 0 244 245 def read_positions(self): 246 247 "Read positions, returning a document number and a list of positions." 248 249 # Read the document number delta and add it to the last number. 250 251 self.last_docnum += self.read_number() 252 253 # Read the number of positions. 254 255 npositions = self.read_number() 256 257 # Read the position deltas, adding each previous position to get the 258 # appropriate collection of absolute positions. 259 260 i = 0 261 last = 0 262 positions = [] 263 264 while i < npositions: 265 last += self.read_number() 266 positions.append(last) 267 i += 1 268 269 return self.last_docnum, positions 270 271 def read_all_positions(self, offset): 272 273 """ 274 Read all positions from 'offset', seeking to that position in the file 275 before reading. 276 """ 277 278 self.reset() 279 self.f.seek(offset) 280 281 # Read the number of documents. 282 283 ndocuments = self.read_number() 284 285 # Read all records. 286 287 i = 0 288 doc_positions = [] 289 290 while i < ndocuments: 291 doc_positions.append(self.read_positions()) 292 i += 1 293 294 return doc_positions 295 296 class TermWriter(FileWriter): 297 298 "Writing term information to files." 299 300 def reset(self): 301 self.last_term = "" 302 self.last_offset = 0 303 304 def write_term(self, term, offset): 305 306 """ 307 Write the given 'term' and its position file 'offset' to the term 308 information file. Return the offset after the term information was 309 written to the file. 310 """ 311 312 # Too long terms are not currently supported. 313 314 if len(term) > 255: 315 raise ValueError, "Term %r is too long." % term 316 317 # Write the prefix length and term suffix. 318 319 common = len(commonprefix([self.last_term, term])) 320 suffix = term[common:] 321 322 self.write_number(common) 323 self.write_string(suffix) 324 325 # Write the offset delta. 326 327 self.write_number(offset - self.last_offset) 328 329 self.last_term = term 330 self.last_offset = offset 331 332 return self.f.tell() 333 334 class TermReader(FileReader): 335 336 "Reading term information from files." 337 338 def reset(self): 339 self.last_term = "" 340 self.last_offset = 0 341 342 def read_term(self): 343 344 """ 345 Read a term and its position file offset from the term information file. 346 """ 347 348 # Read the prefix length and term suffix. 349 350 common = self.read_number() 351 suffix = self.read_string() 352 353 self.last_term = self.last_term[:common] + suffix 354 355 # Read the offset delta. 356 357 self.last_offset += self.read_number() 358 359 return self.last_term, self.last_offset 360 361 def go_to_term(self, term, offset, info_offset): 362 363 """ 364 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 365 permits the scanning for later terms from the specified term. 366 """ 367 368 self.f.seek(info_offset) 369 self.last_term = term 370 self.last_offset = offset 371 372 class TermIndexWriter(TermWriter): 373 374 "Writing term dictionary index details to files." 375 376 def reset(self): 377 TermWriter.reset(self) 378 self.last_info_offset = 0 379 380 def write_term(self, term, offset, info_offset): 381 382 """ 383 Write the given 'term' and its position file 'offset' to the term 384 dictionary index file, along with the 'info_offset' in the term 385 information file. 386 """ 387 388 TermWriter.write_term(self, term, offset) 389 390 # Write the information file offset delta. 391 392 self.write_number(info_offset - self.last_info_offset) 393 self.last_info_offset = info_offset 394 395 class TermIndexReader(TermReader): 396 397 "Reading term dictionary index details from files." 398 399 def reset(self): 400 TermReader.reset(self) 401 self.last_info_offset = 0 402 403 def read_term(self): 404 405 """ 406 Read a term, its position file offset, and its term information file 407 offset from the term dictionary index file. 408 """ 409 410 term, offset = TermReader.read_term(self) 411 412 # Read the offset delta. 413 414 self.last_info_offset += self.read_number() 415 416 return term, offset, self.last_info_offset 417 418 class TermDictionaryWriter: 419 420 "Writing term dictionaries." 421 422 def __init__(self, info_writer, index_writer, position_writer, interval): 423 self.info_writer = info_writer 424 self.index_writer = index_writer 425 self.position_writer = position_writer 426 self.interval = interval 427 self.entry = 0 428 429 def _write_term(self, term, offset): 430 431 """ 432 Write the given 'term' and its position file 'offset' to the term 433 information file and optionally to the index, making a dictionary entry. 434 """ 435 436 info_offset = self.info_writer.write_term(term, offset) 437 438 if self.entry % self.interval == 0: 439 self.index_writer.write_term(term, offset, info_offset) 440 441 self.entry += 1 442 443 def write_term_positions(self, term, doc_positions): 444 445 """ 446 Write the given 'term' and the 'doc_positions' recording the documents 447 and positions at which the term is found. 448 """ 449 450 offset = self.position_writer.write_all_positions(doc_positions) 451 self._write_term(term, offset) 452 453 def close(self): 454 self.info_writer.close() 455 self.index_writer.close() 456 self.position_writer.close() 457 458 class TermDictionaryReader: 459 460 "Reading term dictionaries." 461 462 def __init__(self, info_reader, index_reader, position_reader): 463 self.info_reader = info_reader 464 self.index_reader = index_reader 465 self.position_reader = position_reader 466 467 self.terms = [] 468 try: 469 while 1: 470 self.terms.append(self.index_reader.read_term()) 471 except EOFError: 472 pass 473 474 # Large numbers for ordering purposes. 475 476 self.max_offset = self.terms[-1][1] 477 self.max_info_offset = self.terms[-1][2] 478 479 def _find_term(self, term): 480 481 "Find the position file offset of 'term' from the term dictionary." 482 483 i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1 484 485 # Get the entry position providing the term or one preceding it. 486 487 if i == -1: 488 return None 489 490 found_term, offset, info_offset = self.terms[i] 491 492 # Where the term is found immediately, return the offset. 493 494 if term == found_term: 495 return offset 496 497 # Otherwise, seek past the index term's entry in the information file 498 # and scan for the desired term. 499 500 else: 501 self.info_reader.go_to_term(found_term, offset, info_offset) 502 try: 503 while term > found_term: 504 found_term, offset = self.info_reader.read_term() 505 except EOFError: 506 pass 507 508 # If the term is found, return the offset. 509 510 if term == found_term: 511 return offset 512 else: 513 return None 514 515 def find_positions(self, term): 516 517 "Return the documents and positions at which the given 'term' is found." 518 519 offset = self._find_term(term) 520 if offset is None: 521 return None 522 else: 523 return self.position_reader.read_all_positions(offset) 524 525 def close(self): 526 self.info_reader.close() 527 self.index_reader.close() 528 self.position_reader.close() 529 530 # Specific classes for storing document information. 531 532 class FieldWriter(FileWriter): 533 534 "Writing field data to files." 535 536 def reset(self): 537 self.last_docnum = 0 538 539 def write_fields(self, docnum, fields): 540 541 """ 542 Write for the given 'docnum', a list of 'fields' (strings representing 543 field values). Return the offset at which the fields are stored. 544 """ 545 546 offset = self.f.tell() 547 548 # Write the document number delta. 549 550 self.write_number(docnum - self.last_docnum) 551 552 # Write the number of fields. 553 554 self.write_number(len(fields)) 555 556 # Write the fields themselves. 557 558 for field in fields: 559 self.write_string(field, 1) # compress 560 561 self.last_docnum = docnum 562 return offset 563 564 class FieldReader(FileReader): 565 566 "Reading field data from files." 567 568 def reset(self): 569 self.last_docnum = 0 570 571 def read_fields(self): 572 573 """ 574 Read fields from the file, returning a tuple containing the document 575 number and a list of field values. 576 """ 577 578 # Read the document number. 579 580 self.last_docnum += self.read_number() 581 582 # Read the number of fields. 583 584 nfields = self.read_number() 585 586 # Collect the fields. 587 588 fields = [] 589 i = 0 590 591 while i < nfields: 592 fields.append(self.read_string(1)) # decompress 593 i += 1 594 595 return self.last_docnum, fields 596 597 def read_document_fields(self, docnum, offset): 598 599 """ 600 Read fields for 'docnum' at the given 'offset'. This permits the 601 retrieval of details for the specified document, as well as scanning for 602 later documents. 603 """ 604 605 self.f.seek(offset) 606 bad_docnum, fields = self.read_fields() 607 self.last_docnum = docnum 608 return docnum, fields 609 610 class FieldIndexWriter(FileWriter): 611 612 "Writing field index details to files." 613 614 def reset(self): 615 self.last_docnum = 0 616 self.last_offset = 0 617 618 def write_document(self, docnum, offset): 619 620 """ 621 Write for the given 'docnum', the 'offset' at which the fields for the 622 document are stored in the fields file. 623 """ 624 625 # Write the document number and offset deltas. 626 627 self.write_number(docnum - self.last_docnum) 628 self.write_number(offset - self.last_offset) 629 630 self.last_docnum = docnum 631 self.last_offset = offset 632 633 class FieldIndexReader(FileReader): 634 635 "Reading field index details from files." 636 637 def reset(self): 638 self.last_docnum = 0 639 self.last_offset = 0 640 641 def read_document(self): 642 643 "Read a document number and field file offset." 644 645 # Read the document number delta and offset. 646 647 self.last_docnum += self.read_number() 648 self.last_offset += self.read_number() 649 650 return self.last_docnum, self.last_offset 651 652 class FieldDictionaryWriter: 653 654 "Writing field dictionary details." 655 656 def __init__(self, field_writer, field_index_writer, interval): 657 self.field_writer = field_writer 658 self.field_index_writer = field_index_writer 659 self.interval = interval 660 self.entry = 0 661 662 def write_fields(self, docnum, fields): 663 664 "Write details of the document with the given 'docnum' and 'fields'." 665 666 offset = self.field_writer.write_fields(docnum, fields) 667 668 if self.entry % self.interval == 0: 669 self.field_index_writer.write_document(docnum, offset) 670 671 self.entry += 1 672 673 def close(self): 674 self.field_writer.close() 675 self.field_index_writer.close() 676 677 class FieldDictionaryReader: 678 679 "Reading field dictionary details." 680 681 def __init__(self, field_reader, field_index_reader): 682 self.field_reader = field_reader 683 self.field_index_reader = field_index_reader 684 685 self.docs = [] 686 try: 687 while 1: 688 self.docs.append(self.field_index_reader.read_document()) 689 except EOFError: 690 pass 691 692 # Large numbers for ordering purposes. 693 694 self.max_offset = self.docs[-1][1] 695 696 def read_fields(self, docnum): 697 698 "Read the fields of the document with the given 'docnum'." 699 700 i = bisect_right(self.docs, (docnum, self.max_offset)) - 1 701 702 # Get the entry position providing the term or one preceding it. 703 704 if i == -1: 705 return None 706 707 found_docnum, offset = self.docs[i] 708 709 # Read from the fields file. 710 711 found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset) 712 713 # Scan for the document, if necessary. 714 715 try: 716 while docnum > found_docnum: 717 found_docnum, fields = self.field_reader.read_fields() 718 except EOFError: 719 pass 720 721 # If the document is found, return the fields. 722 723 if docnum == found_docnum: 724 return fields 725 else: 726 return None 727 728 def close(self): 729 self.field_reader.close() 730 self.field_index_reader.close() 731 732 # High-level classes. 733 734 class IndexWriter: 735 736 """ 737 Building term information and writing it to the term and field dictionaries. 738 """ 739 740 def __init__(self, dict_writer, field_dict_writer): 741 self.dict_writer = dict_writer 742 self.field_dict_writer = field_dict_writer 743 self.terms = {} 744 self.docs = {} 745 746 def add_position(self, term, docnum, position): 747 748 """ 749 Add a position entry for the given 'term' in the document with the given 750 'docnum', indicating the given 'position'. 751 """ 752 753 if not self.terms.has_key(term): 754 doc_positions = self.terms[term] = {} 755 else: 756 doc_positions = self.terms[term] 757 758 if not doc_positions.has_key(docnum): 759 doc = doc_positions[docnum] = [] 760 else: 761 doc = doc_positions[docnum] 762 763 doc.append(position) 764 765 def add_fields(self, docnum, fields): 766 767 "Add for the document with the given 'docnum' a list of 'fields'." 768 769 if not self.docs.has_key(docnum): 770 doc_fields = self.docs[docnum] = fields 771 else: 772 self.docs[docnum] += fields 773 774 def close(self): 775 if self.dict_writer is None: 776 return 777 778 # Get the terms in order. 779 780 terms = self.terms.items() 781 terms.sort() 782 783 for term, doc_positions in terms: 784 doc_positions = doc_positions.items() 785 doc_positions.sort() 786 self.dict_writer.write_term_positions(term, doc_positions) 787 788 self.dict_writer.close() 789 self.dict_writer = None 790 791 # Get the documents in order. 792 793 docs = self.docs.items() 794 docs.sort() 795 796 for docnum, fields in docs: 797 self.field_dict_writer.write_fields(docnum, fields) 798 799 self.field_dict_writer.close() 800 self.field_dict_writer = None 801 802 class IndexReader: 803 804 "Accessing the term and field dictionaries." 805 806 def __init__(self, dict_reader, field_dict_reader): 807 self.dict_reader = dict_reader 808 self.field_dict_reader = field_dict_reader 809 810 def find_positions(self, term): 811 return self.dict_reader.find_positions(term) 812 813 def get_fields(self, docnum): 814 return self.field_dict_reader.read_fields(docnum) 815 816 def close(self): 817 self.dict_reader.close() 818 self.field_dict_reader.close() 819 820 class Index: 821 822 "An inverted index solution encapsulating the various components." 823 824 def __init__(self, pathname): 825 self.pathname = pathname 826 self.reader = None 827 self.writer = None 828 829 def get_writer(self, interval=INTERVAL): 830 831 "Return a writer, optionally using the given indexing 'interval'." 832 833 if not exists(self.pathname): 834 mkdir(self.pathname) 835 836 tdf = open(join(self.pathname, "terms"), "wb") 837 info_writer = TermWriter(tdf) 838 839 tdif = open(join(self.pathname, "index"), "wb") 840 index_writer = TermIndexWriter(tdif) 841 842 tpf = open(join(self.pathname, "positions"), "wb") 843 positions_writer = PositionWriter(tpf) 844 845 dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval) 846 847 ff = open(join(self.pathname, "fields"), "wb") 848 field_writer = FieldWriter(ff) 849 850 fif = open(join(self.pathname, "fields_index"), "wb") 851 field_index_writer = FieldIndexWriter(fif) 852 853 field_dict_writer = FieldDictionaryWriter(field_writer, field_index_writer, interval) 854 855 self.writer = IndexWriter(dict_writer, field_dict_writer) 856 return self.writer 857 858 def get_reader(self): 859 860 "Return a reader for the index." 861 862 if not exists(self.pathname): 863 raise OSError, "Index path %r does not exist." % self.pathname 864 865 tdf = open(join(self.pathname, "terms"), "rb") 866 info_reader = TermReader(tdf) 867 868 tdif = open(join(self.pathname, "index"), "rb") 869 index_reader = TermIndexReader(tdif) 870 871 tpf = open(join(self.pathname, "positions"), "rb") 872 positions_reader = PositionReader(tpf) 873 874 dict_reader = TermDictionaryReader(info_reader, index_reader, positions_reader) 875 876 ff = open(join(self.pathname, "fields"), "rb") 877 field_reader = FieldReader(ff) 878 879 fif = open(join(self.pathname, "fields_index"), "rb") 880 field_index_reader = FieldIndexReader(fif) 881 882 field_dict_reader = FieldDictionaryReader(field_reader, field_index_reader) 883 884 self.reader = IndexReader(dict_reader, field_dict_reader) 885 return self.reader 886 887 def close(self): 888 if self.reader is not None: 889 self.reader.close() 890 self.reader = None 891 if self.writer is not None: 892 self.writer.close() 893 self.writer = None 894 895 # vim: tabstop=4 expandtab shiftwidth=4