1 #!/usr/bin/env python 2 3 """ 4 A simple (and sane) text indexing library. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from os import mkdir # to determine whether to create indexes 22 from os.path import exists, join 23 from os.path import commonprefix # to find common string prefixes 24 from bisect import bisect_right # to find terms in the dictionary index 25 import bz2 # for field compression 26 27 # Constants. 28 29 INTERVAL = 100 30 31 # Foundation classes. 32 33 class File: 34 35 "A basic file abstraction." 36 37 def __init__(self, f): 38 self.f = f 39 self.reset() 40 41 def reset(self): 42 pass 43 44 def close(self): 45 if self.f is not None: 46 self.f.close() 47 self.f = None 48 49 class FileWriter(File): 50 51 "Writing basic data types to files." 52 53 def write_number(self, number): 54 55 "Write 'number' to the file using a variable length encoding." 56 57 # Negative numbers are not supported. 58 59 if number < 0: 60 raise ValueError, "Number %r is negative." % number 61 62 # Special case: one byte containing zero. 63 64 elif number == 0: 65 self.f.write(chr(0)) 66 return 67 68 # Write the number from least to most significant digits. 69 70 bytes = [] 71 72 while number != 0: 73 lsd = number & 127 74 number = number >> 7 75 if number != 0: 76 lsd |= 128 77 bytes.append(chr(lsd)) 78 79 record = "".join(bytes) 80 self.f.write(record) 81 82 def write_string(self, s, compress=0): 83 84 """ 85 Write 's' to the file, recording its length and compressing the string 86 if 'compress' is set to a true value. 87 """ 88 89 # Convert Unicode objects to strings. 90 91 if isinstance(s, unicode): 92 s = s.encode("utf-8") 93 94 # Compress the string if requested. 95 96 if compress: 97 s = bz2.compress(s) 98 99 # Write the length of the data before the data itself. 100 101 length = len(s) 102 self.write_number(length) 103 self.f.write(s) 104 105 class FileReader(File): 106 107 "Reading basic data types from files." 108 109 def read_number(self): 110 111 "Read a number from the file." 112 113 # Read each byte, adding it to the number. 114 115 shift = 0 116 number = 0 117 more = 1 118 119 while more: 120 byte = self.f.read(1) 121 if not byte: 122 raise EOFError 123 124 csd = ord(byte) 125 more = csd & 128 != 0 126 if more: 127 csd &= 127 128 number += (csd << shift) 129 shift += 7 130 131 return number 132 133 def read_string(self, decompress=0): 134 135 """ 136 Read a string from the file, decompressing the stored data if 137 'decompress' is set to a true value. 138 """ 139 140 length = self.read_number() 141 s = self.f.read(length) 142 143 # Decompress the data if requested. 144 145 if decompress: 146 s = bz2.decompress(s) 147 148 # Convert strings to Unicode objects. 149 150 return unicode(s, "utf-8") 151 152 # Specific classes. 153 154 class PositionWriter(FileWriter): 155 156 "Writing position information to files." 157 158 def reset(self): 159 self.last_docnum = 0 160 161 def write_positions(self, docnum, positions): 162 163 "Write for the document 'docnum' the given 'positions'." 164 165 if docnum < self.last_docnum: 166 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 167 168 # Write the document number delta. 169 170 self.write_number(docnum - self.last_docnum) 171 172 # Write the number of positions. 173 174 self.write_number(len(positions)) 175 176 # Make sure that the positions are sorted. 177 178 positions.sort() 179 180 # Write the position deltas. 181 182 last = 0 183 for position in positions: 184 pos = position - last 185 self.write_number(pos) 186 last = position 187 188 self.last_docnum = docnum 189 190 def write_all_positions(self, doc_positions): 191 192 """ 193 Write all 'doc_positions' - a collection of tuples of the form (document 194 number, position list) - to the file, returning the offset at which they 195 were stored. 196 """ 197 198 # Reset the writer and record the current file offset. 199 200 self.reset() 201 offset = self.f.tell() 202 203 # Write the number of documents. 204 205 self.write_number(len(doc_positions)) 206 207 # Write the positions. 208 209 for docnum, positions in doc_positions: 210 self.write_positions(docnum, positions) 211 212 return offset 213 214 class PositionReader(FileReader): 215 216 "Reading position information from files." 217 218 def reset(self): 219 self.last_docnum = 0 220 221 def read_positions(self): 222 223 "Read positions, returning a document number and a list of positions." 224 225 # Read the document number delta and add it to the last number. 226 227 self.last_docnum += self.read_number() 228 229 # Read the number of positions. 230 231 npositions = self.read_number() 232 233 # Read the position deltas, adding each previous position to get the 234 # appropriate collection of absolute positions. 235 236 i = 0 237 last = 0 238 positions = [] 239 240 while i < npositions: 241 last += self.read_number() 242 positions.append(last) 243 i += 1 244 245 return self.last_docnum, positions 246 247 def read_all_positions(self, offset): 248 249 """ 250 Read all positions from 'offset', seeking to that position in the file 251 before reading. 252 """ 253 254 self.reset() 255 self.f.seek(offset) 256 257 # Read the number of documents. 258 259 ndocuments = self.read_number() 260 261 # Read all records. 262 263 i = 0 264 doc_positions = [] 265 266 while i < ndocuments: 267 doc_positions.append(self.read_positions()) 268 i += 1 269 270 return doc_positions 271 272 class TermWriter(FileWriter): 273 274 "Writing term information to files." 275 276 def reset(self): 277 self.last_term = "" 278 self.last_offset = 0 279 280 def write_term(self, term, offset): 281 282 """ 283 Write the given 'term' and its position file 'offset' to the term 284 information file. Return the offset after the term information was 285 written to the file. 286 """ 287 288 # Too long terms are not currently supported. 289 290 if len(term) > 255: 291 raise ValueError, "Term %r is too long." % term 292 293 # Write the prefix length and term suffix. 294 295 common = len(commonprefix([self.last_term, term])) 296 suffix = term[common:] 297 298 self.write_number(common) 299 self.write_string(suffix) 300 301 # Write the offset delta. 302 303 self.write_number(offset - self.last_offset) 304 305 self.last_term = term 306 self.last_offset = offset 307 308 return self.f.tell() 309 310 class TermReader(FileReader): 311 312 "Reading term information from files." 313 314 def reset(self): 315 self.last_term = "" 316 self.last_offset = 0 317 318 def read_term(self): 319 320 """ 321 Read a term and its position file offset from the term information file. 322 """ 323 324 # Read the prefix length and term suffix. 325 326 common = self.read_number() 327 suffix = self.read_string() 328 329 self.last_term = self.last_term[:common] + suffix 330 331 # Read the offset delta. 332 333 self.last_offset += self.read_number() 334 335 return self.last_term, self.last_offset 336 337 def go_to_term(self, term, offset, info_offset): 338 339 "Seek past the entry for 'term' having 'offset' to 'info_offset'." 340 341 self.f.seek(info_offset) 342 self.last_term = term 343 self.last_offset = offset 344 345 class TermIndexWriter(TermWriter): 346 347 "Writing term dictionary index details to files." 348 349 def reset(self): 350 TermWriter.reset(self) 351 self.last_info_offset = 0 352 353 def write_term(self, term, offset, info_offset): 354 355 """ 356 Write the given 'term' and its position file 'offset' to the term 357 dictionary index file, along with the 'info_offset' in the term 358 information file. 359 """ 360 361 TermWriter.write_term(self, term, offset) 362 363 # Write the information file offset delta. 364 365 self.write_number(info_offset - self.last_info_offset) 366 self.last_info_offset = info_offset 367 368 class TermIndexReader(TermReader): 369 370 "Reading term dictionary index details from files." 371 372 def reset(self): 373 TermReader.reset(self) 374 self.last_info_offset = 0 375 376 def read_term(self): 377 378 """ 379 Read a term, its position file offset, and its term information file 380 offset from the term dictionary index file. 381 """ 382 383 term, offset = TermReader.read_term(self) 384 385 # Read the offset delta. 386 387 self.last_info_offset += self.read_number() 388 389 return term, offset, self.last_info_offset 390 391 class TermDictionaryWriter: 392 393 "Writing term dictionaries." 394 395 def __init__(self, info_writer, index_writer, position_writer, interval): 396 self.info_writer = info_writer 397 self.index_writer = index_writer 398 self.position_writer = position_writer 399 self.interval = interval 400 self.entry = 0 401 402 def write_term(self, term, offset): 403 404 """ 405 Write the given 'term' and its position file 'offset' to the term 406 information file and optionally to the index, making a dictionary entry. 407 """ 408 409 info_offset = self.info_writer.write_term(term, offset) 410 411 if self.entry % self.interval == 0: 412 self.index_writer.write_term(term, offset, info_offset) 413 414 self.entry += 1 415 416 def write_term_positions(self, term, doc_positions): 417 418 """ 419 Write the given 'term' and the 'doc_positions' recording the documents 420 and positions at which the term is found. 421 """ 422 423 offset = self.position_writer.write_all_positions(doc_positions) 424 self.write_term(term, offset) 425 426 def close(self): 427 self.info_writer.close() 428 self.index_writer.close() 429 self.position_writer.close() 430 431 class TermDictionaryReader: 432 433 "Reading term dictionaries." 434 435 def __init__(self, info_reader, index_reader, position_reader): 436 self.info_reader = info_reader 437 self.index_reader = index_reader 438 self.position_reader = position_reader 439 440 self.terms = [] 441 try: 442 while 1: 443 self.terms.append(self.index_reader.read_term()) 444 except EOFError: 445 pass 446 447 # Large numbers for ordering purposes. 448 449 self.max_offset = self.terms[-1][1] 450 self.max_info_offset = self.terms[-1][2] 451 452 def find_term(self, term): 453 454 "Find the position file offset of 'term' from the term dictionary." 455 456 i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1 457 458 # Get the entry position providing the term or one preceding it. 459 460 if i == -1: 461 return None 462 463 found_term, offset, info_offset = self.terms[i] 464 465 # Where the term is found immediately, return the offset. 466 467 if term == found_term: 468 return offset 469 470 # Otherwise, seek past the index term's entry in the information file 471 # and scan for the desired term. 472 473 else: 474 self.info_reader.go_to_term(found_term, offset, info_offset) 475 try: 476 while term > found_term: 477 found_term, offset = self.info_reader.read_term() 478 except EOFError: 479 pass 480 481 # If the term is found, return the offset. 482 483 if term == found_term: 484 return offset 485 else: 486 return None 487 488 def find_positions(self, term): 489 490 "Return the documents and positions at which the given 'term' is found." 491 492 offset = self.find_term(term) 493 if offset is None: 494 return None 495 else: 496 return self.position_reader.read_all_positions(offset) 497 498 def close(self): 499 self.info_reader.close() 500 self.index_reader.close() 501 self.position_reader.close() 502 503 class FieldWriter(FileWriter): 504 505 "Writing field data to files." 506 507 def write_fields(self, fields): 508 509 """ 510 Write the given list of 'fields' (strings representing field values). 511 Return the offset at which the fields are stored. 512 """ 513 514 offset = self.f.tell() 515 516 # Write the number of fields. 517 518 self.write_number(len(fields)) 519 520 # Write the fields themselves. 521 522 for field in fields: 523 self.write_string(field, 0) # compress 524 525 return offset 526 527 class FieldReader(FileReader): 528 529 "Reading field data from files." 530 531 def read_fields(self): 532 533 "Read fields from the file, returning the field values in a list." 534 535 # Read the number of fields. 536 537 nfields = self.read_number() 538 539 # Collect the fields. 540 541 fields = [] 542 i = 0 543 544 while i < nfields: 545 fields.append(self.read_string(0)) # decompress 546 i += 1 547 548 return fields 549 550 def read_doc_fields(self, offset): 551 552 "Read all fields at the given 'offset." 553 554 self.f.seek(offset) 555 return self.read_fields() 556 557 # High-level classes. 558 559 class IndexWriter: 560 561 "Building term information and writing it to the term dictionary." 562 563 def __init__(self, dict_writer): 564 self.dict_writer = dict_writer 565 self.terms = {} 566 567 def add_position(self, term, docnum, position): 568 569 """ 570 Add a position entry for the given 'term' in the document with the given 571 'docnum', indicating the given 'position'. 572 """ 573 574 if not self.terms.has_key(term): 575 doc_positions = self.terms[term] = {} 576 else: 577 doc_positions = self.terms[term] 578 579 if not doc_positions.has_key(docnum): 580 doc = doc_positions[docnum] = [] 581 else: 582 doc = doc_positions[docnum] 583 584 doc.append(position) 585 586 def close(self): 587 if self.dict_writer is None: 588 return 589 590 # Get the terms in order. 591 592 terms = self.terms.items() 593 terms.sort() 594 595 for term, doc_positions in terms: 596 doc_positions = doc_positions.items() 597 doc_positions.sort() 598 self.dict_writer.write_term_positions(term, doc_positions) 599 600 self.dict_writer.close() 601 self.dict_writer = None 602 603 class Index: 604 605 "An inverted index solution encapsulating the various components." 606 607 def __init__(self, pathname): 608 self.pathname = pathname 609 self.reader = None 610 self.writer = None 611 612 def get_writer(self, interval=INTERVAL): 613 614 "Return a writer, optionally using the given indexing 'interval'." 615 616 if not exists(self.pathname): 617 mkdir(self.pathname) 618 619 tdf = open(join(self.pathname, "terms"), "wb") 620 info_writer = TermWriter(tdf) 621 622 tdif = open(join(self.pathname, "index"), "wb") 623 index_writer = TermIndexWriter(tdif) 624 625 tpf = open(join(self.pathname, "positions"), "wb") 626 positions_writer = PositionWriter(tpf) 627 628 dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval) 629 630 self.writer = IndexWriter(dict_writer) 631 return self.writer 632 633 def get_reader(self): 634 635 "Return a reader for the index." 636 637 if not exists(self.pathname): 638 raise OSError, "Index path %r does not exist." % self.pathname 639 640 tdf = open(join(self.pathname, "terms"), "rb") 641 info_reader = TermReader(tdf) 642 643 tdif = open(join(self.pathname, "index"), "rb") 644 index_reader = TermIndexReader(tdif) 645 646 tpf = open(join(self.pathname, "positions"), "rb") 647 positions_reader = PositionReader(tpf) 648 649 self.reader = TermDictionaryReader(info_reader, index_reader, positions_reader) 650 return self.reader 651 652 def close(self): 653 if self.reader is not None: 654 self.reader.close() 655 self.reader = None 656 if self.writer is not None: 657 self.writer.close() 658 self.writer = None 659 660 # vim: tabstop=4 expandtab shiftwidth=4