1 #!/usr/bin/env python 2 3 """ 4 A simple (and sane) text indexing library. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from os import mkdir # to determine whether to create indexes 22 from os.path import exists, join 23 from os.path import commonprefix # to find common string prefixes 24 from bisect import bisect_right # to find terms in the dictionary index 25 26 # Constants. 27 28 INTERVAL = 100 29 30 # Foundation classes. 31 32 class File: 33 34 "A basic file abstraction." 35 36 def __init__(self, f): 37 self.f = f 38 self.reset() 39 40 def reset(self): 41 pass 42 43 def close(self): 44 if self.f is not None: 45 self.f.close() 46 self.f = None 47 48 class FileWriter(File): 49 50 "Writing basic data types to files." 51 52 def write_number(self, number): 53 54 "Write 'number' to the file using a variable length encoding." 55 56 # Negative numbers are not supported. 57 58 if number < 0: 59 raise ValueError, "Number %r is negative." % number 60 61 # Special case: one byte containing zero. 62 63 elif number == 0: 64 self.f.write(chr(0)) 65 return 66 67 # Write the number from least to most significant digits. 68 69 bytes = [] 70 71 while number != 0: 72 lsd = number & 127 73 number = number >> 7 74 if number != 0: 75 lsd |= 128 76 bytes.append(chr(lsd)) 77 78 record = "".join(bytes) 79 self.f.write(record) 80 81 def write_string(self, s): 82 83 "Write 's' to the file, recording its length." 84 85 # Convert Unicode objects to strings. 86 87 if isinstance(s, unicode): 88 s = s.encode("utf-8") 89 90 length = len(s) 91 92 if not (0 <= length <= 255): 93 raise ValueError, "String %r is too long." % s 94 95 self.write_number(length) 96 self.f.write(s) 97 98 class FileReader(File): 99 100 "Reading basic data types from files." 101 102 def read_number(self): 103 104 "Read a number from the file." 105 106 # Read each byte, adding it to the number. 107 108 shift = 0 109 number = 0 110 more = 1 111 112 while more: 113 byte = self.f.read(1) 114 if not byte: 115 raise EOFError 116 117 csd = ord(byte) 118 more = csd & 128 != 0 119 if more: 120 csd &= 127 121 number += (csd << shift) 122 shift += 7 123 124 return number 125 126 def read_string(self): 127 128 "Read a string from the file." 129 130 length = self.read_number() 131 132 # Convert strings to Unicode objects. 133 134 return unicode(self.f.read(length), "utf-8") 135 136 # Specific classes. 137 138 class PositionWriter(FileWriter): 139 140 "Writing position information to files." 141 142 def reset(self): 143 self.last_docnum = 0 144 145 def write_positions(self, docnum, positions): 146 147 "Write for the document 'docnum' the given 'positions'." 148 149 if docnum < self.last_docnum: 150 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 151 152 # Write the document number delta. 153 154 self.write_number(docnum - self.last_docnum) 155 156 # Write the number of positions. 157 158 self.write_number(len(positions)) 159 160 # Make sure that the positions are sorted. 161 162 positions.sort() 163 164 # Write the position deltas. 165 166 last = 0 167 for position in positions: 168 pos = position - last 169 self.write_number(pos) 170 last = position 171 172 self.last_docnum = docnum 173 174 def write_all_positions(self, doc_positions): 175 176 """ 177 Write all 'doc_positions' - a collection of tuples of the form (document 178 number, position list) - to the file, returning the offset at which they 179 were stored. 180 """ 181 182 # Reset the writer and record the current file offset. 183 184 self.reset() 185 offset = self.f.tell() 186 187 # Write the number of documents. 188 189 self.write_number(len(doc_positions)) 190 191 # Write the positions. 192 193 for docnum, positions in doc_positions: 194 self.write_positions(docnum, positions) 195 196 return offset 197 198 class PositionReader(FileReader): 199 200 "Reading position information from files." 201 202 def reset(self): 203 self.last_docnum = 0 204 205 def read_positions(self): 206 207 "Read positions, returning a document number and a list of positions." 208 209 # Read the document number delta and add it to the last number. 210 211 self.last_docnum += self.read_number() 212 213 # Read the number of positions. 214 215 npositions = self.read_number() 216 217 # Read the position deltas, adding each previous position to get the 218 # appropriate collection of absolute positions. 219 220 i = 0 221 last = 0 222 positions = [] 223 224 while i < npositions: 225 last += self.read_number() 226 positions.append(last) 227 i += 1 228 229 return self.last_docnum, positions 230 231 def read_all_positions(self, offset): 232 233 """ 234 Read all positions from 'offset', seeking to that position in the file 235 before reading. 236 """ 237 238 self.reset() 239 self.f.seek(offset) 240 241 # Read the number of documents. 242 243 ndocuments = self.read_number() 244 245 # Read all records. 246 247 i = 0 248 doc_positions = [] 249 250 while i < ndocuments: 251 doc_positions.append(self.read_positions()) 252 i += 1 253 254 return doc_positions 255 256 class TermWriter(FileWriter): 257 258 "Writing term information to files." 259 260 def reset(self): 261 self.last_term = "" 262 self.last_offset = 0 263 264 def write_term(self, term, offset): 265 266 """ 267 Write the given 'term' and its position file 'offset' to the term 268 information file. Return the offset after the term information was 269 written to the file. 270 """ 271 272 # Too long terms are not currently supported. 273 274 if len(term) > 255: 275 raise ValueError, "Term %r is too long." % term 276 277 # Write the prefix length and term suffix. 278 279 common = len(commonprefix([self.last_term, term])) 280 suffix = term[common:] 281 282 self.write_number(common) 283 self.write_string(suffix) 284 285 # Write the offset delta. 286 287 self.write_number(offset - self.last_offset) 288 289 self.last_term = term 290 self.last_offset = offset 291 292 return self.f.tell() 293 294 class TermReader(FileReader): 295 296 "Reading term information from files." 297 298 def reset(self): 299 self.last_term = "" 300 self.last_offset = 0 301 302 def read_term(self): 303 304 """ 305 Read a term and its position file offset from the term information file. 306 """ 307 308 # Read the prefix length and term suffix. 309 310 common = self.read_number() 311 suffix = self.read_string() 312 313 self.last_term = self.last_term[:common] + suffix 314 315 # Read the offset delta. 316 317 self.last_offset += self.read_number() 318 319 return self.last_term, self.last_offset 320 321 def go_to_term(self, term, offset, info_offset): 322 323 "Seek past the entry for 'term' having 'offset' to 'info_offset'." 324 325 self.f.seek(info_offset) 326 self.last_term = term 327 self.last_offset = offset 328 329 class TermIndexWriter(TermWriter): 330 331 "Writing term dictionary index details to files." 332 333 def reset(self): 334 TermWriter.reset(self) 335 self.last_info_offset = 0 336 337 def write_term(self, term, offset, info_offset): 338 339 """ 340 Write the given 'term' and its position file 'offset' to the term 341 dictionary index file, along with the 'info_offset' in the term 342 information file. 343 """ 344 345 TermWriter.write_term(self, term, offset) 346 347 # Write the information file offset delta. 348 349 self.write_number(info_offset - self.last_info_offset) 350 self.last_info_offset = info_offset 351 352 class TermIndexReader(TermReader): 353 354 "Reading term dictionary index details from files." 355 356 def reset(self): 357 TermReader.reset(self) 358 self.last_info_offset = 0 359 360 def read_term(self): 361 362 """ 363 Read a term, its position file offset, and its term information file 364 offset from the term dictionary index file. 365 """ 366 367 term, offset = TermReader.read_term(self) 368 369 # Read the offset delta. 370 371 self.last_info_offset += self.read_number() 372 373 return term, offset, self.last_info_offset 374 375 class TermDictionaryWriter: 376 377 "Writing term dictionaries." 378 379 def __init__(self, info_writer, index_writer, position_writer, interval): 380 self.info_writer = info_writer 381 self.index_writer = index_writer 382 self.position_writer = position_writer 383 self.interval = interval 384 self.entry = 0 385 386 def write_term(self, term, offset): 387 388 """ 389 Write the given 'term' and its position file 'offset' to the term 390 information file and optionally to the index, making a dictionary entry. 391 """ 392 393 info_offset = self.info_writer.write_term(term, offset) 394 395 if self.entry % self.interval == 0: 396 self.index_writer.write_term(term, offset, info_offset) 397 398 self.entry += 1 399 400 def write_term_positions(self, term, doc_positions): 401 402 """ 403 Write the given 'term' and the 'doc_positions' recording the documents 404 and positions at which the term is found. 405 """ 406 407 offset = self.position_writer.write_all_positions(doc_positions) 408 self.write_term(term, offset) 409 410 def close(self): 411 self.info_writer.close() 412 self.index_writer.close() 413 self.position_writer.close() 414 415 class TermDictionaryReader: 416 417 "Reading term dictionaries." 418 419 def __init__(self, info_reader, index_reader, position_reader): 420 self.info_reader = info_reader 421 self.index_reader = index_reader 422 self.position_reader = position_reader 423 424 self.terms = [] 425 try: 426 while 1: 427 self.terms.append(self.index_reader.read_term()) 428 except EOFError: 429 pass 430 431 # Large numbers for ordering purposes. 432 433 self.max_offset = self.terms[-1][1] 434 self.max_info_offset = self.terms[-1][2] 435 436 def find_term(self, term): 437 438 "Find the position file offset of 'term' from the term dictionary." 439 440 i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1 441 442 # Get the entry position providing the term or one preceding it. 443 444 if i == -1: 445 return None 446 447 found_term, offset, info_offset = self.terms[i] 448 449 # Where the term is found immediately, return the offset. 450 451 if term == found_term: 452 return offset 453 454 # Otherwise, seek past the index term's entry in the information file 455 # and scan for the desired term. 456 457 else: 458 self.info_reader.go_to_term(found_term, offset, info_offset) 459 try: 460 while term > found_term: 461 found_term, offset = self.info_reader.read_term() 462 except EOFError: 463 pass 464 465 # If the term is found, return the offset. 466 467 if term == found_term: 468 return offset 469 else: 470 return None 471 472 def find_positions(self, term): 473 474 "Return the documents and positions at which the given 'term' is found." 475 476 offset = self.find_term(term) 477 if offset is None: 478 return None 479 else: 480 return self.position_reader.read_all_positions(offset) 481 482 def close(self): 483 self.info_reader.close() 484 self.index_reader.close() 485 self.position_reader.close() 486 487 class IndexWriter: 488 489 "Building term information and writing it to the term dictionary." 490 491 def __init__(self, dict_writer): 492 self.dict_writer = dict_writer 493 self.terms = {} 494 495 def add_position(self, term, docnum, position): 496 497 """ 498 Add a position entry for the given 'term' in the document with the given 499 'docnum', indicating the given 'position'. 500 """ 501 502 if not self.terms.has_key(term): 503 doc_positions = self.terms[term] = {} 504 else: 505 doc_positions = self.terms[term] 506 507 if not doc_positions.has_key(docnum): 508 doc = doc_positions[docnum] = [] 509 else: 510 doc = doc_positions[docnum] 511 512 doc.append(position) 513 514 def close(self): 515 if self.dict_writer is None: 516 return 517 518 # Get the terms in order. 519 520 terms = self.terms.items() 521 terms.sort() 522 523 for term, doc_positions in terms: 524 doc_positions = doc_positions.items() 525 doc_positions.sort() 526 self.dict_writer.write_term_positions(term, doc_positions) 527 528 self.dict_writer.close() 529 self.dict_writer = None 530 531 class Index: 532 533 "An inverted index solution encapsulating the various components." 534 535 def __init__(self, pathname): 536 self.pathname = pathname 537 self.reader = None 538 self.writer = None 539 540 def get_writer(self, interval=INTERVAL): 541 542 "Return a writer, optionally using the given indexing 'interval'." 543 544 if not exists(self.pathname): 545 mkdir(self.pathname) 546 547 tdf = open(join(self.pathname, "terms"), "wb") 548 info_writer = TermWriter(tdf) 549 550 tdif = open(join(self.pathname, "index"), "wb") 551 index_writer = TermIndexWriter(tdif) 552 553 tpf = open(join(self.pathname, "positions"), "wb") 554 positions_writer = PositionWriter(tpf) 555 556 dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval) 557 558 self.writer = IndexWriter(dict_writer) 559 return self.writer 560 561 def get_reader(self): 562 563 "Return a reader for the index." 564 565 if not exists(self.pathname): 566 raise OSError, "Index path %r does not exist." % self.pathname 567 568 tdf = open(join(self.pathname, "terms"), "rb") 569 info_reader = TermReader(tdf) 570 571 tdif = open(join(self.pathname, "index"), "rb") 572 index_reader = TermIndexReader(tdif) 573 574 tpf = open(join(self.pathname, "positions"), "rb") 575 positions_reader = PositionReader(tpf) 576 577 self.reader = TermDictionaryReader(info_reader, index_reader, positions_reader) 578 return self.reader 579 580 def close(self): 581 if self.reader is not None: 582 self.reader.close() 583 self.reader = None 584 if self.writer is not None: 585 self.writer.close() 586 self.writer = None 587 588 # vim: tabstop=4 expandtab shiftwidth=4