1 #!/usr/bin/env python 2 3 """ 4 A simple (and sane) text indexing library. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from os.path import commonprefix # to find common string prefixes 22 from bisect import bisect_right # to find terms in the dictionary index 23 24 # Foundation classes. 25 26 class File: 27 28 "A basic file abstraction." 29 30 def __init__(self, f): 31 self.f = f 32 self.reset() 33 34 def reset(self): 35 pass 36 37 def close(self): 38 self.f.close() 39 40 class FileWriter(File): 41 42 "Writing basic data types to files." 43 44 def write_number(self, number): 45 46 "Write 'number' to the file using a variable length encoding." 47 48 # Negative numbers are not supported. 49 50 if number < 0: 51 raise ValueError, "Number %r is negative." % number 52 53 # Special case: one byte containing zero. 54 55 elif number == 0: 56 self.f.write(chr(0)) 57 return 58 59 # Write the number from least to most significant digits. 60 61 bytes = [] 62 63 while number != 0: 64 lsd = number & 127 65 number = number >> 7 66 if number != 0: 67 lsd |= 128 68 bytes.append(chr(lsd)) 69 70 record = "".join(bytes) 71 self.f.write(record) 72 73 def write_string(self, s): 74 75 "Write 's' to the file, recording its length." 76 77 length = len(s) 78 79 if not (0 <= length <= 255): 80 raise ValueError, "String %r is too long." % s 81 82 self.write_number(length) 83 self.f.write(s) 84 85 class FileReader(File): 86 87 "Reading basic data types from files." 88 89 def read_number(self): 90 91 "Read a number from the file." 92 93 # Read each byte, adding it to the number. 94 95 shift = 0 96 number = 0 97 more = 1 98 99 while more: 100 byte = self.f.read(1) 101 if not byte: 102 raise EOFError 103 104 csd = ord(byte) 105 more = csd & 128 != 0 106 if more: 107 csd &= 127 108 number += (csd << shift) 109 shift += 7 110 111 return number 112 113 def read_string(self): 114 115 "Read a string from the file." 116 117 length = self.read_number() 118 return self.f.read(length) 119 120 # Specific classes. 121 122 class PositionWriter(FileWriter): 123 124 "Writing position information to files." 125 126 def reset(self): 127 self.last_docnum = 0 128 129 def write_positions(self, docnum, positions): 130 131 "Write for the document 'docnum' the given 'positions'." 132 133 if docnum < self.last_docnum: 134 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 135 136 # Write the document number delta. 137 138 self.write_number(docnum - self.last_docnum) 139 140 # Write the number of positions. 141 142 self.write_number(len(positions)) 143 144 # Write the position deltas. 145 146 last = 0 147 for position in positions: 148 pos = position - last 149 self.write_number(pos) 150 last = position 151 152 self.last_docnum = docnum 153 154 def write_all_positions(self, doc_positions): 155 156 """ 157 Write all 'doc_positions' - a collection of tuples of the form (document 158 number, position list) - to the file, returning the offset at which they 159 were stored. 160 """ 161 162 # Reset the writer and record the current file offset. 163 164 self.reset() 165 offset = self.f.tell() 166 167 # Write the number of documents. 168 169 self.write_number(len(doc_positions)) 170 171 # Write the positions. 172 173 for docnum, positions in doc_positions: 174 self.write_positions(docnum, positions) 175 176 return offset 177 178 class PositionReader(FileReader): 179 180 "Reading position information from files." 181 182 def reset(self): 183 self.last_docnum = 0 184 185 def read_positions(self): 186 187 "Read positions, returning a document number and a list of positions." 188 189 # Read the document number delta and add it to the last number. 190 191 self.last_docnum += self.read_number() 192 193 # Read the number of positions. 194 195 npositions = self.read_number() 196 197 # Read the position deltas, adding each previous position to get the 198 # appropriate collection of absolute positions. 199 200 i = 0 201 last = 0 202 positions = [] 203 204 while i < npositions: 205 last += self.read_number() 206 positions.append(last) 207 i += 1 208 209 return self.last_docnum, positions 210 211 def read_all_positions(self, offset): 212 213 """ 214 Read all positions from 'offset', seeking to that position in the file 215 before reading. 216 """ 217 218 self.reset() 219 self.f.seek(offset) 220 221 # Read the number of documents. 222 223 ndocuments = self.read_number() 224 225 # Read all records. 226 227 i = 0 228 doc_positions = [] 229 230 while i < ndocuments: 231 doc_positions.append(self.read_positions()) 232 i += 1 233 234 return doc_positions 235 236 class TermWriter(FileWriter): 237 238 "Writing term information to files." 239 240 def reset(self): 241 self.last_term = "" 242 self.last_offset = 0 243 244 def write_term(self, term, offset): 245 246 """ 247 Write the given 'term' and its position file 'offset' to the term 248 information file. Return the offset after the term information was 249 written to the file. 250 """ 251 252 # Too long terms are not currently supported. 253 254 if len(term) > 255: 255 raise ValueError, "Term %r is too long." % term 256 257 # Write the prefix length and term suffix. 258 259 common = len(commonprefix([self.last_term, term])) 260 suffix = term[common:] 261 262 self.write_number(common) 263 self.write_string(suffix) 264 265 # Write the offset delta. 266 267 self.write_number(offset - self.last_offset) 268 269 self.last_term = term 270 self.last_offset = offset 271 272 return self.f.tell() 273 274 class TermReader(FileReader): 275 276 "Reading term information from files." 277 278 def reset(self): 279 self.last_term = "" 280 self.last_offset = 0 281 282 def read_term(self): 283 284 """ 285 Read a term and its position file offset from the term information file. 286 """ 287 288 # Read the prefix length and term suffix. 289 290 common = self.read_number() 291 suffix = self.read_string() 292 293 self.last_term = self.last_term[:common] + suffix 294 295 # Read the offset delta. 296 297 self.last_offset += self.read_number() 298 299 return self.last_term, self.last_offset 300 301 def go_to_term(self, term, offset, info_offset): 302 303 "Seek past the entry for 'term' having 'offset' to 'info_offset'." 304 305 self.f.seek(info_offset) 306 self.last_term = term 307 self.last_offset = offset 308 309 class TermIndexWriter(TermWriter): 310 311 "Writing term dictionary index details to files." 312 313 def reset(self): 314 TermWriter.reset(self) 315 self.last_info_offset = 0 316 317 def write_term(self, term, offset, info_offset): 318 319 """ 320 Write the given 'term' and its position file 'offset' to the term 321 dictionary index file, along with the 'info_offset' in the term 322 information file. 323 """ 324 325 TermWriter.write_term(self, term, offset) 326 327 # Write the information file offset delta. 328 329 self.write_number(info_offset - self.last_info_offset) 330 self.last_info_offset = info_offset 331 332 class TermIndexReader(TermReader): 333 334 "Reading term dictionary index details from files." 335 336 def reset(self): 337 TermReader.reset(self) 338 self.last_info_offset = 0 339 340 def read_term(self): 341 342 """ 343 Read a term, its position file offset, and its term information file 344 offset from the term dictionary index file. 345 """ 346 347 term, offset = TermReader.read_term(self) 348 349 # Read the offset delta. 350 351 self.last_info_offset += self.read_number() 352 353 return term, offset, self.last_info_offset 354 355 class TermDictionaryWriter: 356 357 "Writing term dictionaries." 358 359 def __init__(self, info_writer, index_writer, position_writer, interval): 360 self.info_writer = info_writer 361 self.index_writer = index_writer 362 self.position_writer = position_writer 363 self.interval = interval 364 self.entry = 0 365 366 def write_term(self, term, offset): 367 368 """ 369 Write the given 'term' and its position file 'offset' to the term 370 information file and optionally to the index, making a dictionary entry. 371 """ 372 373 info_offset = self.info_writer.write_term(term, offset) 374 375 if self.entry % self.interval == 0: 376 self.index_writer.write_term(term, offset, info_offset) 377 378 self.entry += 1 379 380 def write_term_positions(self, term, doc_positions): 381 382 """ 383 Write the given 'term' and the 'doc_positions' recording the documents 384 and positions at which the term is found. 385 """ 386 387 offset = self.position_writer.write_all_positions(doc_positions) 388 self.write_term(term, offset) 389 390 def close(self): 391 self.info_writer.close() 392 self.index_writer.close() 393 self.position_writer.close() 394 395 class TermDictionaryReader: 396 397 "Reading term dictionaries." 398 399 def __init__(self, info_reader, index_reader, position_reader): 400 self.info_reader = info_reader 401 self.index_reader = index_reader 402 self.position_reader = position_reader 403 404 self.terms = [] 405 try: 406 while 1: 407 self.terms.append(self.index_reader.read_term()) 408 except EOFError: 409 pass 410 411 # Large numbers for ordering purposes. 412 413 self.max_offset = self.terms[-1][1] 414 self.max_info_offset = self.terms[-1][2] 415 416 def find_term(self, term): 417 418 "Find the position file offset of 'term' from the term dictionary." 419 420 i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1 421 422 # Get the entry position providing the term or one preceding it. 423 424 if i == -1: 425 return None 426 427 found_term, offset, info_offset = self.terms[i] 428 429 # Where the term is found immediately, return the offset. 430 431 if term == found_term: 432 return offset 433 434 # Otherwise, seek past the index term's entry in the information file 435 # and scan for the desired term. 436 437 else: 438 self.info_reader.go_to_term(found_term, offset, info_offset) 439 try: 440 while term > found_term: 441 found_term, offset = self.info_reader.read_term() 442 except EOFError: 443 pass 444 445 # If the term is found, return the offset. 446 447 if term == found_term: 448 return offset 449 else: 450 return None 451 452 def find_positions(self, term): 453 454 "Return the documents and positions at which the given 'term' is found." 455 456 offset = self.find_term(term) 457 if offset is None: 458 return None 459 else: 460 return self.position_reader.read_all_positions(offset) 461 462 def close(self): 463 self.info_reader.close() 464 self.index_reader.close() 465 self.position_reader.close() 466 467 class IndexWriter: 468 469 "Building term information and writing it to the term dictionary." 470 471 def __init__(self, dict_writer): 472 self.dict_writer = dict_writer 473 self.terms = {} 474 475 def add_position(self, term, docnum, position): 476 477 """ 478 Add a position entry for the given 'term' in the document with the given 479 'docnum', indicating the given 'position'. 480 """ 481 482 if not self.terms.has_key(term): 483 doc_positions = self.terms[term] = {} 484 else: 485 doc_positions = self.terms[term] 486 487 if not doc_positions.has_key(docnum): 488 doc = doc_positions[docnum] = [] 489 else: 490 doc = doc_positions[docnum] 491 492 doc.append(position) 493 494 def close(self): 495 496 # Get the terms in order. 497 498 terms = self.terms.items() 499 terms.sort() 500 501 for term, doc_positions in terms: 502 doc_positions = doc_positions.items() 503 doc_positions.sort() 504 self.dict_writer.write_term_positions(term, doc_positions) 505 506 self.dict_writer.close() 507 508 # vim: tabstop=4 expandtab shiftwidth=4