1 #!/usr/bin/env python 2 3 """ 4 A simple (and sane) text indexing library. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from os.path import commonprefix # to find common string prefixes 22 from bisect import bisect_right # to find terms in the dictionary index 23 24 # Foundation classes. 25 26 class File: 27 28 "A basic file abstraction." 29 30 def __init__(self, f): 31 self.f = f 32 self.reset() 33 34 def reset(self): 35 pass 36 37 def close(self): 38 self.f.close() 39 40 class FileWriter(File): 41 42 "Writing basic data types to files." 43 44 def write_number(self, number): 45 46 "Write 'number' to the file using a variable length encoding." 47 48 # Negative numbers are not supported. 49 50 if number < 0: 51 raise ValueError, "Number %r is negative." % number 52 53 # Special case: one byte containing zero. 54 55 elif number == 0: 56 self.f.write(chr(1) + chr(0)) 57 return 58 59 # Write the number from least to most significant digits. 60 61 nbytes = 0 62 bytes = [] 63 64 while number != 0: 65 lsd = number & 255 66 bytes.append(chr(lsd)) 67 number = number >> 8 68 nbytes += 1 69 70 # Too large numbers are not supported. 71 72 if nbytes > 255: 73 raise ValueError, "Number %r is too large." % number 74 75 bytes.insert(0, chr(nbytes)) 76 record = "".join(bytes) 77 self.f.write(record) 78 79 def write_unsigned_byte(self, number): 80 81 "Write 'number' to the file using a single byte." 82 83 if not (0 <= number <= 255): 84 raise ValueError, "Number %r is out of range." % number 85 86 self.f.write(chr(number)) 87 88 def write_string(self, s): 89 90 "Write 's' to the file, recording its length." 91 92 length = len(s) 93 94 if not (0 <= length <= 255): 95 raise ValueError, "String %r is too long." % s 96 97 self.write_unsigned_byte(length) 98 self.f.write(s) 99 100 class FileReader(File): 101 102 "Reading basic data types from files." 103 104 def read_number(self): 105 106 "Read a number from the file." 107 108 nbytes = self.read_unsigned_byte() 109 110 # Read each byte, adding it to the number. 111 112 bytes = self.f.read(nbytes) 113 114 i = 0 115 shift = 0 116 number = 0 117 118 while i < nbytes: 119 csd = ord(bytes[i]) 120 number += (csd << shift) 121 shift += 8 122 i += 1 123 124 return number 125 126 def read_unsigned_byte(self): 127 128 "Read a number from the file, consuming a single byte." 129 130 s = self.f.read(1) 131 if not s: 132 raise EOFError 133 134 return ord(s) 135 136 def read_string(self): 137 138 "Read a string from the file." 139 140 length = self.read_unsigned_byte() 141 return self.f.read(length) 142 143 # Specific classes. 144 145 class PositionWriter(FileWriter): 146 147 "Writing position information to files." 148 149 def reset(self): 150 self.last_docnum = 0 151 152 def write_positions(self, docnum, positions): 153 154 "Write for the document 'docnum' the given 'positions'." 155 156 if docnum < self.last_docnum: 157 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 158 159 # Write the document number delta. 160 161 self.write_number(docnum - self.last_docnum) 162 163 # Write the number of positions. 164 165 self.write_number(len(positions)) 166 167 # Write the position deltas. 168 169 last = 0 170 for position in positions: 171 pos = position - last 172 self.write_number(pos) 173 last = position 174 175 self.last_docnum = docnum 176 177 def write_all_positions(self, doc_positions): 178 179 """ 180 Write all 'doc_positions' - a collection of tuples of the form (document 181 number, position list) - to the file, returning the offset at which they 182 were stored. 183 """ 184 185 # Reset the writer and record the current file offset. 186 187 self.reset() 188 offset = self.f.tell() 189 190 # Write the number of documents. 191 192 self.write_number(len(doc_positions)) 193 194 # Write the positions. 195 196 for docnum, positions in doc_positions: 197 self.write_positions(docnum, positions) 198 199 return offset 200 201 class PositionReader(FileReader): 202 203 "Reading position information from files." 204 205 def reset(self): 206 self.last_docnum = 0 207 208 def read_positions(self): 209 210 "Read positions, returning a document number and a list of positions." 211 212 # Read the document number delta and add it to the last number. 213 214 self.last_docnum += self.read_number() 215 216 # Read the number of positions. 217 218 npositions = self.read_number() 219 220 # Read the position deltas, adding each previous position to get the 221 # appropriate collection of absolute positions. 222 223 i = 0 224 last = 0 225 positions = [] 226 227 while i < npositions: 228 last += self.read_number() 229 positions.append(last) 230 i += 1 231 232 return self.last_docnum, positions 233 234 def read_all_positions(self, offset): 235 236 """ 237 Read all positions from 'offset', seeking to that position in the file 238 before reading. 239 """ 240 241 self.reset() 242 self.f.seek(offset) 243 244 # Read the number of documents. 245 246 ndocuments = self.read_number() 247 248 # Read all records. 249 250 i = 0 251 doc_positions = [] 252 253 while i < ndocuments: 254 doc_positions.append(self.read_positions()) 255 i += 1 256 257 return doc_positions 258 259 class TermWriter(FileWriter): 260 261 "Writing term information to files." 262 263 def reset(self): 264 self.last_term = "" 265 self.last_offset = 0 266 267 def write_term(self, term, offset): 268 269 """ 270 Write the given 'term' and its position file 'offset' to the term 271 information file. Return the offset after the term information was 272 written to the file. 273 """ 274 275 # Too long terms are not currently supported. 276 277 if len(term) > 255: 278 raise ValueError, "Term %r is too long." % term 279 280 # Write the prefix length and term suffix. 281 282 common = len(commonprefix([self.last_term, term])) 283 suffix = term[common:] 284 285 self.write_unsigned_byte(common) 286 self.write_string(suffix) 287 288 # Write the offset delta. 289 290 self.write_number(offset - self.last_offset) 291 292 self.last_term = term 293 self.last_offset = offset 294 295 return self.f.tell() 296 297 class TermReader(FileReader): 298 299 "Reading term information from files." 300 301 def reset(self): 302 self.last_term = "" 303 self.last_offset = 0 304 305 def read_term(self): 306 307 """ 308 Read a term and its position file offset from the term information file. 309 """ 310 311 # Read the prefix length and term suffix. 312 313 common = self.read_unsigned_byte() 314 suffix = self.read_string() 315 316 self.last_term = self.last_term[:common] + suffix 317 318 # Read the offset delta. 319 320 self.last_offset += self.read_number() 321 322 return self.last_term, self.last_offset 323 324 def go_to_term(self, term, offset, info_offset): 325 326 "Seek past the entry for 'term' having 'offset' to 'info_offset'." 327 328 self.f.seek(info_offset) 329 self.last_term = term 330 self.last_offset = offset 331 332 class TermIndexWriter(TermWriter): 333 334 "Writing term dictionary index details to files." 335 336 def reset(self): 337 TermWriter.reset(self) 338 self.last_info_offset = 0 339 340 def write_term(self, term, offset, info_offset): 341 342 """ 343 Write the given 'term' and its position file 'offset' to the term 344 dictionary index file, along with the 'info_offset' in the term 345 information file. 346 """ 347 348 TermWriter.write_term(self, term, offset) 349 350 # Write the information file offset delta. 351 352 self.write_number(info_offset - self.last_info_offset) 353 self.last_info_offset = info_offset 354 355 class TermIndexReader(TermReader): 356 357 "Reading term dictionary index details from files." 358 359 def reset(self): 360 TermReader.reset(self) 361 self.last_info_offset = 0 362 363 def read_term(self): 364 365 """ 366 Read a term, its position file offset, and its term information file 367 offset from the term dictionary index file. 368 """ 369 370 term, offset = TermReader.read_term(self) 371 372 # Read the offset delta. 373 374 self.last_info_offset += self.read_number() 375 376 return term, offset, self.last_info_offset 377 378 class TermDictionaryWriter: 379 380 "Writing term dictionaries." 381 382 def __init__(self, info_writer, index_writer, interval): 383 self.info_writer = info_writer 384 self.index_writer = index_writer 385 self.interval = interval 386 self.entry = 0 387 388 def write_term(self, term, offset): 389 390 """ 391 Write the given 'term' and its position file 'offset' to the term 392 information file and optionally to the index, making a dictionary entry. 393 """ 394 395 info_offset = self.info_writer.write_term(term, offset) 396 397 if self.entry % self.interval == 0: 398 self.index_writer.write_term(term, offset, info_offset) 399 400 self.entry += 1 401 402 def close(self): 403 self.info_writer.close() 404 self.index_writer.close() 405 406 class TermDictionaryReader: 407 408 "Reading term dictionaries." 409 410 def __init__(self, info_reader, index_reader): 411 self.info_reader = info_reader 412 self.index_reader = index_reader 413 414 self.terms = [] 415 try: 416 while 1: 417 self.terms.append(self.index_reader.read_term()) 418 except EOFError: 419 pass 420 421 # Large numbers for ordering purposes. 422 423 self.max_offset = self.terms[-1][1] 424 self.max_info_offset = self.terms[-1][2] 425 426 def find(self, term): 427 428 "Find the position file offset of 'term' from the term dictionary." 429 430 i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1 431 432 # Get the entry position providing the term or one preceding it. 433 434 if i == -1: 435 return None 436 437 found_term, offset, info_offset = self.terms[i] 438 439 # Where the term is found immediately, return the offset. 440 441 if term == found_term: 442 return offset 443 444 # Otherwise, seek past the index term's entry in the information file 445 # and scan for the desired term. 446 447 else: 448 self.info_reader.go_to_term(found_term, offset, info_offset) 449 try: 450 while term > found_term: 451 found_term, offset = self.info_reader.read_term() 452 except EOFError: 453 pass 454 455 # If the term is found, return the offset. 456 457 if term == found_term: 458 return offset 459 else: 460 return None 461 462 def close(self): 463 self.info_reader.close() 464 self.index_reader.close() 465 466 # vim: tabstop=4 expandtab shiftwidth=4