iixr (file iixr.py at 5ef1ed194593)

     1 #!/usr/bin/env python     2      3 """     4 A simple (and sane) text indexing library.     5      6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from os.path import commonprefix # to find common string prefixes    22 from bisect import bisect_right  # to find terms in the dictionary index    23     24 # Foundation classes.    25     26 class File:    27     28     "A basic file abstraction."    29     30     def __init__(self, f):    31         self.f = f    32         self.reset()    33     34     def reset(self):    35         pass    36     37     def close(self):    38         self.f.close()    39     40 class FileWriter(File):    41     42     "Writing basic data types to files."    43     44     def write_number(self, number):    45     46         "Write 'number' to the file using a variable length encoding."    47     48         # Negative numbers are not supported.    49     50         if number < 0:    51             raise ValueError, "Number %r is negative." % number    52     53         # Special case: one byte containing zero.    54     55         elif number == 0:    56             self.f.write(chr(1) + chr(0))    57             return    58     59         # Write the number from least to most significant digits.    60     61         nbytes = 0    62         bytes = []    63     64         while number != 0:    65             lsd = number & 255    66             bytes.append(chr(lsd))    67             number = number >> 8    68             nbytes += 1    69     70         # Too large numbers are not supported.    71     72         if nbytes > 255:    73             raise ValueError, "Number %r is too large." % number    74     75         bytes.insert(0, chr(nbytes))    76         record = "".join(bytes)    77         self.f.write(record)    78     79     def write_unsigned_byte(self, number):    80     81         "Write 'number' to the file using a single byte."    82     83         if not (0 <= number <= 255):    84             raise ValueError, "Number %r is out of range." % number    85     86         self.f.write(chr(number))    87     88     def write_string(self, s):    89     90         "Write 's' to the file, recording its length."    91     92         length = len(s)    93     94         if not (0 <= length <= 255):    95             raise ValueError, "String %r is too long." % s    96     97         self.write_unsigned_byte(length)    98         self.f.write(s)    99    100 class FileReader(File):   101    102     "Reading basic data types from files."   103    104     def read_number(self):   105    106         "Read a number from the file."   107    108         nbytes = self.read_unsigned_byte()   109    110         # Read each byte, adding it to the number.   111    112         bytes = self.f.read(nbytes)   113    114         i = 0   115         shift = 0   116         number = 0   117    118         while i < nbytes:   119             csd = ord(bytes[i])   120             number += (csd << shift)   121             shift += 8   122             i += 1   123    124         return number   125    126     def read_unsigned_byte(self):   127    128         "Read a number from the file, consuming a single byte."   129    130         s = self.f.read(1)   131         if not s:   132             raise EOFError   133    134         return ord(s)   135    136     def read_string(self):   137    138         "Read a string from the file."   139    140         length = self.read_unsigned_byte()   141         return self.f.read(length)   142    143 # Specific classes.   144    145 class PositionWriter(FileWriter):   146    147     "Writing position information to files."   148    149     def reset(self):   150         self.last_docnum = 0   151    152     def write_positions(self, docnum, positions):   153    154         "Write for the document 'docnum' the given 'positions'."   155    156         if docnum < self.last_docnum:   157             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)   158    159         # Write the document number delta.   160    161         self.write_number(docnum - self.last_docnum)   162    163         # Write the number of positions.   164    165         self.write_number(len(positions))   166    167         # Write the position deltas.   168    169         last = 0   170         for position in positions:   171             pos = position - last   172             self.write_number(pos)   173             last = position   174    175         self.last_docnum = docnum   176    177     def write_all_positions(self, doc_positions):   178    179         """   180         Write all 'doc_positions' - a collection of tuples of the form (document   181         number, position list) - to the file, returning the offset at which they   182         were stored.   183         """   184    185         # Reset the writer and record the current file offset.   186    187         self.reset()   188         offset = self.f.tell()   189    190         # Write the number of documents.   191    192         self.write_number(len(doc_positions))   193    194         # Write the positions.   195    196         for docnum, positions in doc_positions:   197             self.write_positions(docnum, positions)   198    199         return offset   200    201 class PositionReader(FileReader):   202    203     "Reading position information from files."   204    205     def reset(self):   206         self.last_docnum = 0   207    208     def read_positions(self):   209    210         "Read positions, returning a document number and a list of positions."   211    212         # Read the document number delta and add it to the last number.   213    214         self.last_docnum += self.read_number()   215    216         # Read the number of positions.   217    218         npositions = self.read_number()   219    220         # Read the position deltas, adding each previous position to get the   221         # appropriate collection of absolute positions.   222    223         i = 0   224         last = 0   225         positions = []   226    227         while i < npositions:   228             last += self.read_number()   229             positions.append(last)   230             i += 1   231    232         return self.last_docnum, positions   233    234     def read_all_positions(self, offset):   235    236         """   237         Read all positions from 'offset', seeking to that position in the file   238         before reading.   239         """   240    241         self.reset()   242         self.f.seek(offset)   243    244         # Read the number of documents.   245    246         ndocuments = self.read_number()   247    248         # Read all records.   249    250         i = 0   251         doc_positions = []   252    253         while i < ndocuments:   254             doc_positions.append(self.read_positions())   255             i += 1   256    257         return doc_positions   258    259 class TermWriter(FileWriter):   260    261     "Writing term information to files."   262    263     def reset(self):   264         self.last_term = ""   265         self.last_offset = 0   266    267     def write_term(self, term, offset):   268    269         """   270         Write the given 'term' and its position file 'offset' to the term   271         information file. Return the offset after the term information was   272         written to the file.   273         """   274    275         # Too long terms are not currently supported.   276    277         if len(term) > 255:   278             raise ValueError, "Term %r is too long." % term   279    280         # Write the prefix length and term suffix.   281    282         common = len(commonprefix([self.last_term, term]))   283         suffix = term[common:]   284    285         self.write_unsigned_byte(common)   286         self.write_string(suffix)   287    288         # Write the offset delta.   289    290         self.write_number(offset - self.last_offset)   291    292         self.last_term = term   293         self.last_offset = offset   294    295         return self.f.tell()   296    297 class TermReader(FileReader):   298    299     "Reading term information from files."   300    301     def reset(self):   302         self.last_term = ""   303         self.last_offset = 0   304    305     def read_term(self):   306    307         """   308         Read a term and its position file offset from the term information file.   309         """   310    311         # Read the prefix length and term suffix.   312    313         common = self.read_unsigned_byte()   314         suffix = self.read_string()   315    316         self.last_term = self.last_term[:common] + suffix   317    318         # Read the offset delta.   319    320         self.last_offset += self.read_number()   321    322         return self.last_term, self.last_offset   323    324     def go_to_term(self, term, offset, info_offset):   325    326         "Seek past the entry for 'term' having 'offset' to 'info_offset'."   327    328         self.f.seek(info_offset)   329         self.last_term = term   330         self.last_offset = offset   331    332 class TermIndexWriter(TermWriter):   333    334     "Writing term dictionary index details to files."   335    336     def reset(self):   337         TermWriter.reset(self)   338         self.last_info_offset = 0   339    340     def write_term(self, term, offset, info_offset):   341    342         """   343         Write the given 'term' and its position file 'offset' to the term   344         dictionary index file, along with the 'info_offset' in the term   345         information file.   346         """   347    348         TermWriter.write_term(self, term, offset)   349    350         # Write the information file offset delta.   351    352         self.write_number(info_offset - self.last_info_offset)   353         self.last_info_offset = info_offset   354    355 class TermIndexReader(TermReader):   356    357     "Reading term dictionary index details from files."   358    359     def reset(self):   360         TermReader.reset(self)   361         self.last_info_offset = 0   362    363     def read_term(self):   364    365         """   366         Read a term, its position file offset, and its term information file   367         offset from the term dictionary index file.   368         """   369    370         term, offset = TermReader.read_term(self)   371    372         # Read the offset delta.   373    374         self.last_info_offset += self.read_number()   375    376         return term, offset, self.last_info_offset   377    378 class TermDictionaryWriter:   379    380     "Writing term dictionaries."   381    382     def __init__(self, info_writer, index_writer, interval):   383         self.info_writer = info_writer   384         self.index_writer = index_writer   385         self.interval = interval   386         self.entry = 0   387    388     def write_term(self, term, offset):   389    390         """   391         Write the given 'term' and its position file 'offset' to the term   392         information file and optionally to the index, making a dictionary entry.   393         """   394    395         info_offset = self.info_writer.write_term(term, offset)   396    397         if self.entry % self.interval == 0:   398             self.index_writer.write_term(term, offset, info_offset)   399    400         self.entry += 1   401    402     def close(self):   403         self.info_writer.close()   404         self.index_writer.close()   405    406 class TermDictionaryReader:   407    408     "Reading term dictionaries."   409    410     def __init__(self, info_reader, index_reader):   411         self.info_reader = info_reader   412         self.index_reader = index_reader   413    414         self.terms = []   415         try:   416             while 1:   417                 self.terms.append(self.index_reader.read_term())   418         except EOFError:   419             pass   420    421         # Large numbers for ordering purposes.   422    423         self.max_offset = self.terms[-1][1]   424         self.max_info_offset = self.terms[-1][2]   425    426     def find(self, term):   427    428         "Find the position file offset of 'term' from the term dictionary."   429    430         i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1   431    432         # Get the entry position providing the term or one preceding it.   433    434         if i == -1:   435             return None   436    437         found_term, offset, info_offset = self.terms[i]   438    439         # Where the term is found immediately, return the offset.   440    441         if term == found_term:   442             return offset   443    444         # Otherwise, seek past the index term's entry in the information file   445         # and scan for the desired term.   446    447         else:   448             self.info_reader.go_to_term(found_term, offset, info_offset)   449             try:   450                 while term > found_term:   451                     found_term, offset = self.info_reader.read_term()   452             except EOFError:   453                 pass   454    455             # If the term is found, return the offset.   456    457             if term == found_term:   458                 return offset   459             else:   460                 return None   461    462     def close(self):   463         self.info_reader.close()   464         self.index_reader.close()   465    466 # vim: tabstop=4 expandtab shiftwidth=4