iixr (file iixr.py at 736ff8a2e022)

     1 #!/usr/bin/env python     2      3 """     4 A simple (and sane) text indexing library.     5      6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from os.path import commonprefix # to find common string prefixes    22 from bisect import bisect_right  # to find terms in the dictionary index    23     24 # Foundation classes.    25     26 class File:    27     28     "A basic file abstraction."    29     30     def __init__(self, f):    31         self.f = f    32         self.reset()    33     34     def reset(self):    35         pass    36     37     def close(self):    38         self.f.close()    39     40 class FileWriter(File):    41     42     "Writing basic data types to files."    43     44     def write_number(self, number):    45     46         "Write 'number' to the file using a variable length encoding."    47     48         # Negative numbers are not supported.    49     50         if number < 0:    51             raise ValueError, "Number %r is negative." % number    52     53         # Special case: one byte containing zero.    54     55         elif number == 0:    56             self.f.write(chr(0))    57             return    58     59         # Write the number from least to most significant digits.    60     61         bytes = []    62     63         while number != 0:    64             lsd = number & 127    65             number = number >> 7    66             if number != 0:    67                 lsd |= 128    68             bytes.append(chr(lsd))    69     70         record = "".join(bytes)    71         self.f.write(record)    72     73     def write_string(self, s):    74     75         "Write 's' to the file, recording its length."    76     77         length = len(s)    78     79         if not (0 <= length <= 255):    80             raise ValueError, "String %r is too long." % s    81     82         self.write_number(length)    83         self.f.write(s)    84     85 class FileReader(File):    86     87     "Reading basic data types from files."    88     89     def read_number(self):    90     91         "Read a number from the file."    92     93         # Read each byte, adding it to the number.    94     95         shift = 0    96         number = 0    97         more = 1    98     99         while more:   100             byte = self.f.read(1)   101             if not byte:   102                 raise EOFError   103    104             csd = ord(byte)   105             more = csd & 128 != 0   106             if more:   107                 csd &= 127   108             number += (csd << shift)   109             shift += 7   110    111         return number   112    113     def read_string(self):   114    115         "Read a string from the file."   116    117         length = self.read_number()   118         return self.f.read(length)   119    120 # Specific classes.   121    122 class PositionWriter(FileWriter):   123    124     "Writing position information to files."   125    126     def reset(self):   127         self.last_docnum = 0   128    129     def write_positions(self, docnum, positions):   130    131         "Write for the document 'docnum' the given 'positions'."   132    133         if docnum < self.last_docnum:   134             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)   135    136         # Write the document number delta.   137    138         self.write_number(docnum - self.last_docnum)   139    140         # Write the number of positions.   141    142         self.write_number(len(positions))   143    144         # Write the position deltas.   145    146         last = 0   147         for position in positions:   148             pos = position - last   149             self.write_number(pos)   150             last = position   151    152         self.last_docnum = docnum   153    154     def write_all_positions(self, doc_positions):   155    156         """   157         Write all 'doc_positions' - a collection of tuples of the form (document   158         number, position list) - to the file, returning the offset at which they   159         were stored.   160         """   161    162         # Reset the writer and record the current file offset.   163    164         self.reset()   165         offset = self.f.tell()   166    167         # Write the number of documents.   168    169         self.write_number(len(doc_positions))   170    171         # Write the positions.   172    173         for docnum, positions in doc_positions:   174             self.write_positions(docnum, positions)   175    176         return offset   177    178 class PositionReader(FileReader):   179    180     "Reading position information from files."   181    182     def reset(self):   183         self.last_docnum = 0   184    185     def read_positions(self):   186    187         "Read positions, returning a document number and a list of positions."   188    189         # Read the document number delta and add it to the last number.   190    191         self.last_docnum += self.read_number()   192    193         # Read the number of positions.   194    195         npositions = self.read_number()   196    197         # Read the position deltas, adding each previous position to get the   198         # appropriate collection of absolute positions.   199    200         i = 0   201         last = 0   202         positions = []   203    204         while i < npositions:   205             last += self.read_number()   206             positions.append(last)   207             i += 1   208    209         return self.last_docnum, positions   210    211     def read_all_positions(self, offset):   212    213         """   214         Read all positions from 'offset', seeking to that position in the file   215         before reading.   216         """   217    218         self.reset()   219         self.f.seek(offset)   220    221         # Read the number of documents.   222    223         ndocuments = self.read_number()   224    225         # Read all records.   226    227         i = 0   228         doc_positions = []   229    230         while i < ndocuments:   231             doc_positions.append(self.read_positions())   232             i += 1   233    234         return doc_positions   235    236 class TermWriter(FileWriter):   237    238     "Writing term information to files."   239    240     def reset(self):   241         self.last_term = ""   242         self.last_offset = 0   243    244     def write_term(self, term, offset):   245    246         """   247         Write the given 'term' and its position file 'offset' to the term   248         information file. Return the offset after the term information was   249         written to the file.   250         """   251    252         # Too long terms are not currently supported.   253    254         if len(term) > 255:   255             raise ValueError, "Term %r is too long." % term   256    257         # Write the prefix length and term suffix.   258    259         common = len(commonprefix([self.last_term, term]))   260         suffix = term[common:]   261    262         self.write_number(common)   263         self.write_string(suffix)   264    265         # Write the offset delta.   266    267         self.write_number(offset - self.last_offset)   268    269         self.last_term = term   270         self.last_offset = offset   271    272         return self.f.tell()   273    274 class TermReader(FileReader):   275    276     "Reading term information from files."   277    278     def reset(self):   279         self.last_term = ""   280         self.last_offset = 0   281    282     def read_term(self):   283    284         """   285         Read a term and its position file offset from the term information file.   286         """   287    288         # Read the prefix length and term suffix.   289    290         common = self.read_number()   291         suffix = self.read_string()   292    293         self.last_term = self.last_term[:common] + suffix   294    295         # Read the offset delta.   296    297         self.last_offset += self.read_number()   298    299         return self.last_term, self.last_offset   300    301     def go_to_term(self, term, offset, info_offset):   302    303         "Seek past the entry for 'term' having 'offset' to 'info_offset'."   304    305         self.f.seek(info_offset)   306         self.last_term = term   307         self.last_offset = offset   308    309 class TermIndexWriter(TermWriter):   310    311     "Writing term dictionary index details to files."   312    313     def reset(self):   314         TermWriter.reset(self)   315         self.last_info_offset = 0   316    317     def write_term(self, term, offset, info_offset):   318    319         """   320         Write the given 'term' and its position file 'offset' to the term   321         dictionary index file, along with the 'info_offset' in the term   322         information file.   323         """   324    325         TermWriter.write_term(self, term, offset)   326    327         # Write the information file offset delta.   328    329         self.write_number(info_offset - self.last_info_offset)   330         self.last_info_offset = info_offset   331    332 class TermIndexReader(TermReader):   333    334     "Reading term dictionary index details from files."   335    336     def reset(self):   337         TermReader.reset(self)   338         self.last_info_offset = 0   339    340     def read_term(self):   341    342         """   343         Read a term, its position file offset, and its term information file   344         offset from the term dictionary index file.   345         """   346    347         term, offset = TermReader.read_term(self)   348    349         # Read the offset delta.   350    351         self.last_info_offset += self.read_number()   352    353         return term, offset, self.last_info_offset   354    355 class TermDictionaryWriter:   356    357     "Writing term dictionaries."   358    359     def __init__(self, info_writer, index_writer, position_writer, interval):   360         self.info_writer = info_writer   361         self.index_writer = index_writer   362         self.position_writer = position_writer   363         self.interval = interval   364         self.entry = 0   365    366     def write_term(self, term, offset):   367    368         """   369         Write the given 'term' and its position file 'offset' to the term   370         information file and optionally to the index, making a dictionary entry.   371         """   372    373         info_offset = self.info_writer.write_term(term, offset)   374    375         if self.entry % self.interval == 0:   376             self.index_writer.write_term(term, offset, info_offset)   377    378         self.entry += 1   379    380     def write_term_positions(self, term, doc_positions):   381    382         """   383         Write the given 'term' and the 'doc_positions' recording the documents   384         and positions at which the term is found.   385         """   386    387         offset = self.position_writer.write_all_positions(doc_positions)   388         self.write_term(term, offset)   389    390     def close(self):   391         self.info_writer.close()   392         self.index_writer.close()   393         self.position_writer.close()   394    395 class TermDictionaryReader:   396    397     "Reading term dictionaries."   398    399     def __init__(self, info_reader, index_reader, position_reader):   400         self.info_reader = info_reader   401         self.index_reader = index_reader   402         self.position_reader = position_reader   403    404         self.terms = []   405         try:   406             while 1:   407                 self.terms.append(self.index_reader.read_term())   408         except EOFError:   409             pass   410    411         # Large numbers for ordering purposes.   412    413         self.max_offset = self.terms[-1][1]   414         self.max_info_offset = self.terms[-1][2]   415    416     def find_term(self, term):   417    418         "Find the position file offset of 'term' from the term dictionary."   419    420         i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1   421    422         # Get the entry position providing the term or one preceding it.   423    424         if i == -1:   425             return None   426    427         found_term, offset, info_offset = self.terms[i]   428    429         # Where the term is found immediately, return the offset.   430    431         if term == found_term:   432             return offset   433    434         # Otherwise, seek past the index term's entry in the information file   435         # and scan for the desired term.   436    437         else:   438             self.info_reader.go_to_term(found_term, offset, info_offset)   439             try:   440                 while term > found_term:   441                     found_term, offset = self.info_reader.read_term()   442             except EOFError:   443                 pass   444    445             # If the term is found, return the offset.   446    447             if term == found_term:   448                 return offset   449             else:   450                 return None   451    452     def find_positions(self, term):   453    454         "Return the documents and positions at which the given 'term' is found."   455    456         offset = self.find_term(term)   457         if offset is None:   458             return None   459         else:   460             return self.position_reader.read_all_positions(offset)   461    462     def close(self):   463         self.info_reader.close()   464         self.index_reader.close()   465         self.position_reader.close()   466    467 class IndexWriter:   468    469     "Building term information and writing it to the term dictionary."   470    471     def __init__(self, dict_writer):   472         self.dict_writer = dict_writer   473         self.terms = {}   474    475     def add_position(self, term, docnum, position):   476    477         """   478         Add a position entry for the given 'term' in the document with the given   479         'docnum', indicating the given 'position'.   480         """   481    482         if not self.terms.has_key(term):   483             doc_positions = self.terms[term] = {}   484         else:   485             doc_positions = self.terms[term]   486    487         if not doc_positions.has_key(docnum):   488             doc = doc_positions[docnum] = []   489         else:   490             doc = doc_positions[docnum]   491    492         doc.append(position)   493    494     def close(self):   495    496         # Get the terms in order.   497    498         terms = self.terms.items()   499         terms.sort()   500    501         for term, doc_positions in terms:   502             doc_positions = doc_positions.items()   503             doc_positions.sort()   504             self.dict_writer.write_term_positions(term, doc_positions)   505    506         self.dict_writer.close()   507    508 # vim: tabstop=4 expandtab shiftwidth=4