iixr (file iixr.py at 42cc066da2fd)

     1 #!/usr/bin/env python     2      3 """     4 A simple (and sane) text indexing library.     5      6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from os import mkdir             # to determine whether to create indexes    22 from os.path import exists, join    23 from os.path import commonprefix # to find common string prefixes    24 from bisect import bisect_right  # to find terms in the dictionary index    25     26 # Constants.    27     28 INTERVAL = 100    29     30 # Foundation classes.    31     32 class File:    33     34     "A basic file abstraction."    35     36     def __init__(self, f):    37         self.f = f    38         self.reset()    39     40     def reset(self):    41         pass    42     43     def close(self):    44         if self.f is not None:    45             self.f.close()    46             self.f = None    47     48 class FileWriter(File):    49     50     "Writing basic data types to files."    51     52     def write_number(self, number):    53     54         "Write 'number' to the file using a variable length encoding."    55     56         # Negative numbers are not supported.    57     58         if number < 0:    59             raise ValueError, "Number %r is negative." % number    60     61         # Special case: one byte containing zero.    62     63         elif number == 0:    64             self.f.write(chr(0))    65             return    66     67         # Write the number from least to most significant digits.    68     69         bytes = []    70     71         while number != 0:    72             lsd = number & 127    73             number = number >> 7    74             if number != 0:    75                 lsd |= 128    76             bytes.append(chr(lsd))    77     78         record = "".join(bytes)    79         self.f.write(record)    80     81     def write_string(self, s):    82     83         "Write 's' to the file, recording its length."    84     85         # Convert Unicode objects to strings.    86     87         if isinstance(s, unicode):    88             s = s.encode("utf-8")    89     90         length = len(s)    91     92         if not (0 <= length <= 255):    93             raise ValueError, "String %r is too long." % s    94     95         self.write_number(length)    96         self.f.write(s)    97     98 class FileReader(File):    99    100     "Reading basic data types from files."   101    102     def read_number(self):   103    104         "Read a number from the file."   105    106         # Read each byte, adding it to the number.   107    108         shift = 0   109         number = 0   110         more = 1   111    112         while more:   113             byte = self.f.read(1)   114             if not byte:   115                 raise EOFError   116    117             csd = ord(byte)   118             more = csd & 128 != 0   119             if more:   120                 csd &= 127   121             number += (csd << shift)   122             shift += 7   123    124         return number   125    126     def read_string(self):   127    128         "Read a string from the file."   129    130         length = self.read_number()   131    132         # Convert strings to Unicode objects.   133    134         return unicode(self.f.read(length), "utf-8")   135    136 # Specific classes.   137    138 class PositionWriter(FileWriter):   139    140     "Writing position information to files."   141    142     def reset(self):   143         self.last_docnum = 0   144    145     def write_positions(self, docnum, positions):   146    147         "Write for the document 'docnum' the given 'positions'."   148    149         if docnum < self.last_docnum:   150             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)   151    152         # Write the document number delta.   153    154         self.write_number(docnum - self.last_docnum)   155    156         # Write the number of positions.   157    158         self.write_number(len(positions))   159    160         # Make sure that the positions are sorted.   161    162         positions.sort()   163    164         # Write the position deltas.   165    166         last = 0   167         for position in positions:   168             pos = position - last   169             self.write_number(pos)   170             last = position   171    172         self.last_docnum = docnum   173    174     def write_all_positions(self, doc_positions):   175    176         """   177         Write all 'doc_positions' - a collection of tuples of the form (document   178         number, position list) - to the file, returning the offset at which they   179         were stored.   180         """   181    182         # Reset the writer and record the current file offset.   183    184         self.reset()   185         offset = self.f.tell()   186    187         # Write the number of documents.   188    189         self.write_number(len(doc_positions))   190    191         # Write the positions.   192    193         for docnum, positions in doc_positions:   194             self.write_positions(docnum, positions)   195    196         return offset   197    198 class PositionReader(FileReader):   199    200     "Reading position information from files."   201    202     def reset(self):   203         self.last_docnum = 0   204    205     def read_positions(self):   206    207         "Read positions, returning a document number and a list of positions."   208    209         # Read the document number delta and add it to the last number.   210    211         self.last_docnum += self.read_number()   212    213         # Read the number of positions.   214    215         npositions = self.read_number()   216    217         # Read the position deltas, adding each previous position to get the   218         # appropriate collection of absolute positions.   219    220         i = 0   221         last = 0   222         positions = []   223    224         while i < npositions:   225             last += self.read_number()   226             positions.append(last)   227             i += 1   228    229         return self.last_docnum, positions   230    231     def read_all_positions(self, offset):   232    233         """   234         Read all positions from 'offset', seeking to that position in the file   235         before reading.   236         """   237    238         self.reset()   239         self.f.seek(offset)   240    241         # Read the number of documents.   242    243         ndocuments = self.read_number()   244    245         # Read all records.   246    247         i = 0   248         doc_positions = []   249    250         while i < ndocuments:   251             doc_positions.append(self.read_positions())   252             i += 1   253    254         return doc_positions   255    256 class TermWriter(FileWriter):   257    258     "Writing term information to files."   259    260     def reset(self):   261         self.last_term = ""   262         self.last_offset = 0   263    264     def write_term(self, term, offset):   265    266         """   267         Write the given 'term' and its position file 'offset' to the term   268         information file. Return the offset after the term information was   269         written to the file.   270         """   271    272         # Too long terms are not currently supported.   273    274         if len(term) > 255:   275             raise ValueError, "Term %r is too long." % term   276    277         # Write the prefix length and term suffix.   278    279         common = len(commonprefix([self.last_term, term]))   280         suffix = term[common:]   281    282         self.write_number(common)   283         self.write_string(suffix)   284    285         # Write the offset delta.   286    287         self.write_number(offset - self.last_offset)   288    289         self.last_term = term   290         self.last_offset = offset   291    292         return self.f.tell()   293    294 class TermReader(FileReader):   295    296     "Reading term information from files."   297    298     def reset(self):   299         self.last_term = ""   300         self.last_offset = 0   301    302     def read_term(self):   303    304         """   305         Read a term and its position file offset from the term information file.   306         """   307    308         # Read the prefix length and term suffix.   309    310         common = self.read_number()   311         suffix = self.read_string()   312    313         self.last_term = self.last_term[:common] + suffix   314    315         # Read the offset delta.   316    317         self.last_offset += self.read_number()   318    319         return self.last_term, self.last_offset   320    321     def go_to_term(self, term, offset, info_offset):   322    323         "Seek past the entry for 'term' having 'offset' to 'info_offset'."   324    325         self.f.seek(info_offset)   326         self.last_term = term   327         self.last_offset = offset   328    329 class TermIndexWriter(TermWriter):   330    331     "Writing term dictionary index details to files."   332    333     def reset(self):   334         TermWriter.reset(self)   335         self.last_info_offset = 0   336    337     def write_term(self, term, offset, info_offset):   338    339         """   340         Write the given 'term' and its position file 'offset' to the term   341         dictionary index file, along with the 'info_offset' in the term   342         information file.   343         """   344    345         TermWriter.write_term(self, term, offset)   346    347         # Write the information file offset delta.   348    349         self.write_number(info_offset - self.last_info_offset)   350         self.last_info_offset = info_offset   351    352 class TermIndexReader(TermReader):   353    354     "Reading term dictionary index details from files."   355    356     def reset(self):   357         TermReader.reset(self)   358         self.last_info_offset = 0   359    360     def read_term(self):   361    362         """   363         Read a term, its position file offset, and its term information file   364         offset from the term dictionary index file.   365         """   366    367         term, offset = TermReader.read_term(self)   368    369         # Read the offset delta.   370    371         self.last_info_offset += self.read_number()   372    373         return term, offset, self.last_info_offset   374    375 class TermDictionaryWriter:   376    377     "Writing term dictionaries."   378    379     def __init__(self, info_writer, index_writer, position_writer, interval):   380         self.info_writer = info_writer   381         self.index_writer = index_writer   382         self.position_writer = position_writer   383         self.interval = interval   384         self.entry = 0   385    386     def write_term(self, term, offset):   387    388         """   389         Write the given 'term' and its position file 'offset' to the term   390         information file and optionally to the index, making a dictionary entry.   391         """   392    393         info_offset = self.info_writer.write_term(term, offset)   394    395         if self.entry % self.interval == 0:   396             self.index_writer.write_term(term, offset, info_offset)   397    398         self.entry += 1   399    400     def write_term_positions(self, term, doc_positions):   401    402         """   403         Write the given 'term' and the 'doc_positions' recording the documents   404         and positions at which the term is found.   405         """   406    407         offset = self.position_writer.write_all_positions(doc_positions)   408         self.write_term(term, offset)   409    410     def close(self):   411         self.info_writer.close()   412         self.index_writer.close()   413         self.position_writer.close()   414    415 class TermDictionaryReader:   416    417     "Reading term dictionaries."   418    419     def __init__(self, info_reader, index_reader, position_reader):   420         self.info_reader = info_reader   421         self.index_reader = index_reader   422         self.position_reader = position_reader   423    424         self.terms = []   425         try:   426             while 1:   427                 self.terms.append(self.index_reader.read_term())   428         except EOFError:   429             pass   430    431         # Large numbers for ordering purposes.   432    433         self.max_offset = self.terms[-1][1]   434         self.max_info_offset = self.terms[-1][2]   435    436     def find_term(self, term):   437    438         "Find the position file offset of 'term' from the term dictionary."   439    440         i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1   441    442         # Get the entry position providing the term or one preceding it.   443    444         if i == -1:   445             return None   446    447         found_term, offset, info_offset = self.terms[i]   448    449         # Where the term is found immediately, return the offset.   450    451         if term == found_term:   452             return offset   453    454         # Otherwise, seek past the index term's entry in the information file   455         # and scan for the desired term.   456    457         else:   458             self.info_reader.go_to_term(found_term, offset, info_offset)   459             try:   460                 while term > found_term:   461                     found_term, offset = self.info_reader.read_term()   462             except EOFError:   463                 pass   464    465             # If the term is found, return the offset.   466    467             if term == found_term:   468                 return offset   469             else:   470                 return None   471    472     def find_positions(self, term):   473    474         "Return the documents and positions at which the given 'term' is found."   475    476         offset = self.find_term(term)   477         if offset is None:   478             return None   479         else:   480             return self.position_reader.read_all_positions(offset)   481    482     def close(self):   483         self.info_reader.close()   484         self.index_reader.close()   485         self.position_reader.close()   486    487 class IndexWriter:   488    489     "Building term information and writing it to the term dictionary."   490    491     def __init__(self, dict_writer):   492         self.dict_writer = dict_writer   493         self.terms = {}   494    495     def add_position(self, term, docnum, position):   496    497         """   498         Add a position entry for the given 'term' in the document with the given   499         'docnum', indicating the given 'position'.   500         """   501    502         if not self.terms.has_key(term):   503             doc_positions = self.terms[term] = {}   504         else:   505             doc_positions = self.terms[term]   506    507         if not doc_positions.has_key(docnum):   508             doc = doc_positions[docnum] = []   509         else:   510             doc = doc_positions[docnum]   511    512         doc.append(position)   513    514     def close(self):   515         if self.dict_writer is None:   516             return   517    518         # Get the terms in order.   519    520         terms = self.terms.items()   521         terms.sort()   522    523         for term, doc_positions in terms:   524             doc_positions = doc_positions.items()   525             doc_positions.sort()   526             self.dict_writer.write_term_positions(term, doc_positions)   527    528         self.dict_writer.close()   529         self.dict_writer = None   530    531 class Index:   532    533     "An inverted index solution encapsulating the various components."   534    535     def __init__(self, pathname):   536         self.pathname = pathname   537         self.reader = None   538         self.writer = None   539    540     def get_writer(self, interval=INTERVAL):   541    542         "Return a writer, optionally using the given indexing 'interval'."   543    544         if not exists(self.pathname):   545             mkdir(self.pathname)   546    547         tdf = open(join(self.pathname, "terms"), "wb")   548         info_writer = TermWriter(tdf)   549    550         tdif = open(join(self.pathname, "index"), "wb")   551         index_writer = TermIndexWriter(tdif)   552    553         tpf = open(join(self.pathname, "positions"), "wb")   554         positions_writer = PositionWriter(tpf)   555    556         dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)   557    558         self.writer = IndexWriter(dict_writer)   559         return self.writer   560    561     def get_reader(self):   562    563         "Return a reader for the index."   564    565         if not exists(self.pathname):   566             raise OSError, "Index path %r does not exist." % self.pathname   567    568         tdf = open(join(self.pathname, "terms"), "rb")   569         info_reader = TermReader(tdf)   570    571         tdif = open(join(self.pathname, "index"), "rb")   572         index_reader = TermIndexReader(tdif)   573    574         tpf = open(join(self.pathname, "positions"), "rb")   575         positions_reader = PositionReader(tpf)   576    577         self.reader = TermDictionaryReader(info_reader, index_reader, positions_reader)   578         return self.reader   579    580     def close(self):   581         if self.reader is not None:   582             self.reader.close()   583             self.reader = None   584         if self.writer is not None:   585             self.writer.close()   586             self.writer = None   587    588 # vim: tabstop=4 expandtab shiftwidth=4