iixr (file iixr.py at 3d86f5cb01c1)

     1 #!/usr/bin/env python     2      3 """     4 A simple (and sane) text indexing library.     5      6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from os import mkdir             # to determine whether to create indexes    22 from os.path import exists, join    23 from os.path import commonprefix # to find common string prefixes    24 from bisect import bisect_right  # to find terms in the dictionary index    25 import bz2, zlib                 # for field compression    26     27 # Constants.    28     29 INTERVAL = 100    30     31 compressors = [("b", bz2.compress), ("z", zlib.compress)]    32 decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}    33     34 # Foundation classes.    35     36 class File:    37     38     "A basic file abstraction."    39     40     def __init__(self, f):    41         self.f = f    42         self.reset()    43     44     def reset(self):    45         pass    46     47     def close(self):    48         if self.f is not None:    49             self.f.close()    50             self.f = None    51     52 class FileWriter(File):    53     54     "Writing basic data types to files."    55     56     def write_number(self, number):    57     58         "Write 'number' to the file using a variable length encoding."    59     60         # Negative numbers are not supported.    61     62         if number < 0:    63             raise ValueError, "Number %r is negative." % number    64     65         # Special case: one byte containing zero.    66     67         elif number == 0:    68             self.f.write(chr(0))    69             return    70     71         # Write the number from least to most significant digits.    72     73         bytes = []    74     75         while number != 0:    76             lsd = number & 127    77             number = number >> 7    78             if number != 0:    79                 lsd |= 128    80             bytes.append(chr(lsd))    81     82         record = "".join(bytes)    83         self.f.write(record)    84     85     def write_string(self, s, compress=0):    86     87         """    88         Write 's' to the file, recording its length and compressing the string    89         if 'compress' is set to a true value.    90         """    91     92         # Convert Unicode objects to strings.    93     94         if isinstance(s, unicode):    95             s = s.encode("utf-8")    96     97         # Compress the string if requested.    98     99         if compress:   100             for flag, fn in compressors:   101                 cs = fn(s)   102    103                 # Take the first string shorter than the original.   104    105                 if len(cs) < len(s):   106                     s = cs   107                     break   108             else:   109                 flag = "-"   110    111             # Record whether compression was used.   112    113             self.f.write(flag)   114    115         # Write the length of the data before the data itself.   116    117         length = len(s)   118         self.write_number(length)   119         self.f.write(s)   120    121 class FileReader(File):   122    123     "Reading basic data types from files."   124    125     def read_number(self):   126    127         "Read a number from the file."   128    129         # Read each byte, adding it to the number.   130    131         shift = 0   132         number = 0   133         more = 1   134    135         while more:   136             byte = self.f.read(1)   137             if not byte:   138                 raise EOFError   139    140             csd = ord(byte)   141             more = csd & 128 != 0   142             if more:   143                 csd &= 127   144             number += (csd << shift)   145             shift += 7   146    147         return number   148    149     def read_string(self, decompress=0):   150    151         """   152         Read a string from the file, decompressing the stored data if   153         'decompress' is set to a true value.   154         """   155    156         # Decompress the data if requested.   157    158         if decompress:   159             flag = self.f.read(1)   160         else:   161             flag = "-"   162    163         length = self.read_number()   164         s = self.f.read(length)   165    166         # Perform decompression if applicable.   167    168         if flag != "-":   169             fn = decompressors[flag]   170             s = fn(s)   171    172         # Convert strings to Unicode objects.   173    174         return unicode(s, "utf-8")   175    176 # Specific classes for storing term and position information.   177    178 class PositionWriter(FileWriter):   179    180     "Writing position information to files."   181    182     def reset(self):   183         self.last_docnum = 0   184    185     def write_positions(self, docnum, positions):   186    187         "Write for the document 'docnum' the given 'positions'."   188    189         if docnum < self.last_docnum:   190             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)   191    192         # Write the document number delta.   193    194         self.write_number(docnum - self.last_docnum)   195    196         # Write the number of positions.   197    198         self.write_number(len(positions))   199    200         # Make sure that the positions are sorted.   201    202         positions.sort()   203    204         # Write the position deltas.   205    206         last = 0   207         for position in positions:   208             pos = position - last   209             self.write_number(pos)   210             last = position   211    212         self.last_docnum = docnum   213    214     def write_all_positions(self, doc_positions):   215    216         """   217         Write all 'doc_positions' - a collection of tuples of the form (document   218         number, position list) - to the file, returning a tuple containing the   219         offset at which they were stored together with the frequency (number of   220         positions) for the term involved.   221         """   222    223         # Reset the writer and record the current file offset.   224    225         self.reset()   226         offset = self.f.tell()   227    228         # Write the number of documents.   229    230         self.write_number(len(doc_positions))   231    232         # Write the positions.   233    234         frequency = 0   235    236         for docnum, positions in doc_positions:   237             self.write_positions(docnum, positions)   238             frequency += len(positions)   239    240         return offset, frequency   241    242 class PositionReader(FileReader):   243    244     "Reading position information from files."   245    246     def reset(self):   247         self.last_docnum = 0   248    249     def read_positions(self):   250    251         "Read positions, returning a document number and a list of positions."   252    253         # Read the document number delta and add it to the last number.   254    255         self.last_docnum += self.read_number()   256    257         # Read the number of positions.   258    259         npositions = self.read_number()   260    261         # Read the position deltas, adding each previous position to get the   262         # appropriate collection of absolute positions.   263    264         i = 0   265         last = 0   266         positions = []   267    268         while i < npositions:   269             last += self.read_number()   270             positions.append(last)   271             i += 1   272    273         return self.last_docnum, positions   274    275     def read_all_positions(self, offset):   276    277         """   278         Read all positions from 'offset', seeking to that position in the file   279         before reading.   280         """   281    282         self.reset()   283         self.f.seek(offset)   284    285         # Read the number of documents.   286    287         ndocuments = self.read_number()   288    289         # Read all records.   290    291         i = 0   292         doc_positions = []   293    294         while i < ndocuments:   295             doc_positions.append(self.read_positions())   296             i += 1   297    298         return doc_positions   299    300 class TermWriter(FileWriter):   301    302     "Writing term information to files."   303    304     def reset(self):   305         self.last_term = ""   306         self.last_offset = 0   307    308     def write_term(self, term, offset, frequency):   309    310         """   311         Write the given 'term', its position file 'offset', and its 'frequency'   312         to the term information file. Return the offset after the term   313         information was written to the file.   314         """   315    316         # Too long terms are not currently supported.   317    318         if len(term) > 255:   319             raise ValueError, "Term %r is too long." % term   320    321         # Write the prefix length and term suffix.   322    323         common = len(commonprefix([self.last_term, term]))   324         suffix = term[common:]   325    326         self.write_number(common)   327         self.write_string(suffix)   328    329         # Write the offset delta.   330    331         self.write_number(offset - self.last_offset)   332    333         # Write the frequency.   334    335         self.write_number(frequency)   336    337         self.last_term = term   338         self.last_offset = offset   339    340         return self.f.tell()   341    342 class TermReader(FileReader):   343    344     "Reading term information from files."   345    346     def reset(self):   347         self.last_term = ""   348         self.last_offset = 0   349    350     def read_term(self):   351    352         """   353         Read a term, its position file offset, and its frequency from the term   354         information file.   355         """   356    357         # Read the prefix length and term suffix.   358    359         common = self.read_number()   360         suffix = self.read_string()   361    362         self.last_term = self.last_term[:common] + suffix   363    364         # Read the offset delta.   365    366         self.last_offset += self.read_number()   367    368         # Read the frequency.   369    370         frequency = self.read_number()   371    372         return self.last_term, self.last_offset, frequency   373    374     def go_to_term(self, term, offset, info_offset):   375    376         """   377         Seek past the entry for 'term' having 'offset' to 'info_offset'. This   378         permits the scanning for later terms from the specified term.   379         """   380    381         self.f.seek(info_offset)   382         self.last_term = term   383         self.last_offset = offset   384    385 class TermIndexWriter(TermWriter):   386    387     "Writing term dictionary index details to files."   388    389     def reset(self):   390         TermWriter.reset(self)   391         self.last_info_offset = 0   392    393     def write_term(self, term, offset, frequency, info_offset):   394    395         """   396         Write the given 'term', its position file 'offset', and its 'frequency'   397         to the term dictionary index file, along with the 'info_offset' in the   398         term information file.   399         """   400    401         TermWriter.write_term(self, term, offset, frequency)   402    403         # Write the information file offset delta.   404    405         self.write_number(info_offset - self.last_info_offset)   406         self.last_info_offset = info_offset   407    408 class TermIndexReader(TermReader):   409    410     "Reading term dictionary index details from files."   411    412     def reset(self):   413         TermReader.reset(self)   414         self.last_info_offset = 0   415    416     def read_term(self):   417    418         """   419         Read a term, its position file offset, its frequency, and its term   420         information file offset from the term dictionary index file.   421         """   422    423         term, offset, frequency = TermReader.read_term(self)   424    425         # Read the offset delta.   426    427         self.last_info_offset += self.read_number()   428    429         return term, offset, frequency, self.last_info_offset   430    431 class TermDictionaryWriter:   432    433     "Writing term dictionaries."   434    435     def __init__(self, info_writer, index_writer, position_writer, interval):   436         self.info_writer = info_writer   437         self.index_writer = index_writer   438         self.position_writer = position_writer   439         self.interval = interval   440         self.entry = 0   441    442     def _write_term(self, term, offset, frequency):   443    444         """   445         Write the given 'term', its position file 'offset', and its 'frequency'   446         to the term information file and optionally to the index, making a   447         dictionary entry.   448         """   449    450         info_offset = self.info_writer.write_term(term, offset, frequency)   451    452         if self.entry % self.interval == 0:   453             self.index_writer.write_term(term, offset, frequency, info_offset)   454    455         self.entry += 1   456    457     def write_term_positions(self, term, doc_positions):   458    459         """   460         Write the given 'term' and the 'doc_positions' recording the documents   461         and positions at which the term is found.   462         """   463    464         offset, frequency = self.position_writer.write_all_positions(doc_positions)   465         self._write_term(term, offset, frequency)   466    467     def close(self):   468         self.info_writer.close()   469         self.index_writer.close()   470         self.position_writer.close()   471    472 class TermDictionaryReader:   473    474     "Reading term dictionaries."   475    476     def __init__(self, info_reader, index_reader, position_reader):   477         self.info_reader = info_reader   478         self.index_reader = index_reader   479         self.position_reader = position_reader   480    481         self.terms = []   482         try:   483             while 1:   484                 self.terms.append(self.index_reader.read_term())   485         except EOFError:   486             pass   487    488         # Large numbers for ordering purposes.   489    490         self.max_offset = self.terms[-1][1]   491         self.max_info_offset = self.terms[-1][2]   492    493     def _find_term(self, term):   494    495         """   496         Find the position file offset and frequency of 'term' from the term   497         dictionary.   498         """   499    500         i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1   501    502         # Get the entry position providing the term or one preceding it.   503    504         if i == -1:   505             return None   506    507         found_term, offset, frequency, info_offset = self.terms[i]   508    509         # Where the term is found immediately, return the offset.   510    511         if term == found_term:   512             return offset, frequency   513    514         # Otherwise, seek past the index term's entry in the information file   515         # and scan for the desired term.   516    517         else:   518             self.info_reader.go_to_term(found_term, offset, info_offset)   519             try:   520                 while term > found_term:   521                     found_term, offset, frequency = self.info_reader.read_term()   522             except EOFError:   523                 pass   524    525             # If the term is found, return the offset and frequency.   526    527             if term == found_term:   528                 return offset, frequency   529             else:   530                 return None   531    532     def find_positions(self, term):   533    534         "Return the documents and positions at which the given 'term' is found."   535    536         t = self._find_term(term)   537         if t is None:   538             return None   539         else:   540             offset, frequency = t   541             return self.position_reader.read_all_positions(offset)   542    543     def get_frequency(self, term):   544    545         "Return the frequency of the given 'term'."   546    547         t = self._find_term(term)   548         if t is None:   549             return None   550         else:   551             offset, frequency = t   552             return frequency   553    554     def close(self):   555         self.info_reader.close()   556         self.index_reader.close()   557         self.position_reader.close()   558    559 # Specific classes for storing document information.   560    561 class FieldWriter(FileWriter):   562    563     "Writing field data to files."   564    565     def reset(self):   566         self.last_docnum = 0   567    568     def write_fields(self, docnum, fields):   569    570         """   571         Write for the given 'docnum', a list of 'fields' (strings representing   572         field values). Return the offset at which the fields are stored.   573         """   574    575         offset = self.f.tell()   576    577         # Write the document number delta.   578    579         self.write_number(docnum - self.last_docnum)   580    581         # Write the number of fields.   582    583         self.write_number(len(fields))   584    585         # Write the fields themselves.   586    587         for field in fields:   588             self.write_string(field, 1) # compress   589    590         self.last_docnum = docnum   591         return offset   592    593 class FieldReader(FileReader):   594    595     "Reading field data from files."   596    597     def reset(self):   598         self.last_docnum = 0   599    600     def read_fields(self):   601    602         """   603         Read fields from the file, returning a tuple containing the document   604         number and a list of field values.   605         """   606    607         # Read the document number.   608    609         self.last_docnum += self.read_number()   610    611         # Read the number of fields.   612    613         nfields = self.read_number()   614    615         # Collect the fields.   616    617         fields = []   618         i = 0   619    620         while i < nfields:   621             fields.append(self.read_string(1)) # decompress   622             i += 1   623    624         return self.last_docnum, fields   625    626     def read_document_fields(self, docnum, offset):   627    628         """   629         Read fields for 'docnum' at the given 'offset'. This permits the   630         retrieval of details for the specified document, as well as scanning for   631         later documents.   632         """   633    634         self.f.seek(offset)   635         bad_docnum, fields = self.read_fields()   636         self.last_docnum = docnum   637         return docnum, fields   638            639 class FieldIndexWriter(FileWriter):   640    641     "Writing field index details to files."   642    643     def reset(self):   644         self.last_docnum = 0   645         self.last_offset = 0   646    647     def write_document(self, docnum, offset):   648    649         """   650         Write for the given 'docnum', the 'offset' at which the fields for the   651         document are stored in the fields file.   652         """   653    654         # Write the document number and offset deltas.   655    656         self.write_number(docnum - self.last_docnum)   657         self.write_number(offset - self.last_offset)   658    659         self.last_docnum = docnum   660         self.last_offset = offset   661    662 class FieldIndexReader(FileReader):   663    664     "Reading field index details from files."   665    666     def reset(self):   667         self.last_docnum = 0   668         self.last_offset = 0   669    670     def read_document(self):   671    672         "Read a document number and field file offset."   673    674         # Read the document number delta and offset.   675    676         self.last_docnum += self.read_number()   677         self.last_offset += self.read_number()   678    679         return self.last_docnum, self.last_offset   680    681 class FieldDictionaryWriter:   682    683     "Writing field dictionary details."   684    685     def __init__(self, field_writer, field_index_writer, interval):   686         self.field_writer = field_writer   687         self.field_index_writer = field_index_writer   688         self.interval = interval   689         self.entry = 0   690    691     def write_fields(self, docnum, fields):   692    693         "Write details of the document with the given 'docnum' and 'fields'."   694    695         offset = self.field_writer.write_fields(docnum, fields)   696    697         if self.entry % self.interval == 0:   698             self.field_index_writer.write_document(docnum, offset)   699    700         self.entry += 1   701    702     def close(self):   703         self.field_writer.close()   704         self.field_index_writer.close()   705    706 class FieldDictionaryReader:   707    708     "Reading field dictionary details."   709    710     def __init__(self, field_reader, field_index_reader):   711         self.field_reader = field_reader   712         self.field_index_reader = field_index_reader   713    714         self.docs = []   715         try:   716             while 1:   717                 self.docs.append(self.field_index_reader.read_document())   718         except EOFError:   719             pass   720    721         # Large numbers for ordering purposes.   722    723         self.max_offset = self.docs[-1][1]   724    725     def read_fields(self, docnum):   726    727         "Read the fields of the document with the given 'docnum'."   728    729         i = bisect_right(self.docs, (docnum, self.max_offset)) - 1   730    731         # Get the entry position providing the term or one preceding it.   732    733         if i == -1:   734             return None   735    736         found_docnum, offset = self.docs[i]   737    738         # Read from the fields file.   739    740         found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)   741    742         # Scan for the document, if necessary.   743    744         try:   745             while docnum > found_docnum:   746                 found_docnum, fields = self.field_reader.read_fields()   747         except EOFError:   748             pass   749    750         # If the document is found, return the fields.   751    752         if docnum == found_docnum:   753             return fields   754         else:   755             return None   756    757     def close(self):   758         self.field_reader.close()   759         self.field_index_reader.close()   760    761 # High-level classes.   762    763 class IndexWriter:   764    765     """   766     Building term information and writing it to the term and field dictionaries.   767     """   768    769     def __init__(self, dict_writer, field_dict_writer):   770         self.dict_writer = dict_writer   771         self.field_dict_writer = field_dict_writer   772         self.terms = {}   773         self.docs = {}   774    775     def add_position(self, term, docnum, position):   776    777         """   778         Add a position entry for the given 'term' in the document with the given   779         'docnum', indicating the given 'position'.   780         """   781    782         if not self.terms.has_key(term):   783             doc_positions = self.terms[term] = {}   784         else:   785             doc_positions = self.terms[term]   786    787         if not doc_positions.has_key(docnum):   788             doc = doc_positions[docnum] = []   789         else:   790             doc = doc_positions[docnum]   791    792         doc.append(position)   793    794     def add_fields(self, docnum, fields):   795    796         "Add for the document with the given 'docnum' a list of 'fields'."   797    798         if not self.docs.has_key(docnum):   799             doc_fields = self.docs[docnum] = fields   800         else:   801             self.docs[docnum] += fields   802    803     def close(self):   804         if self.dict_writer is None:   805             return   806    807         # Get the terms in order.   808    809         terms = self.terms.items()   810         terms.sort()   811    812         for term, doc_positions in terms:   813             doc_positions = doc_positions.items()   814             doc_positions.sort()   815             self.dict_writer.write_term_positions(term, doc_positions)   816    817         self.dict_writer.close()   818         self.dict_writer = None   819    820         # Get the documents in order.   821    822         docs = self.docs.items()   823         docs.sort()   824    825         for docnum, fields in docs:   826             self.field_dict_writer.write_fields(docnum, fields)   827    828         self.field_dict_writer.close()   829         self.field_dict_writer = None   830    831 class IndexReader:   832    833     "Accessing the term and field dictionaries."   834    835     def __init__(self, dict_reader, field_dict_reader):   836         self.dict_reader = dict_reader   837         self.field_dict_reader = field_dict_reader   838    839     def find_positions(self, term):   840         return self.dict_reader.find_positions(term)   841    842     def get_frequency(self, term):   843         return self.dict_reader.get_frequency(term)   844    845     def get_fields(self, docnum):   846         return self.field_dict_reader.read_fields(docnum)   847    848     def close(self):   849         self.dict_reader.close()   850         self.field_dict_reader.close()   851    852 class Index:   853    854     "An inverted index solution encapsulating the various components."   855    856     def __init__(self, pathname):   857         self.pathname = pathname   858         self.reader = None   859         self.writer = None   860    861     def get_writer(self, interval=INTERVAL):   862    863         "Return a writer, optionally using the given indexing 'interval'."   864    865         if not exists(self.pathname):   866             mkdir(self.pathname)   867    868         tdf = open(join(self.pathname, "terms"), "wb")   869         info_writer = TermWriter(tdf)   870    871         tdif = open(join(self.pathname, "index"), "wb")   872         index_writer = TermIndexWriter(tdif)   873    874         tpf = open(join(self.pathname, "positions"), "wb")   875         positions_writer = PositionWriter(tpf)   876    877         dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)   878    879         ff = open(join(self.pathname, "fields"), "wb")   880         field_writer = FieldWriter(ff)   881    882         fif = open(join(self.pathname, "fields_index"), "wb")   883         field_index_writer = FieldIndexWriter(fif)   884    885         field_dict_writer = FieldDictionaryWriter(field_writer, field_index_writer, interval)   886    887         self.writer = IndexWriter(dict_writer, field_dict_writer)   888         return self.writer   889    890     def get_reader(self):   891    892         "Return a reader for the index."   893    894         if not exists(self.pathname):   895             raise OSError, "Index path %r does not exist." % self.pathname   896    897         tdf = open(join(self.pathname, "terms"), "rb")   898         info_reader = TermReader(tdf)   899    900         tdif = open(join(self.pathname, "index"), "rb")   901         index_reader = TermIndexReader(tdif)   902    903         tpf = open(join(self.pathname, "positions"), "rb")   904         positions_reader = PositionReader(tpf)   905    906         dict_reader = TermDictionaryReader(info_reader, index_reader, positions_reader)   907    908         ff = open(join(self.pathname, "fields"), "rb")   909         field_reader = FieldReader(ff)   910    911         fif = open(join(self.pathname, "fields_index"), "rb")   912         field_index_reader = FieldIndexReader(fif)   913    914         field_dict_reader = FieldDictionaryReader(field_reader, field_index_reader)   915    916         self.reader = IndexReader(dict_reader, field_dict_reader)   917         return self.reader   918    919     def close(self):   920         if self.reader is not None:   921             self.reader.close()   922             self.reader = None   923         if self.writer is not None:   924             self.writer.close()   925             self.writer = None   926    927 # vim: tabstop=4 expandtab shiftwidth=4