iixr (file iixr.py at fe7ed6b96612)

     1 #!/usr/bin/env python     2      3 """     4 A simple (and sane) text indexing library.     5      6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from os import mkdir             # to determine whether to create indexes    22 from os.path import exists, join    23 from os.path import commonprefix # to find common string prefixes    24 from bisect import bisect_right  # to find terms in the dictionary index    25 import bz2                       # for field compression    26     27 # Constants.    28     29 INTERVAL = 100    30     31 # Foundation classes.    32     33 class File:    34     35     "A basic file abstraction."    36     37     def __init__(self, f):    38         self.f = f    39         self.reset()    40     41     def reset(self):    42         pass    43     44     def close(self):    45         if self.f is not None:    46             self.f.close()    47             self.f = None    48     49 class FileWriter(File):    50     51     "Writing basic data types to files."    52     53     def write_number(self, number):    54     55         "Write 'number' to the file using a variable length encoding."    56     57         # Negative numbers are not supported.    58     59         if number < 0:    60             raise ValueError, "Number %r is negative." % number    61     62         # Special case: one byte containing zero.    63     64         elif number == 0:    65             self.f.write(chr(0))    66             return    67     68         # Write the number from least to most significant digits.    69     70         bytes = []    71     72         while number != 0:    73             lsd = number & 127    74             number = number >> 7    75             if number != 0:    76                 lsd |= 128    77             bytes.append(chr(lsd))    78     79         record = "".join(bytes)    80         self.f.write(record)    81     82     def write_string(self, s, compress=0):    83     84         """    85         Write 's' to the file, recording its length and compressing the string    86         if 'compress' is set to a true value.    87         """    88     89         # Convert Unicode objects to strings.    90     91         if isinstance(s, unicode):    92             s = s.encode("utf-8")    93     94         # Compress the string if requested.    95     96         if compress:    97             s = bz2.compress(s)    98     99         # Write the length of the data before the data itself.   100    101         length = len(s)   102         self.write_number(length)   103         self.f.write(s)   104    105 class FileReader(File):   106    107     "Reading basic data types from files."   108    109     def read_number(self):   110    111         "Read a number from the file."   112    113         # Read each byte, adding it to the number.   114    115         shift = 0   116         number = 0   117         more = 1   118    119         while more:   120             byte = self.f.read(1)   121             if not byte:   122                 raise EOFError   123    124             csd = ord(byte)   125             more = csd & 128 != 0   126             if more:   127                 csd &= 127   128             number += (csd << shift)   129             shift += 7   130    131         return number   132    133     def read_string(self, decompress=0):   134    135         """   136         Read a string from the file, decompressing the stored data if   137         'decompress' is set to a true value.   138         """   139    140         length = self.read_number()   141         s = self.f.read(length)   142    143         # Decompress the data if requested.   144    145         if decompress:   146             s = bz2.decompress(s)   147    148         # Convert strings to Unicode objects.   149    150         return unicode(s, "utf-8")   151    152 # Specific classes for storing term and position information.   153    154 class PositionWriter(FileWriter):   155    156     "Writing position information to files."   157    158     def reset(self):   159         self.last_docnum = 0   160    161     def write_positions(self, docnum, positions):   162    163         "Write for the document 'docnum' the given 'positions'."   164    165         if docnum < self.last_docnum:   166             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)   167    168         # Write the document number delta.   169    170         self.write_number(docnum - self.last_docnum)   171    172         # Write the number of positions.   173    174         self.write_number(len(positions))   175    176         # Make sure that the positions are sorted.   177    178         positions.sort()   179    180         # Write the position deltas.   181    182         last = 0   183         for position in positions:   184             pos = position - last   185             self.write_number(pos)   186             last = position   187    188         self.last_docnum = docnum   189    190     def write_all_positions(self, doc_positions):   191    192         """   193         Write all 'doc_positions' - a collection of tuples of the form (document   194         number, position list) - to the file, returning the offset at which they   195         were stored.   196         """   197    198         # Reset the writer and record the current file offset.   199    200         self.reset()   201         offset = self.f.tell()   202    203         # Write the number of documents.   204    205         self.write_number(len(doc_positions))   206    207         # Write the positions.   208    209         for docnum, positions in doc_positions:   210             self.write_positions(docnum, positions)   211    212         return offset   213    214 class PositionReader(FileReader):   215    216     "Reading position information from files."   217    218     def reset(self):   219         self.last_docnum = 0   220    221     def read_positions(self):   222    223         "Read positions, returning a document number and a list of positions."   224    225         # Read the document number delta and add it to the last number.   226    227         self.last_docnum += self.read_number()   228    229         # Read the number of positions.   230    231         npositions = self.read_number()   232    233         # Read the position deltas, adding each previous position to get the   234         # appropriate collection of absolute positions.   235    236         i = 0   237         last = 0   238         positions = []   239    240         while i < npositions:   241             last += self.read_number()   242             positions.append(last)   243             i += 1   244    245         return self.last_docnum, positions   246    247     def read_all_positions(self, offset):   248    249         """   250         Read all positions from 'offset', seeking to that position in the file   251         before reading.   252         """   253    254         self.reset()   255         self.f.seek(offset)   256    257         # Read the number of documents.   258    259         ndocuments = self.read_number()   260    261         # Read all records.   262    263         i = 0   264         doc_positions = []   265    266         while i < ndocuments:   267             doc_positions.append(self.read_positions())   268             i += 1   269    270         return doc_positions   271    272 class TermWriter(FileWriter):   273    274     "Writing term information to files."   275    276     def reset(self):   277         self.last_term = ""   278         self.last_offset = 0   279    280     def write_term(self, term, offset):   281    282         """   283         Write the given 'term' and its position file 'offset' to the term   284         information file. Return the offset after the term information was   285         written to the file.   286         """   287    288         # Too long terms are not currently supported.   289    290         if len(term) > 255:   291             raise ValueError, "Term %r is too long." % term   292    293         # Write the prefix length and term suffix.   294    295         common = len(commonprefix([self.last_term, term]))   296         suffix = term[common:]   297    298         self.write_number(common)   299         self.write_string(suffix)   300    301         # Write the offset delta.   302    303         self.write_number(offset - self.last_offset)   304    305         self.last_term = term   306         self.last_offset = offset   307    308         return self.f.tell()   309    310 class TermReader(FileReader):   311    312     "Reading term information from files."   313    314     def reset(self):   315         self.last_term = ""   316         self.last_offset = 0   317    318     def read_term(self):   319    320         """   321         Read a term and its position file offset from the term information file.   322         """   323    324         # Read the prefix length and term suffix.   325    326         common = self.read_number()   327         suffix = self.read_string()   328    329         self.last_term = self.last_term[:common] + suffix   330    331         # Read the offset delta.   332    333         self.last_offset += self.read_number()   334    335         return self.last_term, self.last_offset   336    337     def go_to_term(self, term, offset, info_offset):   338    339         """   340         Seek past the entry for 'term' having 'offset' to 'info_offset'. This   341         permits the scanning for later terms from the specified term.   342         """   343    344         self.f.seek(info_offset)   345         self.last_term = term   346         self.last_offset = offset   347    348 class TermIndexWriter(TermWriter):   349    350     "Writing term dictionary index details to files."   351    352     def reset(self):   353         TermWriter.reset(self)   354         self.last_info_offset = 0   355    356     def write_term(self, term, offset, info_offset):   357    358         """   359         Write the given 'term' and its position file 'offset' to the term   360         dictionary index file, along with the 'info_offset' in the term   361         information file.   362         """   363    364         TermWriter.write_term(self, term, offset)   365    366         # Write the information file offset delta.   367    368         self.write_number(info_offset - self.last_info_offset)   369         self.last_info_offset = info_offset   370    371 class TermIndexReader(TermReader):   372    373     "Reading term dictionary index details from files."   374    375     def reset(self):   376         TermReader.reset(self)   377         self.last_info_offset = 0   378    379     def read_term(self):   380    381         """   382         Read a term, its position file offset, and its term information file   383         offset from the term dictionary index file.   384         """   385    386         term, offset = TermReader.read_term(self)   387    388         # Read the offset delta.   389    390         self.last_info_offset += self.read_number()   391    392         return term, offset, self.last_info_offset   393    394 class TermDictionaryWriter:   395    396     "Writing term dictionaries."   397    398     def __init__(self, info_writer, index_writer, position_writer, interval):   399         self.info_writer = info_writer   400         self.index_writer = index_writer   401         self.position_writer = position_writer   402         self.interval = interval   403         self.entry = 0   404    405     def _write_term(self, term, offset):   406    407         """   408         Write the given 'term' and its position file 'offset' to the term   409         information file and optionally to the index, making a dictionary entry.   410         """   411    412         info_offset = self.info_writer.write_term(term, offset)   413    414         if self.entry % self.interval == 0:   415             self.index_writer.write_term(term, offset, info_offset)   416    417         self.entry += 1   418    419     def write_term_positions(self, term, doc_positions):   420    421         """   422         Write the given 'term' and the 'doc_positions' recording the documents   423         and positions at which the term is found.   424         """   425    426         offset = self.position_writer.write_all_positions(doc_positions)   427         self._write_term(term, offset)   428    429     def close(self):   430         self.info_writer.close()   431         self.index_writer.close()   432         self.position_writer.close()   433    434 class TermDictionaryReader:   435    436     "Reading term dictionaries."   437    438     def __init__(self, info_reader, index_reader, position_reader):   439         self.info_reader = info_reader   440         self.index_reader = index_reader   441         self.position_reader = position_reader   442    443         self.terms = []   444         try:   445             while 1:   446                 self.terms.append(self.index_reader.read_term())   447         except EOFError:   448             pass   449    450         # Large numbers for ordering purposes.   451    452         self.max_offset = self.terms[-1][1]   453         self.max_info_offset = self.terms[-1][2]   454    455     def _find_term(self, term):   456    457         "Find the position file offset of 'term' from the term dictionary."   458    459         i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1   460    461         # Get the entry position providing the term or one preceding it.   462    463         if i == -1:   464             return None   465    466         found_term, offset, info_offset = self.terms[i]   467    468         # Where the term is found immediately, return the offset.   469    470         if term == found_term:   471             return offset   472    473         # Otherwise, seek past the index term's entry in the information file   474         # and scan for the desired term.   475    476         else:   477             self.info_reader.go_to_term(found_term, offset, info_offset)   478             try:   479                 while term > found_term:   480                     found_term, offset = self.info_reader.read_term()   481             except EOFError:   482                 pass   483    484             # If the term is found, return the offset.   485    486             if term == found_term:   487                 return offset   488             else:   489                 return None   490    491     def find_positions(self, term):   492    493         "Return the documents and positions at which the given 'term' is found."   494    495         offset = self._find_term(term)   496         if offset is None:   497             return None   498         else:   499             return self.position_reader.read_all_positions(offset)   500    501     def close(self):   502         self.info_reader.close()   503         self.index_reader.close()   504         self.position_reader.close()   505    506 # Specific classes for storing document information.   507    508 class FieldWriter(FileWriter):   509    510     "Writing field data to files."   511    512     def reset(self):   513         self.last_docnum = 0   514    515     def write_fields(self, docnum, fields):   516    517         """   518         Write for the given 'docnum', a list of 'fields' (strings representing   519         field values). Return the offset at which the fields are stored.   520         """   521    522         offset = self.f.tell()   523    524         # Write the document number delta.   525    526         self.write_number(docnum - self.last_docnum)   527    528         # Write the number of fields.   529    530         self.write_number(len(fields))   531    532         # Write the fields themselves.   533    534         for field in fields:   535             self.write_string(field, 0) # compress   536    537         self.last_docnum = docnum   538         return offset   539    540 class FieldReader(FileReader):   541    542     "Reading field data from files."   543    544     def reset(self):   545         self.last_docnum = 0   546    547     def read_fields(self):   548    549         """   550         Read fields from the file, returning a tuple containing the document   551         number and a list of field values.   552         """   553    554         # Read the document number.   555    556         self.last_docnum += self.read_number()   557    558         # Read the number of fields.   559    560         nfields = self.read_number()   561    562         # Collect the fields.   563    564         fields = []   565         i = 0   566    567         while i < nfields:   568             fields.append(self.read_string(0)) # decompress   569             i += 1   570    571         return self.last_docnum, fields   572    573     def read_document_fields(self, docnum, offset):   574    575         """   576         Read fields for 'docnum' at the given 'offset'. This permits the   577         retrieval of details for the specified document, as well as scanning for   578         later documents.   579         """   580    581         self.f.seek(offset)   582         bad_docnum, fields = self.read_fields()   583         self.last_docnum = docnum   584         return docnum, fields   585            586 class FieldIndexWriter(FileWriter):   587    588     "Writing field index details to files."   589    590     def reset(self):   591         self.last_docnum = 0   592    593     def write_document(self, docnum, offset):   594    595         """   596         Write for the given 'docnum', the 'offset' at which the fields for the   597         document are stored in the fields file.   598         """   599    600         # Write the document number delta and offset.   601    602         self.write_number(docnum - self.last_docnum)   603         self.write_number(offset)   604    605         self.last_docnum = docnum   606    607 class FieldIndexReader(FileReader):   608    609     "Reading field index details from files."   610    611     def reset(self):   612         self.last_docnum = 0   613    614     def read_document(self):   615    616         "Read a document number and field file offset."   617    618         # Read the document number delta and offset.   619    620         self.last_docnum += self.read_number()   621         offset = self.read_number()   622    623         return self.last_docnum, offset   624    625 class FieldDictionaryWriter:   626    627     "Writing field dictionary details."   628    629     def __init__(self, field_writer, field_index_writer, interval):   630         self.field_writer = field_writer   631         self.field_index_writer = field_index_writer   632         self.interval = interval   633         self.entry = 0   634    635     def write_fields(self, docnum, fields):   636    637         "Write details of the document with the given 'docnum' and 'fields'."   638    639         offset = self.field_writer.write_fields(docnum, fields)   640    641         if self.entry % self.interval == 0:   642             self.field_index_writer.write_document(docnum, offset)   643    644         self.entry += 1   645    646     def close(self):   647         self.field_writer.close()   648         self.field_index_writer.close()   649    650 class FieldDictionaryReader:   651    652     "Reading field dictionary details."   653    654     def __init__(self, field_reader, field_index_reader):   655         self.field_reader = field_reader   656         self.field_index_reader = field_index_reader   657    658         self.docs = []   659         try:   660             while 1:   661                 self.docs.append(self.field_index_reader.read_document())   662         except EOFError:   663             pass   664    665         # Large numbers for ordering purposes.   666    667         self.max_offset = self.docs[-1][1]   668    669     def read_fields(self, docnum):   670    671         "Read the fields of the document with the given 'docnum'."   672    673         i = bisect_right(self.docs, (docnum, self.max_offset)) - 1   674    675         # Get the entry position providing the term or one preceding it.   676    677         if i == -1:   678             return None   679    680         found_docnum, offset = self.docs[i]   681    682         # Read from the fields file.   683    684         found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)   685    686         # Scan for the document, if necessary.   687    688         try:   689             while docnum > found_docnum:   690                 found_docnum, fields = self.field_reader.read_fields()   691         except EOFError:   692             pass   693    694         # If the document is found, return the fields.   695    696         if docnum == found_docnum:   697             return fields   698         else:   699             return None   700    701     def close(self):   702         self.field_reader.close()   703         self.field_index_reader.close()   704    705 # High-level classes.   706    707 class IndexWriter:   708    709     "Building term information and writing it to the term dictionary."   710    711     def __init__(self, dict_writer):   712         self.dict_writer = dict_writer   713         self.terms = {}   714    715     def add_position(self, term, docnum, position):   716    717         """   718         Add a position entry for the given 'term' in the document with the given   719         'docnum', indicating the given 'position'.   720         """   721    722         if not self.terms.has_key(term):   723             doc_positions = self.terms[term] = {}   724         else:   725             doc_positions = self.terms[term]   726    727         if not doc_positions.has_key(docnum):   728             doc = doc_positions[docnum] = []   729         else:   730             doc = doc_positions[docnum]   731    732         doc.append(position)   733    734     def close(self):   735         if self.dict_writer is None:   736             return   737    738         # Get the terms in order.   739    740         terms = self.terms.items()   741         terms.sort()   742    743         for term, doc_positions in terms:   744             doc_positions = doc_positions.items()   745             doc_positions.sort()   746             self.dict_writer.write_term_positions(term, doc_positions)   747    748         self.dict_writer.close()   749         self.dict_writer = None   750    751 class Index:   752    753     "An inverted index solution encapsulating the various components."   754    755     def __init__(self, pathname):   756         self.pathname = pathname   757         self.reader = None   758         self.writer = None   759    760     def get_writer(self, interval=INTERVAL):   761    762         "Return a writer, optionally using the given indexing 'interval'."   763    764         if not exists(self.pathname):   765             mkdir(self.pathname)   766    767         tdf = open(join(self.pathname, "terms"), "wb")   768         info_writer = TermWriter(tdf)   769    770         tdif = open(join(self.pathname, "index"), "wb")   771         index_writer = TermIndexWriter(tdif)   772    773         tpf = open(join(self.pathname, "positions"), "wb")   774         positions_writer = PositionWriter(tpf)   775    776         dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)   777    778         self.writer = IndexWriter(dict_writer)   779         return self.writer   780    781     def get_reader(self):   782    783         "Return a reader for the index."   784    785         if not exists(self.pathname):   786             raise OSError, "Index path %r does not exist." % self.pathname   787    788         tdf = open(join(self.pathname, "terms"), "rb")   789         info_reader = TermReader(tdf)   790    791         tdif = open(join(self.pathname, "index"), "rb")   792         index_reader = TermIndexReader(tdif)   793    794         tpf = open(join(self.pathname, "positions"), "rb")   795         positions_reader = PositionReader(tpf)   796    797         self.reader = TermDictionaryReader(info_reader, index_reader, positions_reader)   798         return self.reader   799    800     def close(self):   801         if self.reader is not None:   802             self.reader.close()   803             self.reader = None   804         if self.writer is not None:   805             self.writer.close()   806             self.writer = None   807    808 # vim: tabstop=4 expandtab shiftwidth=4