iixr (file iixr.py at 5e4746613f83)

     1 #!/usr/bin/env python     2      3 """     4 A simple (and sane) text indexing library.     5      6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from os import mkdir             # to determine whether to create indexes    22 from os.path import exists, join    23 from os.path import commonprefix # to find common string prefixes    24 from bisect import bisect_right  # to find terms in the dictionary index    25 import bz2                       # for field compression    26     27 # Constants.    28     29 INTERVAL = 100    30     31 # Foundation classes.    32     33 class File:    34     35     "A basic file abstraction."    36     37     def __init__(self, f):    38         self.f = f    39         self.reset()    40     41     def reset(self):    42         pass    43     44     def close(self):    45         if self.f is not None:    46             self.f.close()    47             self.f = None    48     49 class FileWriter(File):    50     51     "Writing basic data types to files."    52     53     def write_number(self, number):    54     55         "Write 'number' to the file using a variable length encoding."    56     57         # Negative numbers are not supported.    58     59         if number < 0:    60             raise ValueError, "Number %r is negative." % number    61     62         # Special case: one byte containing zero.    63     64         elif number == 0:    65             self.f.write(chr(0))    66             return    67     68         # Write the number from least to most significant digits.    69     70         bytes = []    71     72         while number != 0:    73             lsd = number & 127    74             number = number >> 7    75             if number != 0:    76                 lsd |= 128    77             bytes.append(chr(lsd))    78     79         record = "".join(bytes)    80         self.f.write(record)    81     82     def write_string(self, s, compress=0):    83     84         """    85         Write 's' to the file, recording its length and compressing the string    86         if 'compress' is set to a true value.    87         """    88     89         # Convert Unicode objects to strings.    90     91         if isinstance(s, unicode):    92             s = s.encode("utf-8")    93     94         # Compress the string if requested.    95     96         if compress:    97             s = bz2.compress(s)    98     99         # Write the length of the data before the data itself.   100    101         length = len(s)   102         self.write_number(length)   103         self.f.write(s)   104    105 class FileReader(File):   106    107     "Reading basic data types from files."   108    109     def read_number(self):   110    111         "Read a number from the file."   112    113         # Read each byte, adding it to the number.   114    115         shift = 0   116         number = 0   117         more = 1   118    119         while more:   120             byte = self.f.read(1)   121             if not byte:   122                 raise EOFError   123    124             csd = ord(byte)   125             more = csd & 128 != 0   126             if more:   127                 csd &= 127   128             number += (csd << shift)   129             shift += 7   130    131         return number   132    133     def read_string(self, decompress=0):   134    135         """   136         Read a string from the file, decompressing the stored data if   137         'decompress' is set to a true value.   138         """   139    140         length = self.read_number()   141         s = self.f.read(length)   142    143         # Decompress the data if requested.   144    145         if decompress:   146             s = bz2.decompress(s)   147    148         # Convert strings to Unicode objects.   149    150         return unicode(s, "utf-8")   151    152 # Specific classes.   153    154 class PositionWriter(FileWriter):   155    156     "Writing position information to files."   157    158     def reset(self):   159         self.last_docnum = 0   160    161     def write_positions(self, docnum, positions):   162    163         "Write for the document 'docnum' the given 'positions'."   164    165         if docnum < self.last_docnum:   166             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)   167    168         # Write the document number delta.   169    170         self.write_number(docnum - self.last_docnum)   171    172         # Write the number of positions.   173    174         self.write_number(len(positions))   175    176         # Make sure that the positions are sorted.   177    178         positions.sort()   179    180         # Write the position deltas.   181    182         last = 0   183         for position in positions:   184             pos = position - last   185             self.write_number(pos)   186             last = position   187    188         self.last_docnum = docnum   189    190     def write_all_positions(self, doc_positions):   191    192         """   193         Write all 'doc_positions' - a collection of tuples of the form (document   194         number, position list) - to the file, returning the offset at which they   195         were stored.   196         """   197    198         # Reset the writer and record the current file offset.   199    200         self.reset()   201         offset = self.f.tell()   202    203         # Write the number of documents.   204    205         self.write_number(len(doc_positions))   206    207         # Write the positions.   208    209         for docnum, positions in doc_positions:   210             self.write_positions(docnum, positions)   211    212         return offset   213    214 class PositionReader(FileReader):   215    216     "Reading position information from files."   217    218     def reset(self):   219         self.last_docnum = 0   220    221     def read_positions(self):   222    223         "Read positions, returning a document number and a list of positions."   224    225         # Read the document number delta and add it to the last number.   226    227         self.last_docnum += self.read_number()   228    229         # Read the number of positions.   230    231         npositions = self.read_number()   232    233         # Read the position deltas, adding each previous position to get the   234         # appropriate collection of absolute positions.   235    236         i = 0   237         last = 0   238         positions = []   239    240         while i < npositions:   241             last += self.read_number()   242             positions.append(last)   243             i += 1   244    245         return self.last_docnum, positions   246    247     def read_all_positions(self, offset):   248    249         """   250         Read all positions from 'offset', seeking to that position in the file   251         before reading.   252         """   253    254         self.reset()   255         self.f.seek(offset)   256    257         # Read the number of documents.   258    259         ndocuments = self.read_number()   260    261         # Read all records.   262    263         i = 0   264         doc_positions = []   265    266         while i < ndocuments:   267             doc_positions.append(self.read_positions())   268             i += 1   269    270         return doc_positions   271    272 class TermWriter(FileWriter):   273    274     "Writing term information to files."   275    276     def reset(self):   277         self.last_term = ""   278         self.last_offset = 0   279    280     def write_term(self, term, offset):   281    282         """   283         Write the given 'term' and its position file 'offset' to the term   284         information file. Return the offset after the term information was   285         written to the file.   286         """   287    288         # Too long terms are not currently supported.   289    290         if len(term) > 255:   291             raise ValueError, "Term %r is too long." % term   292    293         # Write the prefix length and term suffix.   294    295         common = len(commonprefix([self.last_term, term]))   296         suffix = term[common:]   297    298         self.write_number(common)   299         self.write_string(suffix)   300    301         # Write the offset delta.   302    303         self.write_number(offset - self.last_offset)   304    305         self.last_term = term   306         self.last_offset = offset   307    308         return self.f.tell()   309    310 class TermReader(FileReader):   311    312     "Reading term information from files."   313    314     def reset(self):   315         self.last_term = ""   316         self.last_offset = 0   317    318     def read_term(self):   319    320         """   321         Read a term and its position file offset from the term information file.   322         """   323    324         # Read the prefix length and term suffix.   325    326         common = self.read_number()   327         suffix = self.read_string()   328    329         self.last_term = self.last_term[:common] + suffix   330    331         # Read the offset delta.   332    333         self.last_offset += self.read_number()   334    335         return self.last_term, self.last_offset   336    337     def go_to_term(self, term, offset, info_offset):   338    339         "Seek past the entry for 'term' having 'offset' to 'info_offset'."   340    341         self.f.seek(info_offset)   342         self.last_term = term   343         self.last_offset = offset   344    345 class TermIndexWriter(TermWriter):   346    347     "Writing term dictionary index details to files."   348    349     def reset(self):   350         TermWriter.reset(self)   351         self.last_info_offset = 0   352    353     def write_term(self, term, offset, info_offset):   354    355         """   356         Write the given 'term' and its position file 'offset' to the term   357         dictionary index file, along with the 'info_offset' in the term   358         information file.   359         """   360    361         TermWriter.write_term(self, term, offset)   362    363         # Write the information file offset delta.   364    365         self.write_number(info_offset - self.last_info_offset)   366         self.last_info_offset = info_offset   367    368 class TermIndexReader(TermReader):   369    370     "Reading term dictionary index details from files."   371    372     def reset(self):   373         TermReader.reset(self)   374         self.last_info_offset = 0   375    376     def read_term(self):   377    378         """   379         Read a term, its position file offset, and its term information file   380         offset from the term dictionary index file.   381         """   382    383         term, offset = TermReader.read_term(self)   384    385         # Read the offset delta.   386    387         self.last_info_offset += self.read_number()   388    389         return term, offset, self.last_info_offset   390    391 class TermDictionaryWriter:   392    393     "Writing term dictionaries."   394    395     def __init__(self, info_writer, index_writer, position_writer, interval):   396         self.info_writer = info_writer   397         self.index_writer = index_writer   398         self.position_writer = position_writer   399         self.interval = interval   400         self.entry = 0   401    402     def write_term(self, term, offset):   403    404         """   405         Write the given 'term' and its position file 'offset' to the term   406         information file and optionally to the index, making a dictionary entry.   407         """   408    409         info_offset = self.info_writer.write_term(term, offset)   410    411         if self.entry % self.interval == 0:   412             self.index_writer.write_term(term, offset, info_offset)   413    414         self.entry += 1   415    416     def write_term_positions(self, term, doc_positions):   417    418         """   419         Write the given 'term' and the 'doc_positions' recording the documents   420         and positions at which the term is found.   421         """   422    423         offset = self.position_writer.write_all_positions(doc_positions)   424         self.write_term(term, offset)   425    426     def close(self):   427         self.info_writer.close()   428         self.index_writer.close()   429         self.position_writer.close()   430    431 class TermDictionaryReader:   432    433     "Reading term dictionaries."   434    435     def __init__(self, info_reader, index_reader, position_reader):   436         self.info_reader = info_reader   437         self.index_reader = index_reader   438         self.position_reader = position_reader   439    440         self.terms = []   441         try:   442             while 1:   443                 self.terms.append(self.index_reader.read_term())   444         except EOFError:   445             pass   446    447         # Large numbers for ordering purposes.   448    449         self.max_offset = self.terms[-1][1]   450         self.max_info_offset = self.terms[-1][2]   451    452     def find_term(self, term):   453    454         "Find the position file offset of 'term' from the term dictionary."   455    456         i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1   457    458         # Get the entry position providing the term or one preceding it.   459    460         if i == -1:   461             return None   462    463         found_term, offset, info_offset = self.terms[i]   464    465         # Where the term is found immediately, return the offset.   466    467         if term == found_term:   468             return offset   469    470         # Otherwise, seek past the index term's entry in the information file   471         # and scan for the desired term.   472    473         else:   474             self.info_reader.go_to_term(found_term, offset, info_offset)   475             try:   476                 while term > found_term:   477                     found_term, offset = self.info_reader.read_term()   478             except EOFError:   479                 pass   480    481             # If the term is found, return the offset.   482    483             if term == found_term:   484                 return offset   485             else:   486                 return None   487    488     def find_positions(self, term):   489    490         "Return the documents and positions at which the given 'term' is found."   491    492         offset = self.find_term(term)   493         if offset is None:   494             return None   495         else:   496             return self.position_reader.read_all_positions(offset)   497    498     def close(self):   499         self.info_reader.close()   500         self.index_reader.close()   501         self.position_reader.close()   502    503 class FieldWriter(FileWriter):   504    505     "Writing field data to files."   506    507     def write_fields(self, fields):   508    509         """   510         Write the given list of 'fields' (strings representing field values).   511         Return the offset at which the fields are stored.   512         """   513    514         offset = self.f.tell()   515    516         # Write the number of fields.   517    518         self.write_number(len(fields))   519    520         # Write the fields themselves.   521    522         for field in fields:   523             self.write_string(field, 0) # compress   524    525         return offset   526    527 class FieldReader(FileReader):   528    529     "Reading field data from files."   530    531     def read_fields(self):   532    533         "Read fields from the file, returning the field values in a list."   534    535         # Read the number of fields.   536    537         nfields = self.read_number()   538    539         # Collect the fields.   540    541         fields = []   542         i = 0   543    544         while i < nfields:   545             fields.append(self.read_string(0)) # decompress   546             i += 1   547    548         return fields   549    550     def read_doc_fields(self, offset):   551    552         "Read all fields at the given 'offset."   553    554         self.f.seek(offset)   555         return self.read_fields()   556    557 # High-level classes.   558    559 class IndexWriter:   560    561     "Building term information and writing it to the term dictionary."   562    563     def __init__(self, dict_writer):   564         self.dict_writer = dict_writer   565         self.terms = {}   566    567     def add_position(self, term, docnum, position):   568    569         """   570         Add a position entry for the given 'term' in the document with the given   571         'docnum', indicating the given 'position'.   572         """   573    574         if not self.terms.has_key(term):   575             doc_positions = self.terms[term] = {}   576         else:   577             doc_positions = self.terms[term]   578    579         if not doc_positions.has_key(docnum):   580             doc = doc_positions[docnum] = []   581         else:   582             doc = doc_positions[docnum]   583    584         doc.append(position)   585    586     def close(self):   587         if self.dict_writer is None:   588             return   589    590         # Get the terms in order.   591    592         terms = self.terms.items()   593         terms.sort()   594    595         for term, doc_positions in terms:   596             doc_positions = doc_positions.items()   597             doc_positions.sort()   598             self.dict_writer.write_term_positions(term, doc_positions)   599    600         self.dict_writer.close()   601         self.dict_writer = None   602    603 class Index:   604    605     "An inverted index solution encapsulating the various components."   606    607     def __init__(self, pathname):   608         self.pathname = pathname   609         self.reader = None   610         self.writer = None   611    612     def get_writer(self, interval=INTERVAL):   613    614         "Return a writer, optionally using the given indexing 'interval'."   615    616         if not exists(self.pathname):   617             mkdir(self.pathname)   618    619         tdf = open(join(self.pathname, "terms"), "wb")   620         info_writer = TermWriter(tdf)   621    622         tdif = open(join(self.pathname, "index"), "wb")   623         index_writer = TermIndexWriter(tdif)   624    625         tpf = open(join(self.pathname, "positions"), "wb")   626         positions_writer = PositionWriter(tpf)   627    628         dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)   629    630         self.writer = IndexWriter(dict_writer)   631         return self.writer   632    633     def get_reader(self):   634    635         "Return a reader for the index."   636    637         if not exists(self.pathname):   638             raise OSError, "Index path %r does not exist." % self.pathname   639    640         tdf = open(join(self.pathname, "terms"), "rb")   641         info_reader = TermReader(tdf)   642    643         tdif = open(join(self.pathname, "index"), "rb")   644         index_reader = TermIndexReader(tdif)   645    646         tpf = open(join(self.pathname, "positions"), "rb")   647         positions_reader = PositionReader(tpf)   648    649         self.reader = TermDictionaryReader(info_reader, index_reader, positions_reader)   650         return self.reader   651    652     def close(self):   653         if self.reader is not None:   654             self.reader.close()   655             self.reader = None   656         if self.writer is not None:   657             self.writer.close()   658             self.writer = None   659    660 # vim: tabstop=4 expandtab shiftwidth=4