# HG changeset patch # User Paul Boddie # Date 1251237200 -7200 # Node ID 5ef1ed1945938a5504bb45e6b3d98cd833714abe # Parent 67fdbd4c2a90064d0f6a8f939957ee29589c0547 Added end-of-file detection. Added term dictionary support, including term dictionary index file reading and writing. diff -r 67fdbd4c2a90 -r 5ef1ed194593 iixr.py --- a/iixr.py Tue Aug 25 22:44:15 2009 +0200 +++ b/iixr.py Tue Aug 25 23:53:20 2009 +0200 @@ -19,6 +19,7 @@ """ from os.path import commonprefix # to find common string prefixes +from bisect import bisect_right # to find terms in the dictionary index # Foundation classes. @@ -104,7 +105,7 @@ "Read a number from the file." - nbytes = ord(self.f.read(1)) + nbytes = self.read_unsigned_byte() # Read each byte, adding it to the number. @@ -126,7 +127,11 @@ "Read a number from the file, consuming a single byte." - return ord(self.f.read(1)) + s = self.f.read(1) + if not s: + raise EOFError + + return ord(s) def read_string(self): @@ -263,7 +268,8 @@ """ Write the given 'term' and its position file 'offset' to the term - information file. + information file. Return the offset after the term information was + written to the file. """ # Too long terms are not currently supported. @@ -286,6 +292,8 @@ self.last_term = term self.last_offset = offset + return self.f.tell() + class TermReader(FileReader): "Reading term information from files." @@ -313,4 +321,146 @@ return self.last_term, self.last_offset + def go_to_term(self, term, offset, info_offset): + + "Seek past the entry for 'term' having 'offset' to 'info_offset'." + + self.f.seek(info_offset) + self.last_term = term + self.last_offset = offset + +class TermIndexWriter(TermWriter): + + "Writing term dictionary index details to files." + + def reset(self): + TermWriter.reset(self) + self.last_info_offset = 0 + + def write_term(self, term, offset, info_offset): + + """ + Write the given 'term' and its position file 'offset' to the term + dictionary index file, along with the 'info_offset' in the term + information file. + """ + + TermWriter.write_term(self, term, offset) + + # Write the information file offset delta. + + self.write_number(info_offset - self.last_info_offset) + self.last_info_offset = info_offset + +class TermIndexReader(TermReader): + + "Reading term dictionary index details from files." + + def reset(self): + TermReader.reset(self) + self.last_info_offset = 0 + + def read_term(self): + + """ + Read a term, its position file offset, and its term information file + offset from the term dictionary index file. + """ + + term, offset = TermReader.read_term(self) + + # Read the offset delta. + + self.last_info_offset += self.read_number() + + return term, offset, self.last_info_offset + +class TermDictionaryWriter: + + "Writing term dictionaries." + + def __init__(self, info_writer, index_writer, interval): + self.info_writer = info_writer + self.index_writer = index_writer + self.interval = interval + self.entry = 0 + + def write_term(self, term, offset): + + """ + Write the given 'term' and its position file 'offset' to the term + information file and optionally to the index, making a dictionary entry. + """ + + info_offset = self.info_writer.write_term(term, offset) + + if self.entry % self.interval == 0: + self.index_writer.write_term(term, offset, info_offset) + + self.entry += 1 + + def close(self): + self.info_writer.close() + self.index_writer.close() + +class TermDictionaryReader: + + "Reading term dictionaries." + + def __init__(self, info_reader, index_reader): + self.info_reader = info_reader + self.index_reader = index_reader + + self.terms = [] + try: + while 1: + self.terms.append(self.index_reader.read_term()) + except EOFError: + pass + + # Large numbers for ordering purposes. + + self.max_offset = self.terms[-1][1] + self.max_info_offset = self.terms[-1][2] + + def find(self, term): + + "Find the position file offset of 'term' from the term dictionary." + + i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1 + + # Get the entry position providing the term or one preceding it. + + if i == -1: + return None + + found_term, offset, info_offset = self.terms[i] + + # Where the term is found immediately, return the offset. + + if term == found_term: + return offset + + # Otherwise, seek past the index term's entry in the information file + # and scan for the desired term. + + else: + self.info_reader.go_to_term(found_term, offset, info_offset) + try: + while term > found_term: + found_term, offset = self.info_reader.read_term() + except EOFError: + pass + + # If the term is found, return the offset. + + if term == found_term: + return offset + else: + return None + + def close(self): + self.info_reader.close() + self.index_reader.close() + # vim: tabstop=4 expandtab shiftwidth=4 diff -r 67fdbd4c2a90 -r 5ef1ed194593 test.py --- a/test.py Tue Aug 25 22:44:15 2009 +0200 +++ b/test.py Tue Aug 25 23:53:20 2009 +0200 @@ -10,7 +10,7 @@ w.write_number(number) w.close() -f = open("test", "r") +f = open("test", "rb") r = iixr.FileReader(f) for number in numbers: n = r.read_number() @@ -36,7 +36,7 @@ w.reset() w.close() -f = open("test", "r") +f = open("test", "rb") r = iixr.PositionReader(f) for doc_positions in all_doc_positions: for docnum, positions in doc_positions: @@ -55,7 +55,7 @@ ) w.close() -f = open("test", "r") +f = open("test", "rb") r = iixr.PositionReader(f) offsets.reverse() all_doc_positions.reverse() @@ -79,7 +79,7 @@ w.write_term(term, offset) w.close() -f = open("test", "r") +f = open("test", "rb") r = iixr.TermReader(f) for term, offset in terms: t, o = r.read_term() @@ -87,4 +87,52 @@ print offset == o, offset, o r.close() +indexed_terms = [ + ("aardvark", 100000123, 200000321), + ("anteater", 100000456, 200000654), + ("badger", 100000789, 200000987), + ("bull", 1000001234, 200004321), + ("bulldog", 1000002345, 200005432), + ("cat", 1000003456, 200006543) + ] + +f = open("test", "wb") +w = iixr.TermIndexWriter(f) +for term, offset, info_offset in indexed_terms: + w.write_term(term, offset, info_offset) +w.close() + +f = open("test", "rb") +r = iixr.TermIndexReader(f) +for term, offset, info_offset in indexed_terms: + t, o, i = r.read_term() + print term == t, term, t + print offset == o, offset, o + print info_offset == i, info_offset, i +r.close() + +f = open("test", "wb") +w = iixr.TermWriter(f) +f2 = open("testI", "wb") +w2 = iixr.TermIndexWriter(f2) +w3 = iixr.TermDictionaryWriter(w, w2, 3) +for term, offset in terms: + w3.write_term(term, offset) +w3.close() + +f = open("test", "rb") +r = iixr.TermReader(f) +f2 = open("testI", "rb") +r2 = iixr.TermIndexReader(f2) +r3 = iixr.TermDictionaryReader(r, r2) +terms_reversed = terms[:] +terms_reversed.reverse() +for term, offset in terms_reversed: + o = r3.find(term) + print offset == o, offset, o +for term in ("dog", "dingo"): + o = r3.find(term) + print o is None, o +r3.close() + # vim: tabstop=4 expandtab shiftwidth=4