# HG changeset patch
# User Paul Boddie
# Date 1251745350 -7200
# Node ID 1e7ca36202ef5bbf0ac39ad19863eeb4ac14710e
# Parent e6f4e8e226a0a797183ff5821e1674bb3e5dbf0e
Introduced a positions iterator to permit incremental reading of position information.

diff -r e6f4e8e226a0 -r 1e7ca36202ef iixr.py
--- a/iixr.py Mon Aug 31 20:24:41 2009 +0200
+++ b/iixr.py Mon Aug 31 21:02:30 2009 +0200
@@ -293,20 +293,44 @@
         self.reset()
         self.f.seek(offset)
 
+        # Could duplicate the file handle using...
+        # fdopen(dup(self.f.fileno()), "rb")
+
+        return PositionIterator(self.f)
+
+class PositionIterator(PositionReader):
+
+    "Iterating over document positions."
+
+    def __init__(self, f):
+        PositionReader.__init__(self, f)
+
         # Read the number of documents.
 
-        ndocuments = self.read_number()
+        self.ndocuments = self.read_number()
+        self.read_documents = 0
+
+    def __len__(self):
+        return self.ndocuments
 
-        # Read all records.
+    def sort(self):
+
+        "Stored document positions are already sorted."
+
+        pass
 
-        i = 0
-        doc_positions = []
+    def __iter__(self):
+        return self
+
+    def next(self):
 
-        while i < ndocuments:
-            doc_positions.append(self.read_positions())
-            i += 1
+        "Read positions for a single document."
 
-        return doc_positions
+        if self.read_documents < self.ndocuments:
+            self.read_documents += 1
+            return self.read_positions()
+        else:
+            raise StopIteration
 
 class TermWriter(FileWriter):
 
diff -r e6f4e8e226a0 -r 1e7ca36202ef test.py
--- a/test.py Mon Aug 31 20:24:41 2009 +0200
+++ b/test.py Mon Aug 31 21:02:30 2009 +0200
@@ -1,6 +1,20 @@
 #!/usr/bin/env python
 
 import iixr
+import os
+
+# Remove old test files.
+
+for filename in ("test", "testF", "testFI", "testI", "testP"):
+    try:
+        os.remove(filename)
+    except OSError:
+        pass
+
+try:
+    os.removedirs("test_index")
+except OSError:
+    pass
 
 # Test basic data types.
 
@@ -63,7 +77,7 @@
 offsets.reverse()
 all_doc_positions.reverse()
 for offset, doc_positions in zip(offsets, all_doc_positions):
-    dp = r.read_term_positions(offset)
+    dp = list(r.read_term_positions(offset))
     print doc_positions == dp, doc_positions, dp
 r.close()
 
@@ -267,7 +281,7 @@
 terms_reversed = terms_with_positions[:]
 terms_reversed.reverse()
 for term, doc_positions in terms_reversed:
-    dp = rd.find_positions(term)
+    dp = list(rd.find_positions(term))
     print doc_positions == dp, doc_positions, dp
 for term in ("dog", "dingo"):
     dp = rd.find_positions(term)
@@ -278,6 +292,7 @@
 rd.rewind()
 for term, doc_positions in terms_with_positions:
     t, fr, dp = rd.read_term()
+    dp = list(dp)
     print term == t, term, t
     print doc_positions == dp, doc_positions, dp
 rd.close()
@@ -310,7 +325,7 @@
 
 rd = index.get_reader()
 for term, frequency, doc_positions in doc_tests:
-    dp = rd.find_positions(term)
+    dp = list(rd.find_positions(term))
     print doc_positions == dp, doc_positions, dp
     fr = rd.get_frequency(term)
     print frequency == fr, frequency, fr