# HG changeset patch # User Paul Boddie # Date 1297644837 -3600 # Node ID 52bfb08d5b2f539118a72a3bbc8a4eb20649f194 # Parent cf159c14882af2b676ca9c5d6580f62940e97573 Fix the index bisection and attempt to prevent unnecessary seeking and scanning. diff -r cf159c14882a -r 52bfb08d5b2f iixr/terms.py --- a/iixr/terms.py Mon Feb 14 00:44:57 2011 +0100 +++ b/iixr/terms.py Mon Feb 14 01:53:57 2011 +0100 @@ -381,18 +381,38 @@ self.reader = reader self.index_reader = index_reader self.records = list(index_reader) + self.terms = [t for t, dp in self.records] + + # Cache the last term and positions. + + self.last_term = None + self.last_positions = None def go_to_term(self, term): + """ + Return the 'term' and positions or nearest following term and positions. + """ + + if self.last_term == term: + return self.last_term, self.last_positions + # Get the record providing a term less than or equal to the requested # term, getting the first entry if no such records exist. - i = max(0, bisect_right(self.records, (term, None)) - 1) - t, offset = self.records[i] + after = bisect_right(self.terms, term) + before = max(0, after - 1) + + t, offset = self.records[before] + terms_after = self.terms[after:] # Seek to the corresponding record in the information file. + # Only do this if the term is more quickly reached by seeking. - self.reader.seek(offset) + if term <= t or self.last_term is None or term <= self.last_term or \ + self.last_term < t or terms_after and terms_after[0] <= self.last_term: + + self.reader.seek(offset) # Where the found term is equal or greater, just read the positions for # the index entry. @@ -426,7 +446,8 @@ return self def next(self): - return self.reader.next() + self.last_term, self.last_positions = t = self.reader.next() + return t def close(self): if self.reader is not None: