# HG changeset patch # User Paul Boddie # Date 1251847842 -7200 # Node ID 1cf3b82959f3d8e3e764a18691ab40f8a3504df2 # Parent 1e7ca36202ef5bbf0ac39ad19863eeb4ac14710e Attempted to introduce position dictionaries with extra term record details providing document frequency information. Attempted to introduce file descriptor duplication in order to support concurrent iterators. diff -r 1e7ca36202ef -r 1cf3b82959f3 iixr.py --- a/iixr.py Mon Aug 31 21:02:30 2009 +0200 +++ b/iixr.py Wed Sep 02 01:30:42 2009 +0200 @@ -18,6 +18,7 @@ with this program. If not, see . """ +from os import dup, fdopen # independent iterator access to files from os import listdir, mkdir # index and partition discovery from os import remove, rename # partition manipulation from os.path import exists, join @@ -194,11 +195,18 @@ def write_positions(self, docnum, positions): - "Write for the document 'docnum' the given 'positions'." + """ + Write for the document 'docnum' the given 'positions'. + Return the offset of the written record. + """ if docnum < self.last_docnum: raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) + # Record the offset of this record. + + offset = self.f.tell() + # Write the document number delta. self.write_number(docnum - self.last_docnum) @@ -221,34 +229,7 @@ self.last_docnum = docnum - def write_term_positions(self, doc_positions): - - """ - Write all 'doc_positions' - a collection of tuples of the form (document - number, position list) - to the file, returning a tuple containing the - offset at which they were stored together with the frequency (number of - positions) for the term involved. - """ - - # Reset the writer and record the current file offset. - - self.reset() - offset = self.f.tell() - - # Write the number of documents. - - self.write_number(len(doc_positions)) - doc_positions.sort() - - # Write the positions. 
- - frequency = 0 - - for docnum, positions in doc_positions: - self.write_positions(docnum, positions) - frequency += len(positions) - - return offset, frequency + return offset class PositionReader(FileReader): @@ -283,54 +264,295 @@ return self.last_docnum, positions - def read_term_positions(self, offset): + def read_term_positions(self, offset, count): """ Read all positions from 'offset', seeking to that position in the file - before reading. + before reading. The number of documents available for reading is limited + to 'count'. """ self.reset() - self.f.seek(offset) + + # Duplicate the file handle. + + f = fdopen(dup(self.f.fileno()), "rb") + f.seek(offset) + return PositionIterator(f, count) + +class IteratorBase: + + def __init__(self, count): + self.replenish(count) - # Could duplicate the file handle using... - # fdopen(dup(self.f.fileno()), "rb") + def replenish(self, count): + self.count = count + self.read_documents = 0 + + def __len__(self): + return self.count - return PositionIterator(self.f) + def sort(self): + pass # Stored document positions are already sorted. -class PositionIterator(PositionReader): + def __iter__(self): + return self + +class PositionIterator(PositionReader, IteratorBase): "Iterating over document positions." - def __init__(self, f): + def __init__(self, f, count): PositionReader.__init__(self, f) + IteratorBase.__init__(self, count) + + def next(self): + + "Read positions for a single document." + + if self.read_documents < self.count: + self.read_documents += 1 + return self.read_positions() + else: + raise StopIteration + +class PositionIndexWriter(FileWriter): + + "Writing position index information to files." + + def reset(self): + self.last_docnum = 0 + self.last_pos_offset = 0 + + def write_positions(self, docnum, pos_offset, count): + + """ + Write the given 'docnum', 'pos_offset' and document 'count' to the + position index file. + """ + + # Record the offset of this record. 
+ + offset = self.f.tell() + + # Write the document number delta. + + self.write_number(docnum - self.last_docnum) + self.last_docnum = docnum + + # Write the position file offset delta. + + self.write_number(pos_offset - self.last_pos_offset) + self.last_pos_offset = pos_offset + + # Write the document count. + + self.write_number(count) + + return offset + +class PositionIndexReader(FileReader): + + "Reading position index information from files." - # Read the number of documents. + def reset(self): + self.last_docnum = 0 + self.last_pos_offset = 0 + + def read_positions(self): + + """ + Read a document number, a position file offset for the position index + file, and the number of documents in a section of that file. + """ + + # Read the document number delta. + + self.last_docnum += self.read_number() + + # Read the offset delta. + + self.last_pos_offset += self.read_number() + + # Read the document count. + + count = self.read_number() + + return self.last_docnum, self.last_pos_offset, count + + def read_term_positions(self, offset, doc_frequency): - self.ndocuments = self.read_number() - self.read_documents = 0 + """ + Read all positions from 'offset', seeking to that position in the file + before reading. The number of documents available for reading is limited + to 'doc_frequency'. + """ + + # NOTE: This is almost a duplication of PositionReader.read_term_positions. + + self.reset() + + # Duplicate the file handle. + + f = fdopen(dup(self.f.fileno()), "rb") + f.seek(offset) + return PositionIndexIterator(f, doc_frequency) + +class PositionIndexIterator(PositionIndexReader, IteratorBase): + + "Iterating over document positions." + + def __init__(self, f, count): + PositionIndexReader.__init__(self, f) + IteratorBase.__init__(self, count) + self.section_count = 0 + + def next(self): + + "Read positions for a single document." 
- def __len__(self): - return self.ndocuments + self.read_documents += self.section_count + if self.read_documents < self.count: + docnum, pos_offset, self.section_count = t = self.read_positions() + return t + else: + raise StopIteration + +class PositionDictionaryWriter: + + "Writing position dictionaries." + + def __init__(self, position_writer, position_index_writer, interval): + self.position_writer = position_writer + self.position_index_writer = position_index_writer + self.interval = interval + + def write_term_positions(self, doc_positions): + + """ + Write all 'doc_positions' - a collection of tuples of the form (document + number, position list) - to the file. + + Add some records to the index, making dictionary entries. + + Return a tuple containing the offset of the written data, the frequency + (number of positions), and document frequency (number of documents) for + the term involved. + """ + + # Reset the writer. + + self.position_writer.reset() + index_offset = None + + # Write the positions. + + frequency = 0 + first_offset = None + count = 0 + + doc_positions.sort() + + for docnum, positions in doc_positions: + pos_offset = self.position_writer.write_positions(docnum, positions) + + # Retain the first record offset for a subsequent index entry. + + if first_offset is None: + first_offset = pos_offset + + frequency += len(positions) + + # Every {interval} entries, write an index entry. + + if count == self.interval: + io = self.position_index_writer.write_positions(docnum, first_offset, self.interval) - def sort(self): + # Remember the first index entry offset. + + if index_offset is None: + index_offset = io + + first_offset = None + count = 0 + + count += 1 + + # Finish writing an index entry for the remaining documents. + + else: + if first_offset is not None: + io = self.position_index_writer.write_positions(docnum, first_offset, count) + + # Remember the first index entry offset. 
+ + if index_offset is None: + index_offset = io + + return index_offset, frequency, len(doc_positions) + + def close(self): + self.position_writer.close() + self.position_index_writer.close() + +class PositionDictionaryReader: - "Stored document positions are already sorted." + "Reading position dictionaries." + + def __init__(self, position_reader, position_index_reader): + self.position_reader = position_reader + self.position_index_reader = position_index_reader + + def read_term_positions(self, offset, doc_frequency): + + """ + Return an iterator for dictionary entries starting at 'offset' with the + given 'doc_frequency'. + """ - pass + return PositionDictionaryIterator(self.position_reader, + self.position_index_reader, offset, doc_frequency) + + def close(self): + self.position_reader.close() + self.position_index_reader.close() + +class PositionDictionaryIterator: + + "Iteration over position dictionary entries." + + def __init__(self, position_reader, position_index_reader, offset, doc_frequency): + self.position_reader = position_reader + + self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency) + self.next_section() + self.init_section() def __iter__(self): return self def next(self): - "Read positions for a single document." + # Attempt to get the next document record from the section in the positions file. + + while 1: + + # Either return the next record. + + try: + return self.iterator.next() - if self.read_documents < self.ndocuments: - self.read_documents += 1 - return self.read_positions() - else: - raise StopIteration + # Or, where a section is finished, get the next section and try again. 
+ + except StopIteration: + self.next_section() + self.iterator.replenish(self.section_count) + + def next_section(self): + self.docnum, self.pos_offset, self.section_count = self.index_iterator.read_positions() + + def init_section(self): + self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count) class TermWriter(FileWriter): @@ -340,12 +562,13 @@ self.last_term = "" self.last_offset = 0 - def write_term(self, term, offset, frequency): + def write_term(self, term, offset, frequency, doc_frequency): """ - Write the given 'term', its position file 'offset', and its 'frequency' - to the term information file. Return the offset after the term - information was written to the file. + Write the given 'term', its position file 'offset', its 'frequency' and + its 'doc_frequency' (number of documents in which it appears) to the + term information file. Return the offset after the term information was + written to the file. """ # Write the prefix length and term suffix. @@ -364,6 +587,10 @@ self.write_number(frequency) + # Write the document frequency. + + self.write_number(doc_frequency) + self.last_term = term self.last_offset = offset @@ -380,8 +607,8 @@ def read_term(self): """ - Read a term, its position file offset, and its frequency from the term - information file. + Read a term, its position file offset, its frequency and its document + frequency from the term information file. """ # Read the prefix length and term suffix. @@ -399,7 +626,11 @@ frequency = self.read_number() - return self.last_term, self.last_offset, frequency + # Read the document frequency. 
+ + doc_frequency = self.read_number() + + return self.last_term, self.last_offset, frequency, doc_frequency def go_to_term(self, term, offset, info_offset): @@ -420,15 +651,15 @@ TermWriter.reset(self) self.last_info_offset = 0 - def write_term(self, term, offset, frequency, info_offset): + def write_term(self, term, offset, frequency, doc_frequency, info_offset): """ - Write the given 'term', its position file 'offset', and its 'frequency' - to the term dictionary index file, along with the 'info_offset' in the - term information file. + Write the given 'term', its position file 'offset', its 'frequency' and + its 'doc_frequency' to the term dictionary index file, along with the + 'info_offset' in the term information file. """ - TermWriter.write_term(self, term, offset, frequency) + TermWriter.write_term(self, term, offset, frequency, doc_frequency) # Write the information file offset delta. @@ -446,41 +677,43 @@ def read_term(self): """ - Read a term, its position file offset, its frequency, and its term - information file offset from the term dictionary index file. + Read a term, its position file offset, its frequency, its document + frequency and a term information file offset from the term dictionary + index file. """ - term, offset, frequency = TermReader.read_term(self) + term, offset, frequency, doc_frequency = TermReader.read_term(self) # Read the offset delta. self.last_info_offset += self.read_number() - return term, offset, frequency, self.last_info_offset + return term, offset, frequency, doc_frequency, self.last_info_offset class TermDictionaryWriter: "Writing term dictionaries." 
- def __init__(self, info_writer, index_writer, position_writer, interval): + def __init__(self, info_writer, index_writer, position_dict_writer, interval): self.info_writer = info_writer self.index_writer = index_writer - self.position_writer = position_writer + self.position_dict_writer = position_dict_writer self.interval = interval self.entry = 0 - def _write_term(self, term, offset, frequency): + def _write_term(self, term, offset, frequency, doc_frequency): """ - Write the given 'term', its position file 'offset', and its 'frequency' - to the term information file and optionally to the index, making a - dictionary entry. + Write the given 'term', its position file 'offset', its 'frequency' and + its 'doc_frequency' (number of documents in which it appears) to the + term information file. Return the offset after the term information was + written to the file. """ - info_offset = self.info_writer.write_term(term, offset, frequency) + info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency) if self.entry % self.interval == 0: - self.index_writer.write_term(term, offset, frequency, info_offset) + self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) self.entry += 1 @@ -491,13 +724,13 @@ and positions at which the term is found. """ - offset, frequency = self.position_writer.write_term_positions(doc_positions) - self._write_term(term, offset, frequency) + offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions) + self._write_term(term, offset, frequency, doc_frequency) def close(self): self.info_writer.close() self.index_writer.close() - self.position_writer.close() + self.position_dict_writer.close() class TermDictionaryReader: @@ -533,12 +766,13 @@ if i == -1: return None - found_term, offset, frequency, info_offset = self.terms[i] + found_term, offset, frequency, doc_frequency, info_offset = self.terms[i] - # Where the term is found immediately, return the offset. 
+ # Where the term is found immediately, return the offset and + # frequencies. if term == found_term: - return offset, frequency + return offset, frequency, doc_frequency # Otherwise, seek past the index term's entry in the information file # and scan for the desired term. @@ -547,33 +781,33 @@ self.info_reader.go_to_term(found_term, offset, info_offset) try: while term > found_term: - found_term, offset, frequency = self.info_reader.read_term() + found_term, offset, frequency, doc_frequency = self.info_reader.read_term() except EOFError: pass - # If the term is found, return the offset and frequency. + # If the term is found, return the offset and frequencies. if term == found_term: - return offset, frequency + return offset, frequency, doc_frequency else: return None def rewind(self): self.info_reader.rewind() - def _get_positions(self, offset): - return self.position_reader.read_term_positions(offset) + def _get_positions(self, offset, doc_frequency): + return self.position_reader.read_term_positions(offset, doc_frequency) def read_term(self): """ - Return the next term, its frequency and the documents and positions at - which the term is found. + Return the next term, its frequency, its document frequency, and the + documents and positions at which the term is found. 
""" - term, offset, frequency = self.info_reader.read_term() - positions = self._get_positions(offset) - return term, frequency, positions + term, offset, frequency, doc_frequency = self.info_reader.read_term() + positions = self._get_positions(offset, doc_frequency) + return term, frequency, doc_frequency, positions def find_positions(self, term): @@ -583,8 +817,8 @@ if t is None: return None else: - offset, frequency = t - return self._get_positions(offset) + offset, frequency, doc_frequency = t + return self._get_positions(offset, doc_frequency) def get_frequency(self, term): @@ -594,9 +828,20 @@ if t is None: return None else: - offset, frequency = t + offset, frequency, doc_frequency = t return frequency + def get_document_frequency(self, term): + + "Return the document frequency of the given 'term'." + + t = self._find_term(term) + if t is None: + return None + else: + offset, frequency, doc_frequency = t + return doc_frequency + def close(self): self.info_reader.close() self.index_reader.close() @@ -850,7 +1095,7 @@ reader.rewind() try: - term, frequency, positions = reader.read_term() + term, frequency, doc_frequency, positions = reader.read_term() insort_right(entries, (term, positions, partition)) except EOFError: pass @@ -889,7 +1134,7 @@ for partition in to_update: try: - term, frequency, positions = self.readers[partition].read_term() + term, frequency, doc_frequency, positions = self.readers[partition].read_term() insort_right(entries, (term, positions, partition)) except EOFError: pass @@ -975,12 +1220,12 @@ # Utility functions. -def get_term_writer(pathname, partition, interval): +def get_term_writer(pathname, partition, interval, doc_interval): """ Return a term dictionary writer using files under the given 'pathname' labelled according to the given 'partition', using the given indexing - 'interval'. + 'interval' for terms and 'doc_interval' for document position records. 
""" tdf = open(join(pathname, "terms-%s" % partition), "wb") @@ -992,7 +1237,12 @@ tpf = open(join(pathname, "positions-%s" % partition), "wb") positions_writer = PositionWriter(tpf) - return TermDictionaryWriter(info_writer, index_writer, positions_writer, interval) + tpif = open(join(pathname, "positions_index-%s" % partition), "wb") + positions_index_writer = PositionIndexWriter(tpif) + + positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval) + + return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval) def get_field_writer(pathname, partition, interval): @@ -1026,7 +1276,12 @@ tpf = open(join(pathname, "positions-%s" % partition), "rb") positions_reader = PositionReader(tpf) - return TermDictionaryReader(info_reader, index_reader, positions_reader) + tpif = open(join(pathname, "positions_index-%s" % partition), "rb") + positions_index_reader = PositionIndexReader(tpif) + + positions_dict_reader = PositionDictionaryReader(positions_reader, positions_index_reader) + + return TermDictionaryReader(info_reader, index_reader, positions_dict_reader) def get_field_reader(pathname, partition): diff -r 1e7ca36202ef -r 1cf3b82959f3 test.py --- a/test.py Mon Aug 31 21:02:30 2009 +0200 +++ b/test.py Wed Sep 02 01:30:42 2009 +0200 @@ -38,15 +38,18 @@ all_doc_positions = [ [ (123, [1, 3, 5, 15, 25]), - (124, [0, 100]) + (124, [0, 100]), + (125, [11, 99, 199]), + (130, [77, 78, 80, 82, 89]) ], [ (78, [9]), - (196, [10, 11]) + (196, [10, 11]), + (197, [17, 21, 30]) ] ] -f = open("test", "wb") +f = open("testP", "wb") w = iixr.PositionWriter(f) for doc_positions in all_doc_positions: for docnum, positions in doc_positions: @@ -54,7 +57,7 @@ w.reset() w.close() -f = open("test", "rb") +f = open("testP", "rb") r = iixr.PositionReader(f) for doc_positions in all_doc_positions: for docnum, positions in doc_positions: @@ -64,20 +67,68 @@ r.reset() r.close() -f = open("test", "wb") +# Test position index 
files. + +indexed_positions = [ + [ + (1234, 0, 100), + (2345, 700, 100), + (3456, 1900, 50) + ], + [ + (4567, 2800, 20) + ] + ] + +offsets = [] +f = open("testPI", "wb") +w = iixr.PositionIndexWriter(f) +for term_positions in indexed_positions: + offset = None + doc_frequency = 0 + w.reset() + for docnum, pos_offset, count in term_positions: + io = w.write_positions(docnum, pos_offset, count) + if offset is None: + offset = io + doc_frequency += count + offsets.append((offset, doc_frequency)) +w.close() + +f = open("testPI", "rb") +r = iixr.PositionIndexReader(f) +offsets.reverse() +indexed_positions.reverse() +for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions): + found_positions = r.read_term_positions(offset, doc_frequency) + for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions): + print docnum == dn, docnum, dn + print pos_offset == po, pos_offset, po + print count == c, count, c +r.close() + +# Test position dictionaries. + +f = open("testP", "wb") w = iixr.PositionWriter(f) +f2 = open("testPI", "wb") +w2 = iixr.PositionIndexWriter(f2) +wd = iixr.PositionDictionaryWriter(w, w2, 2) offsets = [] for doc_positions in all_doc_positions: - offset, frequency = w.write_term_positions(doc_positions) - offsets.append(offset) + offset, frequency, doc_frequency = wd.write_term_positions(doc_positions) + offsets.append((offset, doc_frequency)) w.close() -f = open("test", "rb") +f = open("testP", "rb") r = iixr.PositionReader(f) +f2 = open("testPI", "rb") +r2 = iixr.PositionIndexReader(f2) +rd = iixr.PositionDictionaryReader(r, r2) offsets.reverse() all_doc_positions.reverse() -for offset, doc_positions in zip(offsets, all_doc_positions): - dp = list(r.read_term_positions(offset)) +for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions): + dp = list(rd.read_term_positions(offset, doc_frequency)) print doc_positions == dp, doc_positions, dp r.close() @@ -166,55 +217,57 @@ # Test terms. 
terms = [ - # term offset frequency - ("aardvark", 100000123, 1), - ("anteater", 100000456, 2), - ("badger", 100000789, 13), - ("bull", 1000001234, 59), - ("bulldog", 1000002345, 99), - ("cat", 1000003456, 89) + # term offset frequency doc_frequency + ("aardvark", 100000123, 1, 1), + ("anteater", 100000456, 2, 1), + ("badger", 100000789, 13, 7), + ("bull", 1000001234, 59, 17), + ("bulldog", 1000002345, 99, 80), + ("cat", 1000003456, 89, 28) ] f = open("test", "wb") w = iixr.TermWriter(f) -for term, offset, frequency in terms: - w.write_term(term, offset, frequency) +for term, offset, frequency, doc_frequency in terms: + w.write_term(term, offset, frequency, doc_frequency) w.close() f = open("test", "rb") r = iixr.TermReader(f) -for term, offset, frequency in terms: - t, o, fr = r.read_term() +for term, offset, frequency, doc_frequency in terms: + t, o, fr, df = r.read_term() print term == t, term, t print offset == o, offset, o print frequency == fr, frequency, fr + print doc_frequency == df, doc_frequency, df r.close() # Test terms in index files. 
indexed_terms = [ - # term offset frequency info_offset - ("aardvark", 100000123, 1, 200000321), - ("anteater", 100000456, 2, 200000654), - ("badger", 100000789, 13, 200000987), - ("bull", 1000001234, 59, 200004321), - ("bulldog", 1000002345, 99, 200005432), - ("cat", 1000003456, 89, 200006543) + # term offset frequency doc_frequency info_offset + ("aardvark", 100000123, 1, 1, 200000321), + ("anteater", 100000456, 2, 1, 200000654), + ("badger", 100000789, 13, 7, 200000987), + ("bull", 1000001234, 59, 17, 200004321), + ("bulldog", 1000002345, 99, 80, 200005432), + ("cat", 1000003456, 89, 28, 200006543) ] f = open("test", "wb") w = iixr.TermIndexWriter(f) -for term, offset, frequency, info_offset in indexed_terms: - w.write_term(term, offset, frequency, info_offset) +for term, offset, frequency, doc_frequency, info_offset in indexed_terms: + w.write_term(term, offset, frequency, doc_frequency, info_offset) w.close() f = open("test", "rb") r = iixr.TermIndexReader(f) -for term, offset, frequency, info_offset in indexed_terms: - t, o, fr, i = r.read_term() +for term, offset, frequency, doc_frequency, info_offset in indexed_terms: + t, o, fr, df, i = r.read_term() print term == t, term, t print offset == o, offset, o print frequency == fr, frequency, fr + print doc_frequency == df, doc_frequency, df print info_offset == i, info_offset, i r.close() @@ -224,26 +277,23 @@ w = iixr.TermWriter(f) f2 = open("testI", "wb") w2 = iixr.TermIndexWriter(f2) -f3 = open("testP", "wb") -w3 = iixr.PositionWriter(f3) -wd = iixr.TermDictionaryWriter(w, w2, w3, 3) -for term, offset, frequency in terms: - wd._write_term(term, offset, frequency) +wd = iixr.TermDictionaryWriter(w, w2, None, 3) +for term, offset, frequency, doc_frequency in terms: + wd._write_term(term, offset, frequency, doc_frequency) wd.close() f = open("test", "rb") r = iixr.TermReader(f) f2 = open("testI", "rb") r2 = iixr.TermIndexReader(f2) -f3 = open("testP", "rb") -r3 = iixr.PositionReader(f3) -rd = 
iixr.TermDictionaryReader(r, r2, r3) +rd = iixr.TermDictionaryReader(r, r2, None) terms_reversed = terms[:] terms_reversed.reverse() -for term, offset, frequency in terms_reversed: - o, fr = rd._find_term(term) +for term, offset, frequency, doc_frequency in terms_reversed: + o, fr, df = rd._find_term(term) print offset == o, offset, o print frequency == fr, frequency, fr + print doc_frequency == df, doc_frequency, df for term in ("dog", "dingo"): t = rd._find_term(term) print t is None, t @@ -255,7 +305,7 @@ ("aardvark", [(1, [2, 45, 96]), (20, [13])]), ("anteater", [(1, [43, 44])]), ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), - ("bull", [(6, [128]), (16, [12])]), + ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]), ("bulldog", [(43, [17, 19, 256, 512])]), ("cat", [(123, [12, 145, 196]), (1200, [113])]) ] @@ -266,7 +316,10 @@ w2 = iixr.TermIndexWriter(f2) f3 = open("testP", "wb") w3 = iixr.PositionWriter(f3) -wd = iixr.TermDictionaryWriter(w, w2, w3, 3) +f4 = open("testPI", "wb") +w4 = iixr.PositionIndexWriter(f4) +wp = iixr.PositionDictionaryWriter(r3, r4, 2) +wd = iixr.TermDictionaryWriter(w, w2, wp, 3) for term, doc_positions in terms_with_positions: wd.write_term_positions(term, doc_positions) wd.close() @@ -277,7 +330,10 @@ r2 = iixr.TermIndexReader(f2) f3 = open("testP", "rb") r3 = iixr.PositionReader(f3) -rd = iixr.TermDictionaryReader(r, r2, r3) +f4 = open("testPI", "rb") +r4 = iixr.PositionIndexReader(f4) +rp = iixr.PositionDictionaryReader(r3, r4) +rd = iixr.TermDictionaryReader(r, r2, rp) terms_reversed = terms_with_positions[:] terms_reversed.reverse() for term, doc_positions in terms_reversed: @@ -291,7 +347,7 @@ rd.rewind() for term, doc_positions in terms_with_positions: - t, fr, dp = rd.read_term() + t, fr, df, dp = rd.read_term() dp = list(dp) print term == t, term, t print doc_positions == dp, doc_positions, dp