# HG changeset patch
# User Paul Boddie
# Date 1251324170 -7200
# Node ID 736ff8a2e02297fa984d788263e5eaaf4f5cf644
# Parent 247345e7c82fbd8513674fe492fbc4645fe9a604
Added an index writer which collects term positions and then writes them out using a dictionary writer.

diff -r 247345e7c82f -r 736ff8a2e022 iixr.py
--- a/iixr.py	Wed Aug 26 23:36:02 2009 +0200
+++ b/iixr.py	Thu Aug 27 00:02:50 2009 +0200
@@ -413,7 +413,7 @@
         self.max_offset = self.terms[-1][1]
         self.max_info_offset = self.terms[-1][2]
 
-    def find(self, term):
+    def find_term(self, term):
 
         "Find the position file offset of 'term' from the term dictionary."
 
@@ -453,7 +453,7 @@
 
         "Return the documents and positions at which the given 'term' is found."
 
-        offset = self.find(term)
+        offset = self.find_term(term)
         if offset is None:
             return None
         else:
@@ -464,4 +464,56 @@
         self.index_reader.close()
         self.position_reader.close()
 
+class IndexWriter:
+
+    """
+    Building term information and writing it to the term dictionary.
+
+    Positions are accumulated in memory as a mapping from each term to a
+    mapping from document numbers to lists of positions. Nothing is passed to
+    the dictionary writer until 'close' is called.
+    """
+
+    def __init__(self, dict_writer):
+
+        """
+        Initialise the writer with the given 'dict_writer', which receives the
+        collected term information when this writer is closed.
+        """
+
+        self.dict_writer = dict_writer
+
+        # Mapping of terms to mappings of document numbers to position lists.
+
+        self.terms = {}
+
+    def add_position(self, term, docnum, position):
+
+        """
+        Add a position entry for the given 'term' in the document with the given
+        'docnum', indicating the given 'position'.
+        """
+
+        # Create the per-term and per-document containers on first use.
+
+        doc_positions = self.terms.setdefault(term, {})
+        doc_positions.setdefault(docnum, []).append(position)
+
+    def close(self):
+
+        """
+        Write the collected terms in sorted order to the dictionary writer,
+        each with its documents in sorted document number order, and then close
+        the dictionary writer.
+        """
+
+        # Positions within each document are written in the order in which they
+        # were added; they are not sorted here.
+
+        for term, doc_positions in sorted(self.terms.items()):
+            self.dict_writer.write_term_positions(
+                term, sorted(doc_positions.items()))
+
+        self.dict_writer.close()
+
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 247345e7c82f -r 736ff8a2e022 test.py
--- a/test.py	Wed Aug 26 23:36:02 2009 +0200
+++ b/test.py	Thu Aug 27 00:02:50 2009 +0200
@@ -132,10 +132,10 @@
 terms_reversed = terms[:]
 terms_reversed.reverse()
 for term, offset in terms_reversed:
-    o = rd.find(term)
+    o = rd.find_term(term)
     print offset == o, offset, o
 for term in ("dog", "dingo"):
-    o = rd.find(term)
+    o = rd.find_term(term)
     print o is None, o
 rd.close()
 
@@ -176,4 +176,45 @@
     print dp is None, dp
 rd.close()
 
+docs = [
+    (1, "The cat sat on the mat"),
+    (2, "Every good boy deserves football"),
+    (13, "One good turn deserves another"),
+    (14, "Every man for himself"),
+    (25, "Red sky at night shepherd's delight"),
+    (36, "She sells sea shells on the sea shore")
+    ]
+
+doc_tests = [
+    ("Every", [(2, [0]), (14, [0])]),
+    ("good", [(2, [1]), (13, [1])]),
+    ("deserves", [(2, [3]), (13, [3])]),
+    ("sea", [(36, [2, 6])])
+    ]
+
+f = open("test", "wb")
+w = iixr.TermWriter(f)
+f2 = open("testI", "wb")
+w2 = iixr.TermIndexWriter(f2)
+f3 = open("testP", "wb")
+w3 = iixr.PositionWriter(f3)
+wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
+wi = iixr.IndexWriter(wd)
+for docnum, text in docs:
+    for position, term in enumerate(text.split()):
+        wi.add_position(term, docnum, position)
+wi.close()
+
+f = open("test", "rb")
+r = iixr.TermReader(f)
+f2 = open("testI", "rb")
+r2 = iixr.TermIndexReader(f2)
+f3 = open("testP", "rb")
+r3 = iixr.PositionReader(f3)
+rd = iixr.TermDictionaryReader(r, r2, r3)
+for term, doc_positions in doc_tests:
+    dp = rd.find_positions(term)
+    print doc_positions == dp, doc_positions, dp
+rd.close()
+
 # vim: tabstop=4 expandtab shiftwidth=4