# HG changeset patch # User Paul Boddie # Date 1251576745 -7200 # Node ID 3d86f5cb01c10b27660683dff34a45a88af6a287 # Parent b81c00a48c4984c230312085b7234684f520fb98 Added term frequency information to the term dictionary. diff -r b81c00a48c49 -r 3d86f5cb01c1 iixr.py --- a/iixr.py Sat Aug 29 21:15:47 2009 +0200 +++ b/iixr.py Sat Aug 29 22:12:25 2009 +0200 @@ -215,8 +215,9 @@ """ Write all 'doc_positions' - a collection of tuples of the form (document - number, position list) - to the file, returning the offset at which they - were stored. + number, position list) - to the file, returning a tuple containing the + offset at which they were stored together with the frequency (number of + positions) for the term involved. """ # Reset the writer and record the current file offset. @@ -230,10 +231,13 @@ # Write the positions. + frequency = 0 + for docnum, positions in doc_positions: self.write_positions(docnum, positions) + frequency += len(positions) - return offset + return offset, frequency class PositionReader(FileReader): @@ -301,12 +305,12 @@ self.last_term = "" self.last_offset = 0 - def write_term(self, term, offset): + def write_term(self, term, offset, frequency): """ - Write the given 'term' and its position file 'offset' to the term - information file. Return the offset after the term information was - written to the file. + Write the given 'term', its position file 'offset', and its 'frequency' + to the term information file. Return the offset after the term + information was written to the file. """ # Too long terms are not currently supported. @@ -326,6 +330,10 @@ self.write_number(offset - self.last_offset) + # Write the frequency. + + self.write_number(frequency) + self.last_term = term self.last_offset = offset @@ -342,7 +350,8 @@ def read_term(self): """ - Read a term and its position file offset from the term information file. + Read a term, its position file offset, and its frequency from the term + information file. """ # Read the prefix length and term suffix. @@ -356,7 +365,11 @@ self.last_offset += self.read_number() - return self.last_term, self.last_offset + # Read the frequency. + + frequency = self.read_number() + + return self.last_term, self.last_offset, frequency def go_to_term(self, term, offset, info_offset): @@ -377,15 +390,15 @@ TermWriter.reset(self) self.last_info_offset = 0 - def write_term(self, term, offset, info_offset): + def write_term(self, term, offset, frequency, info_offset): """ - Write the given 'term' and its position file 'offset' to the term - dictionary index file, along with the 'info_offset' in the term - information file. + Write the given 'term', its position file 'offset', and its 'frequency' + to the term dictionary index file, along with the 'info_offset' in the + term information file. """ - TermWriter.write_term(self, term, offset) + TermWriter.write_term(self, term, offset, frequency) # Write the information file offset delta. @@ -403,17 +416,17 @@ def read_term(self): """ - Read a term, its position file offset, and its term information file - offset from the term dictionary index file. + Read a term, its position file offset, its frequency, and its term + information file offset from the term dictionary index file. """ - term, offset = TermReader.read_term(self) + term, offset, frequency = TermReader.read_term(self) # Read the offset delta. self.last_info_offset += self.read_number() - return term, offset, self.last_info_offset + return term, offset, frequency, self.last_info_offset class TermDictionaryWriter: @@ -426,17 +439,18 @@ self.interval = interval self.entry = 0 - def _write_term(self, term, offset): + def _write_term(self, term, offset, frequency): """ - Write the given 'term' and its position file 'offset' to the term - information file and optionally to the index, making a dictionary entry. + Write the given 'term', its position file 'offset', and its 'frequency' + to the term information file and optionally to the index, making a + dictionary entry. """ - info_offset = self.info_writer.write_term(term, offset) + info_offset = self.info_writer.write_term(term, offset, frequency) if self.entry % self.interval == 0: - self.index_writer.write_term(term, offset, info_offset) + self.index_writer.write_term(term, offset, frequency, info_offset) self.entry += 1 @@ -447,8 +461,8 @@ and positions at which the term is found. """ - offset = self.position_writer.write_all_positions(doc_positions) - self._write_term(term, offset) + offset, frequency = self.position_writer.write_all_positions(doc_positions) + self._write_term(term, offset, frequency) def close(self): self.info_writer.close() @@ -478,7 +492,10 @@ def _find_term(self, term): - "Find the position file offset of 'term' from the term dictionary." + """ + Find the position file offset and frequency of 'term' from the term + dictionary. + """ i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1 @@ -487,12 +504,12 @@ if i == -1: return None - found_term, offset, info_offset = self.terms[i] + found_term, offset, frequency, info_offset = self.terms[i] # Where the term is found immediately, return the offset. if term == found_term: - return offset + return offset, frequency # Otherwise, seek past the index term's entry in the information file # and scan for the desired term. @@ -501,14 +518,14 @@ self.info_reader.go_to_term(found_term, offset, info_offset) try: while term > found_term: - found_term, offset = self.info_reader.read_term() + found_term, offset, frequency = self.info_reader.read_term() except EOFError: pass - # If the term is found, return the offset. + # If the term is found, return the offset and frequency. if term == found_term: - return offset + return offset, frequency else: return None @@ -516,12 +533,24 @@ "Return the documents and positions at which the given 'term' is found." - offset = self._find_term(term) - if offset is None: + t = self._find_term(term) + if t is None: return None else: + offset, frequency = t return self.position_reader.read_all_positions(offset) + def get_frequency(self, term): + + "Return the frequency of the given 'term'." + + t = self._find_term(term) + if t is None: + return None + else: + offset, frequency = t + return frequency + def close(self): self.info_reader.close() self.index_reader.close() @@ -810,6 +839,9 @@ def find_positions(self, term): return self.dict_reader.find_positions(term) + def get_frequency(self, term): + return self.dict_reader.get_frequency(term) + def get_fields(self, docnum): return self.field_dict_reader.read_fields(docnum) diff -r b81c00a48c49 -r 3d86f5cb01c1 test.py --- a/test.py Sat Aug 29 21:15:47 2009 +0200 +++ b/test.py Sat Aug 29 22:12:25 2009 +0200 @@ -54,9 +54,8 @@ w = iixr.PositionWriter(f) offsets = [] for doc_positions in all_doc_positions: - offsets.append( - w.write_all_positions(doc_positions) - ) + offset, frequency = w.write_all_positions(doc_positions) + offsets.append(offset) w.close() f = open("test", "rb") @@ -145,51 +144,55 @@ # Test terms. terms = [ - ("aardvark", 100000123), - ("anteater", 100000456), - ("badger", 100000789), - ("bull", 1000001234), - ("bulldog", 1000002345), - ("cat", 1000003456) + # term offset frequency + ("aardvark", 100000123, 1), + ("anteater", 100000456, 2), + ("badger", 100000789, 13), + ("bull", 1000001234, 59), + ("bulldog", 1000002345, 99), + ("cat", 1000003456, 89) ] f = open("test", "wb") w = iixr.TermWriter(f) -for term, offset in terms: - w.write_term(term, offset) +for term, offset, frequency in terms: + w.write_term(term, offset, frequency) w.close() f = open("test", "rb") r = iixr.TermReader(f) -for term, offset in terms: - t, o = r.read_term() +for term, offset, frequency in terms: + t, o, fr = r.read_term() print term == t, term, t print offset == o, offset, o + print frequency == fr, frequency, fr r.close() # Test terms in index files. indexed_terms = [ - ("aardvark", 100000123, 200000321), - ("anteater", 100000456, 200000654), - ("badger", 100000789, 200000987), - ("bull", 1000001234, 200004321), - ("bulldog", 1000002345, 200005432), - ("cat", 1000003456, 200006543) + # term offset frequency info_offset + ("aardvark", 100000123, 1, 200000321), + ("anteater", 100000456, 2, 200000654), + ("badger", 100000789, 13, 200000987), + ("bull", 1000001234, 59, 200004321), + ("bulldog", 1000002345, 99, 200005432), + ("cat", 1000003456, 89, 200006543) ] f = open("test", "wb") w = iixr.TermIndexWriter(f) -for term, offset, info_offset in indexed_terms: - w.write_term(term, offset, info_offset) +for term, offset, frequency, info_offset in indexed_terms: + w.write_term(term, offset, frequency, info_offset) w.close() f = open("test", "rb") r = iixr.TermIndexReader(f) -for term, offset, info_offset in indexed_terms: - t, o, i = r.read_term() +for term, offset, frequency, info_offset in indexed_terms: + t, o, fr, i = r.read_term() print term == t, term, t print offset == o, offset, o + print frequency == fr, frequency, fr print info_offset == i, info_offset, i r.close() @@ -202,8 +205,8 @@ f3 = open("testP", "wb") w3 = iixr.PositionWriter(f3) wd = iixr.TermDictionaryWriter(w, w2, w3, 3) -for term, offset in terms: - wd._write_term(term, offset) +for term, offset, frequency in terms: + wd._write_term(term, offset, frequency) wd.close() f = open("test", "rb") @@ -215,12 +218,13 @@ rd = iixr.TermDictionaryReader(r, r2, r3) terms_reversed = terms[:] terms_reversed.reverse() -for term, offset in terms_reversed: - o = rd._find_term(term) +for term, offset, frequency in terms_reversed: + o, fr = rd._find_term(term) print offset == o, offset, o + print frequency == fr, frequency, fr for term in ("dog", "dingo"): - o = rd._find_term(term) - print o is None, o + t = rd._find_term(term) + print t is None, t rd.close() # Test dictionaries with term and position data. @@ -274,10 +278,10 @@ ] doc_tests = [ - ("Every", [(2, [0]), (14, [0])]), - ("good", [(2, [1]), (13, [1])]), - ("deserves", [(2, [3]), (13, [3])]), - ("sea", [(36, [2, 6])]) + ("Every", 2, [(2, [0]), (14, [0])]), + ("good", 2, [(2, [1]), (13, [1])]), + ("deserves", 2, [(2, [3]), (13, [3])]), + ("sea", 2, [(36, [2, 6])]) ] index = iixr.Index("test_index") @@ -289,9 +293,11 @@ wi.close() rd = index.get_reader() -for term, doc_positions in doc_tests: +for term, frequency, doc_positions in doc_tests: dp = rd.find_positions(term) print doc_positions == dp, doc_positions, dp + fr = rd.get_frequency(term) + print frequency == fr, frequency, fr for docnum, text in docs: df = rd.get_fields(docnum) print text == df[0], text, df[0]