# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1251576745 -7200
# Node ID 3d86f5cb01c10b27660683dff34a45a88af6a287
# Parent  b81c00a48c4984c230312085b7234684f520fb98
Added term frequency information to the term dictionary.

diff -r b81c00a48c49 -r 3d86f5cb01c1 iixr.py
--- a/iixr.py	Sat Aug 29 21:15:47 2009 +0200
+++ b/iixr.py	Sat Aug 29 22:12:25 2009 +0200
@@ -215,8 +215,9 @@
 
         """
         Write all 'doc_positions' - a collection of tuples of the form (document
-        number, position list) - to the file, returning the offset at which they
-        were stored.
+        number, position list) - to the file, returning a tuple containing the
+        offset at which they were stored together with the frequency (number of
+        positions) for the term involved.
         """
 
         # Reset the writer and record the current file offset.
@@ -230,10 +231,13 @@
 
         # Write the positions.
 
+        frequency = 0
+
         for docnum, positions in doc_positions:
             self.write_positions(docnum, positions)
+            frequency += len(positions)
 
-        return offset
+        return offset, frequency
 
 class PositionReader(FileReader):
 
@@ -301,12 +305,12 @@
         self.last_term = ""
         self.last_offset = 0
 
-    def write_term(self, term, offset):
+    def write_term(self, term, offset, frequency):
 
         """
-        Write the given 'term' and its position file 'offset' to the term
-        information file. Return the offset after the term information was
-        written to the file.
+        Write the given 'term', its position file 'offset', and its 'frequency'
+        to the term information file. Return the offset after the term
+        information was written to the file.
         """
 
         # Too long terms are not currently supported.
@@ -326,6 +330,10 @@
 
         self.write_number(offset - self.last_offset)
 
+        # Write the frequency.
+
+        self.write_number(frequency)
+
         self.last_term = term
         self.last_offset = offset
 
@@ -342,7 +350,8 @@
     def read_term(self):
 
         """
-        Read a term and its position file offset from the term information file.
+        Read a term, its position file offset, and its frequency from the term
+        information file.
         """
 
         # Read the prefix length and term suffix.
@@ -356,7 +365,11 @@
 
         self.last_offset += self.read_number()
 
-        return self.last_term, self.last_offset
+        # Read the frequency.
+
+        frequency = self.read_number()
+
+        return self.last_term, self.last_offset, frequency
 
     def go_to_term(self, term, offset, info_offset):
 
@@ -377,15 +390,15 @@
         TermWriter.reset(self)
         self.last_info_offset = 0
 
-    def write_term(self, term, offset, info_offset):
+    def write_term(self, term, offset, frequency, info_offset):
 
         """
-        Write the given 'term' and its position file 'offset' to the term
-        dictionary index file, along with the 'info_offset' in the term
-        information file.
+        Write the given 'term', its position file 'offset', and its 'frequency'
+        to the term dictionary index file, along with the 'info_offset' in the
+        term information file.
         """
 
-        TermWriter.write_term(self, term, offset)
+        TermWriter.write_term(self, term, offset, frequency)
 
         # Write the information file offset delta.
 
@@ -403,17 +416,17 @@
     def read_term(self):
 
         """
-        Read a term, its position file offset, and its term information file
-        offset from the term dictionary index file.
+        Read a term, its position file offset, its frequency, and its term
+        information file offset from the term dictionary index file.
         """
 
-        term, offset = TermReader.read_term(self)
+        term, offset, frequency = TermReader.read_term(self)
 
         # Read the offset delta.
 
         self.last_info_offset += self.read_number()
 
-        return term, offset, self.last_info_offset
+        return term, offset, frequency, self.last_info_offset
 
 class TermDictionaryWriter:
 
@@ -426,17 +439,18 @@
         self.interval = interval
         self.entry = 0
 
-    def _write_term(self, term, offset):
+    def _write_term(self, term, offset, frequency):
 
         """
-        Write the given 'term' and its position file 'offset' to the term
-        information file and optionally to the index, making a dictionary entry.
+        Write the given 'term', its position file 'offset', and its 'frequency'
+        to the term information file and optionally to the index, making a
+        dictionary entry.
         """
 
-        info_offset = self.info_writer.write_term(term, offset)
+        info_offset = self.info_writer.write_term(term, offset, frequency)
 
         if self.entry % self.interval == 0:
-            self.index_writer.write_term(term, offset, info_offset)
+            self.index_writer.write_term(term, offset, frequency, info_offset)
 
         self.entry += 1
 
@@ -447,8 +461,8 @@
         and positions at which the term is found.
         """
 
-        offset = self.position_writer.write_all_positions(doc_positions)
-        self._write_term(term, offset)
+        offset, frequency = self.position_writer.write_all_positions(doc_positions)
+        self._write_term(term, offset, frequency)
 
     def close(self):
         self.info_writer.close()
@@ -478,7 +492,10 @@
 
     def _find_term(self, term):
 
-        "Find the position file offset of 'term' from the term dictionary."
+        """
+        Find the position file offset and frequency of 'term' from the term
+        dictionary, returning them as a tuple, or None if the term is absent.
+        """
 
         i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1
 
@@ -487,12 +504,12 @@
         if i == -1:
             return None
 
-        found_term, offset, info_offset = self.terms[i]
+        found_term, offset, frequency, info_offset = self.terms[i]
 
         # Where the term is found immediately, return the offset.
 
         if term == found_term:
-            return offset
+            return offset, frequency
 
         # Otherwise, seek past the index term's entry in the information file
         # and scan for the desired term.
@@ -501,14 +518,14 @@
             self.info_reader.go_to_term(found_term, offset, info_offset)
             try:
                 while term > found_term:
-                    found_term, offset = self.info_reader.read_term()
+                    found_term, offset, frequency = self.info_reader.read_term()
             except EOFError:
                 pass
 
-            # If the term is found, return the offset.
+            # If the term is found, return the offset and frequency.
 
             if term == found_term:
-                return offset
+                return offset, frequency
             else:
                 return None
 
@@ -516,12 +533,24 @@
 
         "Return the documents and positions at which the given 'term' is found."
 
-        offset = self._find_term(term)
-        if offset is None:
+        t = self._find_term(term)
+        if t is None:
             return None
         else:
+            offset, frequency = t
             return self.position_reader.read_all_positions(offset)
 
+    def get_frequency(self, term):
+
+        "Return the frequency of the given 'term', or None if it is not found."
+
+        t = self._find_term(term)
+        if t is None:
+            return None
+        else:
+            offset, frequency = t
+            return frequency
+
     def close(self):
         self.info_reader.close()
         self.index_reader.close()
@@ -810,6 +839,9 @@
     def find_positions(self, term):
         return self.dict_reader.find_positions(term)
 
+    def get_frequency(self, term):
+        return self.dict_reader.get_frequency(term)
+
     def get_fields(self, docnum):
         return self.field_dict_reader.read_fields(docnum)
 
diff -r b81c00a48c49 -r 3d86f5cb01c1 test.py
--- a/test.py	Sat Aug 29 21:15:47 2009 +0200
+++ b/test.py	Sat Aug 29 22:12:25 2009 +0200
@@ -54,9 +54,8 @@
 w = iixr.PositionWriter(f)
 offsets = []
 for doc_positions in all_doc_positions:
-    offsets.append(
-        w.write_all_positions(doc_positions)
-        )
+    offset, frequency = w.write_all_positions(doc_positions)
+    offsets.append(offset)
 w.close()
 
 f = open("test", "rb")
@@ -145,51 +144,55 @@
 # Test terms.
 
 terms = [
-    ("aardvark",  100000123),
-    ("anteater",  100000456),
-    ("badger",    100000789),
-    ("bull",     1000001234),
-    ("bulldog",  1000002345),
-    ("cat",      1000003456)
+    # term       offset      frequency
+    ("aardvark",  100000123,  1),
+    ("anteater",  100000456,  2),
+    ("badger",    100000789, 13),
+    ("bull",     1000001234, 59),
+    ("bulldog",  1000002345, 99),
+    ("cat",      1000003456, 89)
     ]
 
 f = open("test", "wb")
 w = iixr.TermWriter(f)
-for term, offset in terms:
-    w.write_term(term, offset)
+for term, offset, frequency in terms:
+    w.write_term(term, offset, frequency)
 w.close()
 
 f = open("test", "rb")
 r = iixr.TermReader(f)
-for term, offset in terms:
-    t, o = r.read_term()
+for term, offset, frequency in terms:
+    t, o, fr = r.read_term()
     print term == t, term, t
     print offset == o, offset, o
+    print frequency == fr, frequency, fr
 r.close()
 
 # Test terms in index files.
 
 indexed_terms = [
-    ("aardvark",  100000123, 200000321),
-    ("anteater",  100000456, 200000654),
-    ("badger",    100000789, 200000987),
-    ("bull",     1000001234, 200004321),
-    ("bulldog",  1000002345, 200005432),
-    ("cat",      1000003456, 200006543)
+    # term       offset      frequency  info_offset
+    ("aardvark",  100000123,  1,        200000321),
+    ("anteater",  100000456,  2,        200000654),
+    ("badger",    100000789, 13,        200000987),
+    ("bull",     1000001234, 59,        200004321),
+    ("bulldog",  1000002345, 99,        200005432),
+    ("cat",      1000003456, 89,        200006543)
     ]
 
 f = open("test", "wb")
 w = iixr.TermIndexWriter(f)
-for term, offset, info_offset in indexed_terms:
-    w.write_term(term, offset, info_offset)
+for term, offset, frequency, info_offset in indexed_terms:
+    w.write_term(term, offset, frequency, info_offset)
 w.close()
 
 f = open("test", "rb")
 r = iixr.TermIndexReader(f)
-for term, offset, info_offset in indexed_terms:
-    t, o, i = r.read_term()
+for term, offset, frequency, info_offset in indexed_terms:
+    t, o, fr, i = r.read_term()
     print term == t, term, t
     print offset == o, offset, o
+    print frequency == fr, frequency, fr
     print info_offset == i, info_offset, i
 r.close()
 
@@ -202,8 +205,8 @@
 f3 = open("testP", "wb")
 w3 = iixr.PositionWriter(f3)
 wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
-for term, offset in terms:
-    wd._write_term(term, offset)
+for term, offset, frequency in terms:
+    wd._write_term(term, offset, frequency)
 wd.close()
 
 f = open("test", "rb")
@@ -215,12 +218,13 @@
 rd = iixr.TermDictionaryReader(r, r2, r3)
 terms_reversed = terms[:]
 terms_reversed.reverse()
-for term, offset in terms_reversed:
-    o = rd._find_term(term)
+for term, offset, frequency in terms_reversed:
+    o, fr = rd._find_term(term)
     print offset == o, offset, o
+    print frequency == fr, frequency, fr
 for term in ("dog", "dingo"):
-    o = rd._find_term(term)
-    print o is None, o
+    t = rd._find_term(term)
+    print t is None, t
 rd.close()
 
 # Test dictionaries with term and position data.
@@ -274,10 +278,10 @@
     ]
 
 doc_tests = [
-    ("Every", [(2, [0]), (14, [0])]),
-    ("good", [(2, [1]), (13, [1])]),
-    ("deserves", [(2, [3]), (13, [3])]),
-    ("sea", [(36, [2, 6])])
+    ("Every", 2, [(2, [0]), (14, [0])]),
+    ("good", 2, [(2, [1]), (13, [1])]),
+    ("deserves", 2, [(2, [3]), (13, [3])]),
+    ("sea", 2, [(36, [2, 6])])
     ]
 
 index = iixr.Index("test_index")
@@ -289,9 +293,11 @@
 wi.close()
 
 rd = index.get_reader()
-for term, doc_positions in doc_tests:
+for term, frequency, doc_positions in doc_tests:
     dp = rd.find_positions(term)
     print doc_positions == dp, doc_positions, dp
+    fr = rd.get_frequency(term)
+    print frequency == fr, frequency, fr
 for docnum, text in docs:
     df = rd.get_fields(docnum)
     print text == df[0], text, df[0]