# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1251847842 -7200
# Node ID 1cf3b82959f3d8e3e764a18691ab40f8a3504df2
# Parent  1e7ca36202ef5bbf0ac39ad19863eeb4ac14710e
Attempted to introduce position dictionaries with extra term record details
providing document frequency information.
Attempted to introduce file descriptor duplication in order to support
concurrent iterators.

diff -r 1e7ca36202ef -r 1cf3b82959f3 iixr.py
--- a/iixr.py	Mon Aug 31 21:02:30 2009 +0200
+++ b/iixr.py	Wed Sep 02 01:30:42 2009 +0200
@@ -18,6 +18,7 @@
 with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 
+from os import dup, fdopen       # independent iterator access to files
 from os import listdir, mkdir    # index and partition discovery
 from os import remove, rename    # partition manipulation
 from os.path import exists, join
@@ -194,11 +195,18 @@
 
     def write_positions(self, docnum, positions):
 
-        "Write for the document 'docnum' the given 'positions'."
+        """
+        Write for the document 'docnum' the given 'positions'.
+        Return the offset of the written record.
+        """
 
         if docnum < self.last_docnum:
             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
 
+        # Record the offset of this record.
+
+        offset = self.f.tell()
+
         # Write the document number delta.
 
         self.write_number(docnum - self.last_docnum)
@@ -221,34 +229,7 @@
 
         self.last_docnum = docnum
 
-    def write_term_positions(self, doc_positions):
-
-        """
-        Write all 'doc_positions' - a collection of tuples of the form (document
-        number, position list) - to the file, returning a tuple containing the
-        offset at which they were stored together with the frequency (number of
-        positions) for the term involved.
-        """
-
-        # Reset the writer and record the current file offset.
-
-        self.reset()
-        offset = self.f.tell()
-
-        # Write the number of documents.
-
-        self.write_number(len(doc_positions))
-        doc_positions.sort()
-
-        # Write the positions.
-
-        frequency = 0
-
-        for docnum, positions in doc_positions:
-            self.write_positions(docnum, positions)
-            frequency += len(positions)
-
-        return offset, frequency
+        return offset
 
 class PositionReader(FileReader):
 
@@ -283,54 +264,295 @@
 
         return self.last_docnum, positions
 
-    def read_term_positions(self, offset):
+    def read_term_positions(self, offset, count):
 
         """
         Read all positions from 'offset', seeking to that position in the file
-        before reading.
+        before reading. The number of documents available for reading is limited
+        to 'count'.
         """
 
         self.reset()
-        self.f.seek(offset)
+
+        # Duplicate the file handle.
+
+        f = fdopen(dup(self.f.fileno()), "rb")
+        f.seek(offset)
+        return PositionIterator(f, count)
+
+class IteratorBase:
+
+    def __init__(self, count):
+        self.replenish(count)
 
-        # Could duplicate the file handle using...
-        # fdopen(dup(self.f.fileno()), "rb")
+    def replenish(self, count):
+        self.count = count
+        self.read_documents = 0
+
+    def __len__(self):
+        return self.count
 
-        return PositionIterator(self.f)
+    def sort(self):
+        pass # Stored document positions are already sorted.
 
-class PositionIterator(PositionReader):
+    def __iter__(self):
+        return self
+
+class PositionIterator(PositionReader, IteratorBase):
 
     "Iterating over document positions."
 
-    def __init__(self, f):
+    def __init__(self, f, count):
         PositionReader.__init__(self, f)
+        IteratorBase.__init__(self, count)
+
+    def next(self):
+
+        "Read positions for a single document."
+
+        if self.read_documents < self.count:
+            self.read_documents += 1
+            return self.read_positions()
+        else:
+            raise StopIteration
+
+class PositionIndexWriter(FileWriter):
+
+    "Writing position index information to files."
+
+    def reset(self):
+        self.last_docnum = 0
+        self.last_pos_offset = 0
+
+    def write_positions(self, docnum, pos_offset, count):
+
+        """
+        Write the given 'docnum', 'pos_offset' and document 'count' to the
+        position index file.
+        """
+
+        # Record the offset of this record.
+
+        offset = self.f.tell()
+
+        # Write the document number delta.
+
+        self.write_number(docnum - self.last_docnum)
+        self.last_docnum = docnum
+
+        # Write the position file offset delta.
+
+        self.write_number(pos_offset - self.last_pos_offset)
+        self.last_pos_offset = pos_offset
+
+        # Write the document count.
+
+        self.write_number(count)
+
+        return offset
+
+class PositionIndexReader(FileReader):
+
+    "Reading position index information from files."
 
-        # Read the number of documents.
+    def reset(self):
+        self.last_docnum = 0
+        self.last_pos_offset = 0
+
+    def read_positions(self):
+
+        """
+        Read a document number, a position file offset for the position index
+        file, and the number of documents in a section of that file.
+        """
+
+        # Read the document number delta.
+
+        self.last_docnum += self.read_number()
+
+        # Read the offset delta.
+
+        self.last_pos_offset += self.read_number()
+
+        # Read the document count.
+
+        count = self.read_number()
+
+        return self.last_docnum, self.last_pos_offset, count
+
+    def read_term_positions(self, offset, doc_frequency):
 
-        self.ndocuments = self.read_number()
-        self.read_documents = 0
+        """
+        Read all positions from 'offset', seeking to that position in the file
+        before reading. The number of documents available for reading is limited
+        to 'doc_frequency'.
+        """
+
+        # NOTE: This is almost a duplication of PositionReader.read_term_positions.
+
+        self.reset()
+
+        # Duplicate the file handle.
+
+        f = fdopen(dup(self.f.fileno()), "rb")
+        f.seek(offset)
+        return PositionIndexIterator(f, doc_frequency)
+
+class PositionIndexIterator(PositionIndexReader, IteratorBase):
+
+    "Iterating over position index records."
+
+    def __init__(self, f, count):
+        PositionIndexReader.__init__(self, f)
+        IteratorBase.__init__(self, count)
+        self.section_count = 0
+
+    def next(self):
+
+        "Read the next position index record."
 
-    def __len__(self):
-        return self.ndocuments
+        self.read_documents += self.section_count
+        if self.read_documents < self.count:
+            docnum, pos_offset, self.section_count = t = self.read_positions()
+            return t
+        else:
+            raise StopIteration
+
+class PositionDictionaryWriter:
+
+    "Writing position dictionaries."
+
+    def __init__(self, position_writer, position_index_writer, interval):
+        self.position_writer = position_writer
+        self.position_index_writer = position_index_writer
+        self.interval = interval
+
+    def write_term_positions(self, doc_positions):
+
+        """
+        Write all 'doc_positions' - a collection of tuples of the form (document
+        number, position list) - to the file.
+
+        Add some records to the index, making dictionary entries.
+
+        Return a tuple containing the offset of the written data, the frequency
+        (number of positions), and document frequency (number of documents) for
+        the term involved.
+        """
+
+        # Reset both writers so that each term's delta encoding starts afresh.
+        self.position_writer.reset()
+        self.position_index_writer.reset()
+        index_offset = None
+
+        # Write the positions.
+
+        frequency = 0
+        first_offset = None
+        count = 0
+
+        doc_positions.sort()
+
+        for docnum, positions in doc_positions:
+            pos_offset = self.position_writer.write_positions(docnum, positions)
+
+            # Retain the first record offset for a subsequent index entry.
+
+            if first_offset is None:
+                first_offset = pos_offset
+
+            frequency += len(positions)
+            count += 1
+
+            # Every {interval} entries, write an index entry.
+
+            if count == self.interval:
+                io = self.position_index_writer.write_positions(docnum, first_offset, self.interval)
 
-    def sort(self):
+                # Remember the first index entry offset.
+
+                if index_offset is None:
+                    index_offset = io
+
+                # Start a new section with the next document.
+
+                first_offset = None
+                count = 0
+
+        # Finish writing an index entry for the remaining documents.
+
+        else:
+            if first_offset is not None:
+                io = self.position_index_writer.write_positions(docnum, first_offset, count)
+
+                # Remember the first index entry offset.
+
+                if index_offset is None:
+                    index_offset = io
+
+        return index_offset, frequency, len(doc_positions)
+
+    def close(self):
+        self.position_writer.close()
+        self.position_index_writer.close()
+
+class PositionDictionaryReader:
 
-        "Stored document positions are already sorted."
+    "Reading position dictionaries."
+
+    def __init__(self, position_reader, position_index_reader):
+        self.position_reader = position_reader
+        self.position_index_reader = position_index_reader
+
+    def read_term_positions(self, offset, doc_frequency):
+
+        """
+        Return an iterator for dictionary entries starting at 'offset' with the
+        given 'doc_frequency'.
+        """
 
-        pass
+        return PositionDictionaryIterator(self.position_reader,
+            self.position_index_reader, offset, doc_frequency)
+
+    def close(self):
+        self.position_reader.close()
+        self.position_index_reader.close()
+
+class PositionDictionaryIterator:
+
+    "Iteration over position dictionary entries."
+
+    def __init__(self, position_reader, position_index_reader, offset, doc_frequency):
+        self.position_reader = position_reader
+
+        self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
+        self.next_section()
+        self.init_section()
 
     def __iter__(self):
         return self
 
     def next(self):
 
-        "Read positions for a single document."
+        # Attempt to get the next document record from the section in the positions file.
+
+        while 1:
+
+            # Either return the next record.
+
+            try:
+                return self.iterator.next()
 
-        if self.read_documents < self.ndocuments:
-            self.read_documents += 1
-            return self.read_positions()
-        else:
-            raise StopIteration
+            # Or, where a section is finished, get the next section and try again.
+
+            except StopIteration:
+                self.next_section()
+                self.iterator.replenish(self.section_count)
+
+    def next_section(self):
+        self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
+
+    def init_section(self):
+        self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
 
 class TermWriter(FileWriter):
 
@@ -340,12 +562,13 @@
         self.last_term = ""
         self.last_offset = 0
 
-    def write_term(self, term, offset, frequency):
+    def write_term(self, term, offset, frequency, doc_frequency):
 
         """
-        Write the given 'term', its position file 'offset', and its 'frequency'
-        to the term information file. Return the offset after the term
-        information was written to the file.
+        Write the given 'term', its position file 'offset', its 'frequency' and
+        its 'doc_frequency' (number of documents in which it appears) to the
+        term information file. Return the offset after the term information was
+        written to the file.
         """
 
         # Write the prefix length and term suffix.
@@ -364,6 +587,10 @@
 
         self.write_number(frequency)
 
+        # Write the document frequency.
+
+        self.write_number(doc_frequency)
+
         self.last_term = term
         self.last_offset = offset
 
@@ -380,8 +607,8 @@
     def read_term(self):
 
         """
-        Read a term, its position file offset, and its frequency from the term
-        information file.
+        Read a term, its position file offset, its frequency and its document
+        frequency from the term information file.
         """
 
         # Read the prefix length and term suffix.
@@ -399,7 +626,11 @@
 
         frequency = self.read_number()
 
-        return self.last_term, self.last_offset, frequency
+        # Read the document frequency.
+
+        doc_frequency = self.read_number()
+
+        return self.last_term, self.last_offset, frequency, doc_frequency
 
     def go_to_term(self, term, offset, info_offset):
 
@@ -420,15 +651,15 @@
         TermWriter.reset(self)
         self.last_info_offset = 0
 
-    def write_term(self, term, offset, frequency, info_offset):
+    def write_term(self, term, offset, frequency, doc_frequency, info_offset):
 
         """
-        Write the given 'term', its position file 'offset', and its 'frequency'
-        to the term dictionary index file, along with the 'info_offset' in the
-        term information file.
+        Write the given 'term', its position file 'offset', its 'frequency' and
+        its 'doc_frequency' to the term dictionary index file, along with the
+        'info_offset' in the term information file.
         """
 
-        TermWriter.write_term(self, term, offset, frequency)
+        TermWriter.write_term(self, term, offset, frequency, doc_frequency)
 
         # Write the information file offset delta.
 
@@ -446,41 +677,43 @@
     def read_term(self):
 
         """
-        Read a term, its position file offset, its frequency, and its term
-        information file offset from the term dictionary index file.
+        Read a term, its position file offset, its frequency, its document
+        frequency and a term information file offset from the term dictionary
+        index file.
         """
 
-        term, offset, frequency = TermReader.read_term(self)
+        term, offset, frequency, doc_frequency = TermReader.read_term(self)
 
         # Read the offset delta.
 
         self.last_info_offset += self.read_number()
 
-        return term, offset, frequency, self.last_info_offset
+        return term, offset, frequency, doc_frequency, self.last_info_offset
 
 class TermDictionaryWriter:
 
     "Writing term dictionaries."
 
-    def __init__(self, info_writer, index_writer, position_writer, interval):
+    def __init__(self, info_writer, index_writer, position_dict_writer, interval):
         self.info_writer = info_writer
         self.index_writer = index_writer
-        self.position_writer = position_writer
+        self.position_dict_writer = position_dict_writer
         self.interval = interval
         self.entry = 0
 
-    def _write_term(self, term, offset, frequency):
+    def _write_term(self, term, offset, frequency, doc_frequency):
 
         """
-        Write the given 'term', its position file 'offset', and its 'frequency'
-        to the term information file and optionally to the index, making a
-        dictionary entry.
+        Write the given 'term', its position file 'offset', its 'frequency' and
+        its 'doc_frequency' (number of documents in which it appears) to the
+        term information file. Return the offset after the term information was
+        written to the file.
         """
 
-        info_offset = self.info_writer.write_term(term, offset, frequency)
+        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)
 
         if self.entry % self.interval == 0:
-            self.index_writer.write_term(term, offset, frequency, info_offset)
+            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
 
         self.entry += 1
 
@@ -491,13 +724,13 @@
         and positions at which the term is found.
         """
 
-        offset, frequency = self.position_writer.write_term_positions(doc_positions)
-        self._write_term(term, offset, frequency)
+        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
+        self._write_term(term, offset, frequency, doc_frequency)
 
     def close(self):
         self.info_writer.close()
         self.index_writer.close()
-        self.position_writer.close()
+        self.position_dict_writer.close()
 
 class TermDictionaryReader:
 
@@ -533,12 +766,13 @@
         if i == -1:
             return None
 
-        found_term, offset, frequency, info_offset = self.terms[i]
+        found_term, offset, frequency, doc_frequency, info_offset = self.terms[i]
 
-        # Where the term is found immediately, return the offset.
+        # Where the term is found immediately, return the offset and
+        # frequencies.
 
         if term == found_term:
-            return offset, frequency
+            return offset, frequency, doc_frequency
 
         # Otherwise, seek past the index term's entry in the information file
         # and scan for the desired term.
@@ -547,33 +781,33 @@
             self.info_reader.go_to_term(found_term, offset, info_offset)
             try:
                 while term > found_term:
-                    found_term, offset, frequency = self.info_reader.read_term()
+                    found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
             except EOFError:
                 pass
 
-            # If the term is found, return the offset and frequency.
+            # If the term is found, return the offset and frequencies.
 
             if term == found_term:
-                return offset, frequency
+                return offset, frequency, doc_frequency
             else:
                 return None
 
     def rewind(self):
         self.info_reader.rewind()
 
-    def _get_positions(self, offset):
-        return self.position_reader.read_term_positions(offset)
+    def _get_positions(self, offset, doc_frequency):
+        return self.position_reader.read_term_positions(offset, doc_frequency)
 
     def read_term(self):
 
         """
-        Return the next term, its frequency and the documents and positions at
-        which the term is found.
+        Return the next term, its frequency, its document frequency, and the
+        documents and positions at which the term is found.
         """
 
-        term, offset, frequency = self.info_reader.read_term()
-        positions = self._get_positions(offset)
-        return term, frequency, positions
+        term, offset, frequency, doc_frequency = self.info_reader.read_term()
+        positions = self._get_positions(offset, doc_frequency)
+        return term, frequency, doc_frequency, positions
 
     def find_positions(self, term):
 
@@ -583,8 +817,8 @@
         if t is None:
             return None
         else:
-            offset, frequency = t
-            return self._get_positions(offset)
+            offset, frequency, doc_frequency = t
+            return self._get_positions(offset, doc_frequency)
 
     def get_frequency(self, term):
 
@@ -594,9 +828,20 @@
         if t is None:
             return None
         else:
-            offset, frequency = t
+            offset, frequency, doc_frequency = t
             return frequency
 
+    def get_document_frequency(self, term):
+
+        "Return the document frequency of the given 'term'."
+
+        t = self._find_term(term)
+        if t is None:
+            return None
+        else:
+            offset, frequency, doc_frequency = t
+            return doc_frequency
+
     def close(self):
         self.info_reader.close()
         self.index_reader.close()
@@ -850,7 +1095,7 @@
             reader.rewind()
 
             try:
-                term, frequency, positions = reader.read_term()
+                term, frequency, doc_frequency, positions = reader.read_term()
                 insort_right(entries, (term, positions, partition))
             except EOFError:
                 pass
@@ -889,7 +1134,7 @@
 
             for partition in to_update:
                 try:
-                    term, frequency, positions = self.readers[partition].read_term()
+                    term, frequency, doc_frequency, positions = self.readers[partition].read_term()
                     insort_right(entries, (term, positions, partition))
                 except EOFError:
                     pass
@@ -975,12 +1220,12 @@
 
 # Utility functions.
 
-def get_term_writer(pathname, partition, interval):
+def get_term_writer(pathname, partition, interval, doc_interval):
 
     """
     Return a term dictionary writer using files under the given 'pathname'
     labelled according to the given 'partition', using the given indexing
-    'interval'.
+    'interval' for terms and 'doc_interval' for document position records.
     """
 
     tdf = open(join(pathname, "terms-%s" % partition), "wb")
@@ -992,7 +1237,12 @@
     tpf = open(join(pathname, "positions-%s" % partition), "wb")
     positions_writer = PositionWriter(tpf)
 
-    return TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)
+    tpif = open(join(pathname, "positions_index-%s" % partition), "wb")
+    positions_index_writer = PositionIndexWriter(tpif)
+
+    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)
+
+    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
 
 def get_field_writer(pathname, partition, interval):
 
@@ -1026,7 +1276,12 @@
     tpf = open(join(pathname, "positions-%s" % partition), "rb")
     positions_reader = PositionReader(tpf)
 
-    return TermDictionaryReader(info_reader, index_reader, positions_reader)
+    tpif = open(join(pathname, "positions_index-%s" % partition), "rb")
+    positions_index_reader = PositionIndexReader(tpif)
+
+    positions_dict_reader = PositionDictionaryReader(positions_reader, positions_index_reader)
+
+    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
 
 def get_field_reader(pathname, partition):
 
diff -r 1e7ca36202ef -r 1cf3b82959f3 test.py
--- a/test.py	Mon Aug 31 21:02:30 2009 +0200
+++ b/test.py	Wed Sep 02 01:30:42 2009 +0200
@@ -38,15 +38,18 @@
 all_doc_positions = [
     [
         (123, [1, 3, 5, 15, 25]),
-        (124, [0, 100])
+        (124, [0, 100]),
+        (125, [11, 99, 199]),
+        (130, [77, 78, 80, 82, 89])
     ],
     [
         (78, [9]),
-        (196, [10, 11])
+        (196, [10, 11]),
+        (197, [17, 21, 30])
     ]
     ]
 
-f = open("test", "wb")
+f = open("testP", "wb")
 w = iixr.PositionWriter(f)
 for doc_positions in all_doc_positions:
     for docnum, positions in doc_positions:
@@ -54,7 +57,7 @@
     w.reset()
 w.close()
 
-f = open("test", "rb")
+f = open("testP", "rb")
 r = iixr.PositionReader(f)
 for doc_positions in all_doc_positions:
     for docnum, positions in doc_positions:
@@ -64,20 +67,68 @@
     r.reset()
 r.close()
 
-f = open("test", "wb")
+# Test position index files.
+
+indexed_positions = [
+    [
+        (1234, 0, 100),
+        (2345, 700, 100),
+        (3456, 1900, 50)
+    ],
+    [
+        (4567, 2800, 20)
+    ]
+    ]
+
+offsets = []
+f = open("testPI", "wb")
+w = iixr.PositionIndexWriter(f)
+for term_positions in indexed_positions:
+    offset = None
+    doc_frequency = 0
+    w.reset()
+    for docnum, pos_offset, count in term_positions:
+        io = w.write_positions(docnum, pos_offset, count)
+        if offset is None:
+            offset = io
+        doc_frequency += count
+    offsets.append((offset, doc_frequency))
+w.close()
+
+f = open("testPI", "rb")
+r = iixr.PositionIndexReader(f)
+offsets.reverse()
+indexed_positions.reverse()
+for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
+    found_positions = r.read_term_positions(offset, doc_frequency)
+    for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):
+        print docnum == dn, docnum, dn
+        print pos_offset == po, pos_offset, po
+        print count == c, count, c
+r.close()
+
+# Test position dictionaries.
+
+f = open("testP", "wb")
 w = iixr.PositionWriter(f)
+f2 = open("testPI", "wb")
+w2 = iixr.PositionIndexWriter(f2)
+wd = iixr.PositionDictionaryWriter(w, w2, 2)
 offsets = []
 for doc_positions in all_doc_positions:
-    offset, frequency = w.write_term_positions(doc_positions)
-    offsets.append(offset)
+    offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
+    offsets.append((offset, doc_frequency))
 w.close()
 
-f = open("test", "rb")
+f = open("testP", "rb")
 r = iixr.PositionReader(f)
+f2 = open("testPI", "rb")
+r2 = iixr.PositionIndexReader(f2)
+rd = iixr.PositionDictionaryReader(r, r2)
 offsets.reverse()
 all_doc_positions.reverse()
-for offset, doc_positions in zip(offsets, all_doc_positions):
-    dp = list(r.read_term_positions(offset))
+for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
+    dp = list(rd.read_term_positions(offset, doc_frequency))
     print doc_positions == dp, doc_positions, dp
 r.close()
 
@@ -166,55 +217,57 @@
 # Test terms.
 
 terms = [
-    # term       offset      frequency
-    ("aardvark",  100000123,  1),
-    ("anteater",  100000456,  2),
-    ("badger",    100000789, 13),
-    ("bull",     1000001234, 59),
-    ("bulldog",  1000002345, 99),
-    ("cat",      1000003456, 89)
+    # term       offset      frequency  doc_frequency
+    ("aardvark",  100000123,  1,         1),
+    ("anteater",  100000456,  2,         1),
+    ("badger",    100000789, 13,         7),
+    ("bull",     1000001234, 59,        17),
+    ("bulldog",  1000002345, 99,        80),
+    ("cat",      1000003456, 89,        28)
     ]
 
 f = open("test", "wb")
 w = iixr.TermWriter(f)
-for term, offset, frequency in terms:
-    w.write_term(term, offset, frequency)
+for term, offset, frequency, doc_frequency in terms:
+    w.write_term(term, offset, frequency, doc_frequency)
 w.close()
 
 f = open("test", "rb")
 r = iixr.TermReader(f)
-for term, offset, frequency in terms:
-    t, o, fr = r.read_term()
+for term, offset, frequency, doc_frequency in terms:
+    t, o, fr, df = r.read_term()
     print term == t, term, t
     print offset == o, offset, o
     print frequency == fr, frequency, fr
+    print doc_frequency == df, doc_frequency, df
 r.close()
 
 # Test terms in index files.
 
 indexed_terms = [
-    # term       offset      frequency  info_offset
-    ("aardvark",  100000123,  1,        200000321),
-    ("anteater",  100000456,  2,        200000654),
-    ("badger",    100000789, 13,        200000987),
-    ("bull",     1000001234, 59,        200004321),
-    ("bulldog",  1000002345, 99,        200005432),
-    ("cat",      1000003456, 89,        200006543)
+    # term       offset      frequency  doc_frequency   info_offset
+    ("aardvark",  100000123,  1,         1,             200000321),
+    ("anteater",  100000456,  2,         1,             200000654),
+    ("badger",    100000789, 13,         7,             200000987),
+    ("bull",     1000001234, 59,        17,             200004321),
+    ("bulldog",  1000002345, 99,        80,             200005432),
+    ("cat",      1000003456, 89,        28,             200006543)
     ]
 
 f = open("test", "wb")
 w = iixr.TermIndexWriter(f)
-for term, offset, frequency, info_offset in indexed_terms:
-    w.write_term(term, offset, frequency, info_offset)
+for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
+    w.write_term(term, offset, frequency, doc_frequency, info_offset)
 w.close()
 
 f = open("test", "rb")
 r = iixr.TermIndexReader(f)
-for term, offset, frequency, info_offset in indexed_terms:
-    t, o, fr, i = r.read_term()
+for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
+    t, o, fr, df, i = r.read_term()
     print term == t, term, t
     print offset == o, offset, o
     print frequency == fr, frequency, fr
+    print doc_frequency == df, doc_frequency, df
     print info_offset == i, info_offset, i
 r.close()
 
@@ -224,26 +277,23 @@
 w = iixr.TermWriter(f)
 f2 = open("testI", "wb")
 w2 = iixr.TermIndexWriter(f2)
-f3 = open("testP", "wb")
-w3 = iixr.PositionWriter(f3)
-wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
-for term, offset, frequency in terms:
-    wd._write_term(term, offset, frequency)
+wd = iixr.TermDictionaryWriter(w, w2, None, 3)
+for term, offset, frequency, doc_frequency in terms:
+    wd._write_term(term, offset, frequency, doc_frequency)
 wd.close()
 
 f = open("test", "rb")
 r = iixr.TermReader(f)
 f2 = open("testI", "rb")
 r2 = iixr.TermIndexReader(f2)
-f3 = open("testP", "rb")
-r3 = iixr.PositionReader(f3)
-rd = iixr.TermDictionaryReader(r, r2, r3)
+rd = iixr.TermDictionaryReader(r, r2, None)
 terms_reversed = terms[:]
 terms_reversed.reverse()
-for term, offset, frequency in terms_reversed:
-    o, fr = rd._find_term(term)
+for term, offset, frequency, doc_frequency in terms_reversed:
+    o, fr, df = rd._find_term(term)
     print offset == o, offset, o
     print frequency == fr, frequency, fr
+    print doc_frequency == df, doc_frequency, df
 for term in ("dog", "dingo"):
     t = rd._find_term(term)
     print t is None, t
@@ -255,7 +305,7 @@
     ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),
     ("anteater",  [(1, [43, 44])]),
     ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
-    ("bull",      [(6, [128]), (16, [12])]),
+    ("bull",      [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
     ("bulldog",   [(43, [17, 19, 256, 512])]),
     ("cat",       [(123, [12, 145, 196]), (1200, [113])])
     ]
@@ -266,7 +316,10 @@
 w2 = iixr.TermIndexWriter(f2)
 f3 = open("testP", "wb")
 w3 = iixr.PositionWriter(f3)
-wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
+f4 = open("testPI", "wb")
+w4 = iixr.PositionIndexWriter(f4)
+wp = iixr.PositionDictionaryWriter(w3, w4, 2)
+wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
 for term, doc_positions in terms_with_positions:
     wd.write_term_positions(term, doc_positions)
 wd.close()
@@ -277,7 +330,10 @@
 r2 = iixr.TermIndexReader(f2)
 f3 = open("testP", "rb")
 r3 = iixr.PositionReader(f3)
-rd = iixr.TermDictionaryReader(r, r2, r3)
+f4 = open("testPI", "rb")
+r4 = iixr.PositionIndexReader(f4)
+rp = iixr.PositionDictionaryReader(r3, r4)
+rd = iixr.TermDictionaryReader(r, r2, rp)
 terms_reversed = terms_with_positions[:]
 terms_reversed.reverse()
 for term, doc_positions in terms_reversed:
@@ -291,7 +347,7 @@
 
 rd.rewind()
 for term, doc_positions in terms_with_positions:
-    t, fr, dp = rd.read_term()
+    t, fr, df, dp = rd.read_term()
     dp = list(dp)
     print term == t, term, t
     print doc_positions == dp, doc_positions, dp