# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1251504929 -7200
# Node ID fe7ed6b96612e2f0fd90cb13690e9b317f0383fe
# Parent  5e4746613f833ecb14c54547660ac704f11fae3e
Added field dictionary and field index readers and writers.
Renamed various internal methods.
Added document number deltas to field collections in order to support scanning
for documents.

diff -r 5e4746613f83 -r fe7ed6b96612 iixr.py
--- a/iixr.py	Fri Aug 28 01:15:17 2009 +0200
+++ b/iixr.py	Sat Aug 29 02:15:29 2009 +0200
@@ -149,7 +149,7 @@
 
         return unicode(s, "utf-8")
 
-# Specific classes.
+# Specific classes for storing term and position information.
 
 class PositionWriter(FileWriter):
 
@@ -336,7 +336,10 @@
 
     def go_to_term(self, term, offset, info_offset):
 
-        "Seek past the entry for 'term' having 'offset' to 'info_offset'."
+        """
+        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
+        permits the scanning for later terms from the specified term.
+        """
 
         self.f.seek(info_offset)
         self.last_term = term
@@ -399,7 +402,7 @@
         self.interval = interval
         self.entry = 0
 
-    def write_term(self, term, offset):
+    def _write_term(self, term, offset):
 
         """
         Write the given 'term' and its position file 'offset' to the term
@@ -421,7 +424,7 @@
         """
 
         offset = self.position_writer.write_all_positions(doc_positions)
-        self.write_term(term, offset)
+        self._write_term(term, offset)
 
     def close(self):
         self.info_writer.close()
@@ -449,7 +452,7 @@
         self.max_offset = self.terms[-1][1]
         self.max_info_offset = self.terms[-1][2]
 
-    def find_term(self, term):
+    def _find_term(self, term):
 
         "Find the position file offset of 'term' from the term dictionary."
 
@@ -489,7 +492,7 @@
 
         "Return the documents and positions at which the given 'term' is found."
 
-        offset = self.find_term(term)
+        offset = self._find_term(term)
         if offset is None:
             return None
         else:
@@ -500,19 +503,28 @@
         self.index_reader.close()
         self.position_reader.close()
 
+# Specific classes for storing document information.
+
 class FieldWriter(FileWriter):
 
     "Writing field data to files."
 
-    def write_fields(self, fields):
+    def reset(self):
+        self.last_docnum = 0
+
+    def write_fields(self, docnum, fields):
 
         """
-        Write the given list of 'fields' (strings representing field values).
-        Return the offset at which the fields are stored.
+        Write, for the given 'docnum', a list of 'fields' (strings representing
+        field values). Return the offset at which the fields are stored.
         """
 
         offset = self.f.tell()
 
+        # Write the document number delta.
+
+        self.write_number(docnum - self.last_docnum)
+
         # Write the number of fields.
 
         self.write_number(len(fields))
@@ -522,15 +534,26 @@
         for field in fields:
             self.write_string(field, 0) # compress
 
+        self.last_docnum = docnum
         return offset
 
 class FieldReader(FileReader):
 
     "Reading field data from files."
 
+    def reset(self):
+        self.last_docnum = 0
+
     def read_fields(self):
 
-        "Read fields from the file, returning the field values in a list."
+        """
+        Read fields from the file, returning a tuple containing the document
+        number and a list of field values.
+        """
+
+        # Read the document number delta.
+
+        self.last_docnum += self.read_number()
 
         # Read the number of fields.
 
@@ -545,14 +568,139 @@
             fields.append(self.read_string(0)) # decompress
             i += 1
 
-        return fields
+        return self.last_docnum, fields
+
+    def read_document_fields(self, docnum, offset):
 
-    def read_doc_fields(self, offset):
-
-        "Read all fields at the given 'offset."
+        """
+        Read fields for 'docnum' at the given 'offset'. This permits the
+        retrieval of details for the specified document, as well as scanning for
+        later documents.
+        """
 
         self.f.seek(offset)
-        return self.read_fields()
+        bad_docnum, fields = self.read_fields()
+        self.last_docnum = docnum
+        return docnum, fields
+        
+class FieldIndexWriter(FileWriter):
+
+    "Writing field index details to files."
+
+    def reset(self):
+        self.last_docnum = 0
+
+    def write_document(self, docnum, offset):
+
+        """
+        Write, for the given 'docnum', the 'offset' at which the fields for the
+        document are stored in the fields file.
+        """
+
+        # Write the document number delta and offset.
+
+        self.write_number(docnum - self.last_docnum)
+        self.write_number(offset)
+
+        self.last_docnum = docnum
+
+class FieldIndexReader(FileReader):
+
+    "Reading field index details from files."
+
+    def reset(self):
+        self.last_docnum = 0
+
+    def read_document(self):
+
+        "Read a document number and field file offset."
+
+        # Read the document number delta and offset.
+
+        self.last_docnum += self.read_number()
+        offset = self.read_number()
+
+        return self.last_docnum, offset
+
+class FieldDictionaryWriter:
+
+    "Writing field dictionary details."
+
+    def __init__(self, field_writer, field_index_writer, interval):
+        self.field_writer = field_writer
+        self.field_index_writer = field_index_writer
+        self.interval = interval
+        self.entry = 0
+
+    def write_fields(self, docnum, fields):
+
+        "Write details of the document with the given 'docnum' and 'fields'."
+
+        offset = self.field_writer.write_fields(docnum, fields)
+
+        if self.entry % self.interval == 0:
+            self.field_index_writer.write_document(docnum, offset)
+
+        self.entry += 1
+
+    def close(self):
+        self.field_writer.close()
+        self.field_index_writer.close()
+
+class FieldDictionaryReader:
+
+    "Reading field dictionary details."
+
+    def __init__(self, field_reader, field_index_reader):
+        self.field_reader = field_reader
+        self.field_index_reader = field_index_reader
+
+        self.docs = []
+        try:
+            while 1:
+                self.docs.append(self.field_index_reader.read_document())
+        except EOFError:
+            pass
+
+        # A large offset (from the last entry) for ordering purposes.
+
+        self.max_offset = self.docs[-1][1]
+
+    def read_fields(self, docnum):
+
+        "Read the fields of the document with the given 'docnum'."
+
+        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
+
+        # Get the entry position providing the document or one preceding it.
+
+        if i == -1:
+            return None
+
+        found_docnum, offset = self.docs[i]
+
+        # Read from the fields file.
+
+        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
+
+        # Scan for the document, if necessary.
+
+        try:
+            while docnum > found_docnum:
+                found_docnum, fields = self.field_reader.read_fields()
+        except EOFError:
+            pass
+
+        # If the document is found, return the fields.
+
+        if docnum == found_docnum:
+            return fields
+        else:
+            return None
+
+    def close(self):
+        self.field_reader.close()
+        self.field_index_reader.close()
 
 # High-level classes.
 
diff -r 5e4746613f83 -r fe7ed6b96612 test.py
--- a/test.py	Fri Aug 28 01:15:17 2009 +0200
+++ b/test.py	Sat Aug 29 02:15:29 2009 +0200
@@ -2,6 +2,8 @@
 
 import iixr
 
+# Test basic data types.
+
 numbers = [12345678, 0, 1, 127, 128, 255, 256]
 
 f = open("test", "wb")
@@ -17,6 +19,8 @@
     print number == n, number, n
 r.close()
 
+# Test positions.
+
 all_doc_positions = [
     [
         (123, [1, 3, 5, 15, 25]),
@@ -64,30 +68,82 @@
     print doc_positions == dp, doc_positions, dp
 r.close()
 
+# Test fields.
+
 doc_fields = [
-    ["testing", "fields", "stored", "compressed"],
-    ["fields", "for a second", "document"]
+    (123, ["testing", "fields", "stored", "compressed"]),
+    (456, ["fields", "for a second", "document"]),
+    (789, ["field value"]),
+    (1234, []),
+    (2345, ["abc", "def"]),
+    (3456, ["apple", "banana", "cherry"]),
+    (4567, ["drue", "eple"])
     ]
 
 f = open("testF", "wb")
 w = iixr.FieldWriter(f)
-offsets = []
-for fields in doc_fields:
-    offsets.append(w.write_fields(fields))
+for docnum, fields in doc_fields:
+    w.write_fields(docnum, fields)
 w.close()
 
 f = open("testF", "rb")
 r = iixr.FieldReader(f)
-for fields in doc_fields:
-    df = r.read_fields()
-    print fields == df, fields, df
-offsets.reverse()
-doc_fields.reverse()
-for offset, fields in zip(offsets, doc_fields):
-    df = r.read_doc_fields(offset)
+for docnum, fields in doc_fields:
+    dn, df = r.read_fields()
+    print docnum == dn, docnum, dn
     print fields == df, fields, df
 r.close()
 
+# Test field index files.
+
+indexed_docs = [
+    (123, 100000987),
+    (456, 100004321),
+    (789, 100008765)
+    ]
+
+f = open("testFI", "wb")
+w = iixr.FieldIndexWriter(f)
+for docnum, offset in indexed_docs:
+    w.write_document(docnum, offset)
+w.close()
+
+f = open("testFI", "rb")
+r = iixr.FieldIndexReader(f)
+for docnum, offset in indexed_docs:
+    dn, o = r.read_document()
+    print docnum == dn, docnum, dn
+    print offset == o, offset, o
+r.close()
+
+# Test field dictionaries.
+
+f = open("testF", "wb")
+w = iixr.FieldWriter(f)
+f2 = open("testFI", "wb")
+w2 = iixr.FieldIndexWriter(f2)
+wd = iixr.FieldDictionaryWriter(w, w2, 3)
+for docnum, fields in doc_fields:
+    wd.write_fields(docnum, fields)
+wd.close()
+
+f = open("testF", "rb")
+r = iixr.FieldReader(f)
+f2 = open("testFI", "rb")
+r2 = iixr.FieldIndexReader(f2)
+rd = iixr.FieldDictionaryReader(r, r2)
+doc_fields_reversed = doc_fields[:]
+doc_fields_reversed.reverse()
+for docnum, fields in doc_fields_reversed:
+    df = rd.read_fields(docnum)
+    print fields == df, fields, df
+for docnum in (13579, 246810):
+    df = rd.read_fields(docnum)
+    print df is None, df
+rd.close()
+
+# Test terms.
+
 terms = [
     ("aardvark",  100000123),
     ("anteater",  100000456),
@@ -111,6 +167,8 @@
     print offset == o, offset, o
 r.close()
 
+# Test terms in index files.
+
 indexed_terms = [
     ("aardvark",  100000123, 200000321),
     ("anteater",  100000456, 200000654),
@@ -135,6 +193,8 @@
     print info_offset == i, info_offset, i
 r.close()
 
+# Test dictionaries with only term data.
+
 f = open("test", "wb")
 w = iixr.TermWriter(f)
 f2 = open("testI", "wb")
@@ -143,7 +203,7 @@
 w3 = iixr.PositionWriter(f3)
 wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
 for term, offset in terms:
-    wd.write_term(term, offset)
+    wd._write_term(term, offset)
 wd.close()
 
 f = open("test", "rb")
@@ -156,13 +216,15 @@
 terms_reversed = terms[:]
 terms_reversed.reverse()
 for term, offset in terms_reversed:
-    o = rd.find_term(term)
+    o = rd._find_term(term)
     print offset == o, offset, o
 for term in ("dog", "dingo"):
-    o = rd.find_term(term)
+    o = rd._find_term(term)
     print o is None, o
 rd.close()
 
+# Test dictionaries with term and position data.
+
 terms_with_positions = [
     ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),
     ("anteater",  [(1, [43, 44])]),
@@ -200,6 +262,8 @@
     print dp is None, dp
 rd.close()
 
+# Test high-level index operations.
+
 docs = [
     (1, "The cat sat on the mat"),
     (2, "Every good boy deserves football"),