# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1251233055 -7200
# Node ID 67fdbd4c2a90064d0f6a8f939957ee29589c0547
# Parent  e014559eec92a39c0d958647a3237ebb57cfdcee
Added a term information reader and writer.

diff -r e014559eec92 -r 67fdbd4c2a90 iixr.py
--- a/iixr.py	Tue Aug 25 22:10:37 2009 +0200
+++ b/iixr.py	Tue Aug 25 22:44:15 2009 +0200
@@ -18,6 +18,8 @@
 with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 
+from os.path import commonprefix # to find common string prefixes
+
 # Foundation classes.
 
 class File:
@@ -73,6 +75,27 @@
         record = "".join(bytes)
         self.f.write(record)
 
+    def write_unsigned_byte(self, number):
+
+        "Write 'number', which must be in the range 0 to 255, as one byte."
+
+        if 0 <= number <= 255:
+            self.f.write(chr(number))
+        else:
+            raise ValueError("Number %r is out of range." % number)
+
+    def write_string(self, s):
+
+        "Write 's' to the file, preceded by a single byte giving its length."
+
+        size = len(s)
+
+        if size <= 255:
+            self.write_unsigned_byte(size)
+            self.f.write(s)
+        else:
+            raise ValueError("String %r is too long." % s)
+
 class FileReader(File):
 
     "Reading basic data types from files."
@@ -99,6 +122,19 @@
 
         return number
 
+    def read_unsigned_byte(self):
+
+        "Read one byte from the file and return its value as a number."
+        byte = self.f.read(1)
+        return ord(byte)
+
+    def read_string(self):
+
+        "Read a string stored as a length byte followed by its characters."
+
+        # The leading byte gives the number of characters to read.
+        return self.f.read(self.read_unsigned_byte())
+
 # Specific classes.
 
 class PositionWriter(FileWriter):
@@ -215,4 +251,66 @@
 
         return doc_positions
 
+class TermWriter(FileWriter):
+
+    "Writing term information to files."
+
+    def reset(self):
+        # Begin with no previous term and a previous offset of zero.
+        self.last_term, self.last_offset = "", 0
+
+    def write_term(self, term, offset):
+
+        """
+        Write the given 'term' and its position file 'offset' to the term
+        information file, encoding both relative to the previous entry.
+        """
+
+        # Terms of more than 255 characters are rejected.
+
+        if len(term) > 255:
+            raise ValueError("Term %r is too long." % term)
+
+        # Write the length of the prefix shared with the previous term,
+        # followed by the remainder of the term.
+
+        shared = len(commonprefix([self.last_term, term]))
+        rest = term[shared:]
+        self.write_unsigned_byte(shared)
+        self.write_string(rest)
+
+        # Write the offset as a delta from the previous offset.
+
+        self.write_number(offset - self.last_offset)
+        # Remember this entry so that the next one can be encoded against it.
+        self.last_term = term
+        self.last_offset = offset
+
+class TermReader(FileReader):
+
+    "Reading term information from files."
+
+    def reset(self):
+        # Begin with no previous term and a previous offset of zero.
+        self.last_term, self.last_offset = "", 0
+
+    def read_term(self):
+
+        """
+        Read the next term and its position file offset from the term
+        information file, returning them as a (term, offset) tuple.
+        """
+
+        # Read how much of the previous term is kept, then the new ending.
+        shared = self.read_unsigned_byte()
+        ending = self.read_string()
+
+        # Rebuild the full term from the retained prefix and the new ending.
+        term = self.last_term = self.last_term[:shared] + ending
+
+        # The stored number is a delta to be added to the previous offset.
+        offset = self.last_offset = self.last_offset + self.read_number()
+
+        return term, offset
+
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r e014559eec92 -r 67fdbd4c2a90 test.py
--- a/test.py	Tue Aug 25 22:10:37 2009 +0200
+++ b/test.py	Tue Aug 25 22:44:15 2009 +0200
@@ -64,4 +64,27 @@
     print doc_positions == dp, doc_positions, dp
 r.close()
 
+terms = [ # (term, position file offset) pairs
+    ("aardvark",  100000123),
+    ("anteater",  100000456),
+    ("badger",    100000789),
+    ("bull",     1000001234),
+    ("bulldog",  1000002345),
+    ("cat",      1000003456)
+    ]
+
+f = open("test", "wb")
+w = iixr.TermWriter(f)
+for term, offset in terms:
+    w.write_term(term, offset)
+w.close()
+
+f = open("test", "rb") # binary mode, matching the mode used for writing
+r = iixr.TermReader(f)
+for term, offset in terms:
+    t, o = r.read_term()
+    print term == t, term, t
+    print offset == o, offset, o
+r.close()
+
 # vim: tabstop=4 expandtab shiftwidth=4