# HG changeset patch
# User Paul Boddie
# Date 1297561795 -3600
# Node ID 6542c54d115b72f837b31d24a08ba25c3bdd21f2
# Parent  b75bd39cf61f4fdec7d3b85b66f50f443e194adb
Removed numerous classes, simplifying the package and focusing on combined
term and position files which can be merged using fewer processing operations.

diff -r b75bd39cf61f -r 6542c54d115b iixr/data.py
--- a/iixr/data.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/iixr/data.py	Sun Feb 13 02:49:55 2011 +0100
@@ -23,7 +23,7 @@
 
 # High-level representations.
 
-def convert_sequence(values, op):
+def convert_sequence(values, op, last_from_old):
     if values:
         new_values = list(values)
         last = new_values[0]
@@ -31,10 +31,22 @@
         length = len(new_values)
         while i < length:
             current = new_values[i]
-            new_values[i] = op(new_values[i], last)
-            last = current
+            new_values[i] = op(current, last)
+
+            # Subtracting entries requires the old value to be used.
+            # Adding entries requires the new value.
+
+            if last_from_old:
+                last = current
+            else:
+                last = new_values[i]
+
             i += 1
 
+        return new_values
+    else:
+        return values
+
 def op_seq_monotonic(x, y, op):
     return tuple([op(a, b) for a, b in zip(x, y)])
 
@@ -44,15 +56,6 @@
 def sub_seq_monotonic(x, y):
     return op_seq_monotonic(x, y, operator.sub)
 
-def op_first_monotonic(x, y, op):
-    return (op(x[0], y[0]),) + tuple(zip(x[1:], y[1:]))
-
-def add_first_monotonic(x, y):
-    return op_first_monotonic(x, y, operator.add)
-
-def sub_first_monotonic(x, y):
-    return op_first_monotonic(x, y, operator.sub)
-
 def add_seq(x, y):
     length = min(len(x), len(y))
     seq = list(x)[:length]
@@ -84,17 +87,17 @@
 def sizeof(value):
     return is_sequence(value) and len(value) or 0
 
-def get_monotonic_adder(value):
-    return is_sequence(value) and add_seq_monotonic or operator.add
+def get_monotonic_adder(size):
+    return size and add_seq_monotonic or operator.add
 
-def get_monotonic_subtractor(value):
-    return is_sequence(value) and sub_seq_monotonic or operator.sub
+def get_monotonic_subtractor(size):
+    return size and sub_seq_monotonic or operator.sub
 
-def get_adder(value):
-    return is_sequence(value) and add_seq or operator.add
+def get_adder(size):
+    return size and add_seq or operator.add
 
-def get_subtractor(value):
-    return is_sequence(value) and sub_seq or operator.sub
+def get_subtractor(size):
+    return size and sub_seq or operator.sub
 
 # Low-level representations.
 # Variable-length integer functions.
@@ -177,15 +180,6 @@
             break
     return number, start
 
-# String serialisation.
-
-def string_to_array(s, bytes):
-
-    "Write the given string 's' to 'bytes'."
-
-    vint_to_array(len(s), bytes)
-    bytes.fromstring(s.encode("utf-8"))
-
# Sequence serialisation.
 
 def sequence_to_array(value, size, bytes):
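A note on the reworked convert_sequence above: it now returns the converted
list and distinguishes where the running value comes from. When subtracting
(delta encoding), each delta must be computed against the old input value;
when adding (decoding), against the freshly reconstructed one. A minimal
restatement for illustration, with made-up example values:

    import operator

    def convert_sequence(values, op, last_from_old):
        # op combines each entry with its predecessor; the predecessor is
        # taken from the input (encoding) or from the output (decoding).
        if not values:
            return values
        new_values = list(values)
        last = new_values[0]
        for i in range(1, len(new_values)):
            current = new_values[i]
            new_values[i] = op(current, last)
            last = current if last_from_old else new_values[i]
        return new_values

    docnums = [10, 12, 19, 19, 40]
    deltas = convert_sequence(docnums, operator.sub, 1)   # [10, 2, 7, 0, 21]
    assert convert_sequence(deltas, operator.add, 0) == docnums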
diff -r b75bd39cf61f -r 6542c54d115b iixr/fields.py
--- a/iixr/fields.py	Sat Feb 12 01:23:58 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,345 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Specific classes for storing document information.
-
-Copyright (C) 2009, 2010, 2011 Paul Boddie
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; either version 3 of the License, or (at your option) any later
-version.
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program. If not, see <http://www.gnu.org/licenses/>.
-"""
-
-from iixr.data import *
-from iixr.files import *
-from bisect import bisect_right # to find terms in the dictionary index
-
-DOCUMENT_CACHE_LIMIT = 10000
-
-class FieldWriter(FileWriter):
-
-    "Writing field data to files."
-
-    def begin(self, docnum_size):
-        self.write_number(docnum_size)
-        self.end_record()
-        self.docnum_size = docnum_size
-        self.data_start = self.tell()
-
-    def reset(self):
-        self.end_record()
-        self.last_docnum = None
-        self.subtractor = None
-
-    def write_fields(self, docnum, fields):
-
-        """
-        Write for the given 'docnum', a list of 'fields' (integer, string pairs
-        representing field identifiers and values respectively).
-        """
-
-        # Find the size of document number values.
-
-        if self.last_docnum is not None:
-            docnum_seq = self.subtractor(docnum, self.last_docnum)
-        else:
-            self.subtractor = get_subtractor(docnum)
-            docnum_seq = docnum
-
-        # Write the document number.
-
-        self.write_sequence_value(docnum_seq, self.docnum_size)
-
-        # Write the number of fields.
-
-        self.write_number(len(fields))
-
-        # Write the fields themselves.
-
-        for i, field in fields:
-            self.write_number(i)
-            self.write_string(field, 1) # compress
-
-        self.last_docnum = docnum
-
-class FieldReader(FileReader):
-
-    "Reading field data from files."
-
-    def begin(self):
-        self.begin_record()
-        try:
-            self.docnum_size = self.read_number()
-        except EOFError:
-            self.docnum_size = 0 # NOTE: No fields!
-        self.data_start = self.tell()
-
-    def reset(self):
-        self.last_docnum = None
-        self.adder = None
-        self.begin_record()
-
-    def read_fields(self):
-
-        """
-        Read fields from the file, returning a tuple containing the document
-        number and a list of field (identifier, value) pairs.
-        """
-
-        # Read the document number.
-
-        docnum = self.read_sequence_value(self.docnum_size)
-
-        if self.last_docnum is not None:
-            self.last_docnum = self.adder(docnum, self.last_docnum)
-        else:
-            self.adder = get_adder(docnum)
-            self.last_docnum = docnum
-
-        # Read the number of fields.
-
-        nfields = self.read_number()
-
-        # Collect the fields.
-
-        fields = []
-        i = 0
-
-        while i < nfields:
-            identifier = self.read_number()
-            value = self.read_string(1) # decompress
-            fields.append((identifier, value))
-            i += 1
-
-        return self.last_docnum, fields
-
-    def read_document_fields(self, docnum, offset):
-
-        """
-        Read fields for 'docnum' at the given 'offset'. This permits the
-        retrieval of details for the specified document, as well as scanning for
-        later documents.
-        """
-
-        self.seek(offset)
-        bad_docnum, fields = self.read_fields()
-        self.last_docnum = docnum
-        return docnum, fields
-
-class FieldIndexWriter(FieldWriter):
-
-    "Writing field index details to files."
-
-    def reset(self):
-        FieldWriter.reset(self)
-        self.last_offset = 0
-
-    def write_document(self, docnum, offset):
-
-        """
-        Write for the given 'docnum', the 'offset' at which the fields for the
-        document are stored in the fields file.
-        """
-
-        # Find the size of document number values.
-
-        if self.last_docnum is not None:
-            docnum_seq = self.subtractor(docnum, self.last_docnum)
-        else:
-            self.subtractor = get_subtractor(docnum)
-            docnum_seq = docnum
-
-        # Write the document number.
-
-        self.write_sequence_value(docnum_seq, self.docnum_size)
-
-        # Write the offset delta.
-
-        self.write_number(offset - self.last_offset)
-
-        self.last_docnum = docnum
-        self.last_offset = offset
-
-class FieldIndexReader(FieldReader):
-
-    "Reading field index details from files."
-
-    def reset(self):
-        FieldReader.reset(self)
-        self.last_offset = 0
-
-    def read_document(self):
-
-        "Read a document number and field file offset."
-
-        # Read the document number.
-
-        docnum = self.read_sequence_value(self.docnum_size)
-
-        if self.last_docnum is not None:
-            self.last_docnum = self.adder(docnum, self.last_docnum)
-        else:
-            self.adder = get_adder(docnum)
-            self.last_docnum = docnum
-
-        # Read the offset.
-
-        self.last_offset += self.read_number()
-
-        return self.last_docnum, self.last_offset
-
-class FieldDictionaryWriter:
-
-    "Writing field dictionary details."
-
-    def __init__(self, field_writer, field_index_writer, interval):
-        self.field_writer = field_writer
-        self.field_index_writer = field_index_writer
-        self.interval = interval
-        self.entry = 0
-
-    def write_fields(self, docnum, fields):
-
-        "Write details of the given 'docnum' and 'fields'."
-
-        if self.entry == 0:
-            docnum_size = sizeof(docnum)
-            self.field_writer.begin(docnum_size)
-            self.field_index_writer.begin(docnum_size)
-            self.field_index_writer.reset()
-
-        if self.entry % self.interval == 0:
-            self.field_writer.reset()
-            offset = self.field_writer.tell()
-            self.field_writer.write_fields(docnum, fields)
-            self.field_index_writer.write_document(docnum, offset)
-        else:
-            self.field_writer.write_fields(docnum, fields)
-
-        self.entry += 1
-
-    def close(self):
-        self.field_writer.close()
-        self.field_index_writer.close()
-
-class FieldDictionaryReader:
-
-    "Reading field dictionary details."
-
-    def __init__(self, field_reader, field_index_reader):
-        self.field_reader = field_reader
-        self.field_index_reader = field_index_reader
-
-        self.field_reader.reset()
-        self.field_index_reader.reset()
-
-        self.cache = {}
-
-        self.entry = 0
-        self.docs = []
-        try:
-            while 1:
-                self.docs.append(self.field_index_reader.read_document())
-        except EOFError:
-            pass
-
-        # Large numbers for ordering purposes.
-
-        if self.docs:
-            self.max_offset = self.docs[-1][1]
-        else:
-            self.max_offset = None
-
-    # Iterator convenience methods.
-
-    def __iter__(self):
-        self.rewind()
-        return self
-
-    def next(self):
-        try:
-            return self.read_fields()
-        except EOFError:
-            raise StopIteration
-
-    # Sequential access methods.
-
-    def rewind(self):
-        self.field_reader.rewind()
-
-    def read_fields(self):
-
-        "Return the next document number and fields."
-
-        try:
-            return self.field_reader.read_fields()
-        except EOFError:
-            self.entry += 1
-            try:
-                found_docnum, offset = self.docs[self.entry]
-            except IndexError:
-                raise EOFError
-            else:
-                self.field_reader.reset()
-                return self.field_reader.read_fields()
-
-    # Random access methods.
-
-    def get_fields(self, docnum):
-
-        "Read the fields of the document with the given 'docnum'."
-
-        if self.cache.has_key(docnum):
-            return self.cache[docnum]
-
-        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
-
-        # Get the entry position providing the term or one preceding it.
-
-        if i == -1:
-            return None
-
-        found_docnum, offset = self.docs[i]
-
-        # Read from the fields file.
-
-        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
-
-        # Scan for the document, if necessary.
-
-        try:
-            while docnum > found_docnum:
-                found_docnum, fields = self.field_reader.read_fields()
-        except EOFError:
-            pass
-
-        # If the document is found, return the fields.
-
-        if docnum == found_docnum:
-
-            # Store the fields in the cache, removing entries if the limit has
-            # been reached.
-
-            keys = self.cache.keys()
-
-            if len(keys) == DOCUMENT_CACHE_LIMIT:
-                del self.cache[keys[0]]
-
-            self.cache[docnum] = fields
-            return fields
-        else:
-            return None
-
-    def close(self):
-        self.field_reader.close()
-        self.field_index_reader.close()
-
-# vim: tabstop=4 expandtab shiftwidth=4
diff -r b75bd39cf61f -r 6542c54d115b iixr/files.py
--- a/iixr/files.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/iixr/files.py	Sun Feb 13 02:49:55 2011 +0100
@@ -22,10 +22,6 @@
 from array import array
 import zlib
 
-# Constants.
-
-CACHE_SIZE = 100000
-
 # Classes.
 
 class File:
@@ -35,14 +31,21 @@
     def __init__(self, f):
         self.f = f
         self.record = array('B') # record buffer
-        self.cache = array('B')
+        self.data_start = None
+
+    def begin(self):
+
+        """
+        Initialise file-wide parameters. In writers, this method may require
+        parameters to be specified. In readers, the parameters may be read from
+        the file.
+        """
 
-    def reset(self):
-
-        "To be used to reset the state of the reader or writer between records."
-
-        pass
+        self.data_start = 0
 
+    def tell(self):
+        # NOTE: Will not be accurate within the current record.
+        return self.f.tell()
+
     def seek(self, offset):
         self.f.seek(offset)
@@ -60,27 +63,26 @@
 
     "Writing basic data types to files."
 
-    def __init__(self, f):
-        File.__init__(self, f)
-        self.written = 0
-
-    def tell(self):
-        # NOTE: Will not be accurate within the current record.
-        return self.written
-
     def begin_record(self):
         pass
 
     def end_record(self):
         if self.record:
-            length = len(self.record)
-            before = len(self.cache)
-            vint_to_array(length, self.cache)
-            length_size = len(self.cache) - before
-            self.cache += self.record
-            self.written += length_size + length
+            self.f.write(vint(len(self.record)))
+            self.record.tofile(self.f)
             self.record = array('B')
-        self.flush_cache()
+
+    def write_remaining(self, a):
+
+        "Write remaining data from the raw array 'a'."
+
+        self.record += a
+
+    def write_byte(self, b):
+
+        "Write the given byte 'b'."
+
+        self.record.append(b)
 
     def write_number(self, number):
@@ -137,25 +139,17 @@
         self.write_sequence_value(value, size)
 
     def write_delta_sequence(self, values, size):
-        convert_sequence(values, get_subtractor(values[0]))
-        self.write_sequence_values(values, size)
+        self.write_sequence_values(
+            convert_sequence(values, get_subtractor(size), 1),
+            size)
 
     def write_monotonic_sequence(self, values, size):
-        convert_sequence(values, get_monotonic_subtractor(values[0]))
-        self.write_sequence_values(values, size)
-
-    def flush(self, force=0):
-        self.end_record()
-        self.flush_cache(force)
-
-    def flush_cache(self, force=0):
-        if self.f is not None:
-            if force or len(self.cache) > CACHE_SIZE:
-                self.cache.tofile(self.f)
-                self.cache = array('B')
+        self.write_sequence_values(
+            convert_sequence(values, get_monotonic_subtractor(size), 1),
+            size)
 
     def close(self):
-        self.flush(1)
+        self.end_record()
         File.close(self)
 
 class FileReader(File):
 
     "Reading basic data types from files."
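With the write-through cache removed, each record is now framed as a
variable-length integer giving the payload size, followed by the payload
bytes, written directly to the file. The sketch below restates that framing;
it assumes a little-endian, 7-bits-per-byte scheme consistent with the
reader's continuation-bit test (csd & 128). The real vint and
vint_from_array helpers live in iixr.data and are not shown in this patch.

    from io import BytesIO

    def vint(n):
        # Emit 7 bits per byte, setting the high bit on all but the last.
        out = bytearray()
        while n >= 128:
            out.append((n & 127) | 128)
            n >>= 7
        out.append(n)
        return bytes(out)

    def read_record(f):
        # Decode the vint length prefix, then read that many payload bytes.
        n = shift = 0
        while 1:
            byte = f.read(1)
            if not byte:
                raise EOFError
            c = ord(byte)
            n |= (c & 127) << shift
            if c < 128:
                break
            shift += 7
        return f.read(n)

    f = BytesIO()
    payload = b"example record"
    f.write(vint(len(payload)) + payload)
    f.seek(0)
    assert read_record(f) == payload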
@@ -164,58 +158,33 @@
 
     def __init__(self, f):
         File.__init__(self, f)
-        self.record_start = 0
-        self.record_end = 0
-        self.cache_start = 0
         self.begin()
 
-    def begin(self):
-
-        "Initialise file-wide parameters."
-
-        pass
-
     def begin_record(self):
         self.start = 0
+        self.record = array('B')
         try:
             size = self.read_number_from_file()
-            self.record = self.from_cache(size)
+            self.record.fromfile(self.f, size)
         except EOFError:
             pass
 
     def end_record(self):
         pass
 
-    def seek(self, offset):
-        from_cache_start = offset - self.cache_start
-        if 0 <= from_cache_start < len(self.cache):
-            self.record_start = self.record_end = from_cache_start
-        else:
-            self.f.seek(offset)
-            self.cache = array('B')
-            self.cache_start = offset
-            self.record_start = self.record_end = 0
-        self.reset()
+    def read_remaining(self):
 
-    def tell(self):
-        return self.cache_start + self.record_start + self.start
+        "Read remaining data as a raw array."
+
+        return self.record[self.start:]
 
-    def ensure_cache(self, size):
-        if size > len(self.cache) - self.record_end:
-            self.cache = self.cache[self.record_end:]
-            self.cache_start += self.record_end
-            s = self.f.read(CACHE_SIZE)
-            self.cache.fromstring(s)
-            self.record_start = 0
-            if not s:
-                raise EOFError
-        else:
-            self.record_start = self.record_end
-        self.record_end = self.record_start + size
+    def read_byte(self):
+
+        "Read a byte from the record."
 
-    def from_cache(self, size):
-        self.ensure_cache(size)
-        return self.cache[self.record_start:self.record_end]
+        b = self.record[self.start]
+        self.start += 1
+        return b
 
     def read_number_from_file(self):
 
@@ -224,13 +193,13 @@
         # Read each byte, adding it to the number.
 
         a = array('B')
-        a += self.from_cache(1)
+        a.fromfile(self.f, 1)
         csd = a[-1]
         if csd < 128:
             return csd
         else:
             while csd & 128:
-                a += self.from_cache(1)
+                a.fromfile(self.f, 1)
                 csd = a[-1]
             return vint_from_array(a)
 
@@ -292,13 +261,9 @@
         return values
 
     def read_delta_sequence(self, size):
-        values = self.read_sequences(size)
-        convert_sequence(values, get_adder(values[0]))
-        return values
+        return convert_sequence(self.read_sequences(size), get_adder(size), 0)
 
     def read_monotonic_sequence(self, size):
-        values = self.read_sequences(size)
-        convert_sequence(values, get_monotonic_adder(values[0]))
-        return values
+        return convert_sequence(self.read_sequences(size), get_monotonic_adder(size), 0)
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r b75bd39cf61f -r 6542c54d115b iixr/filesystem.py
--- a/iixr/filesystem.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/iixr/filesystem.py	Sun Feb 13 02:49:55 2011 +0100
@@ -3,7 +3,7 @@
 """
 File access.
 
-Copyright (C) 2009, 2010 Paul Boddie
+Copyright (C) 2009, 2010, 2011 Paul Boddie
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -18,9 +18,7 @@
 with this program. If not, see <http://www.gnu.org/licenses/>.
 """
 
-from iixr.fields import *
 from iixr.terms import *
-from iixr.positions import *
 from os import listdir, remove, rename # partition manipulation
 from shutil import copy # index updating
 from os.path import join
@@ -32,8 +30,7 @@
 
 # Constants.
 
-TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
-FIELD_FILENAMES = "fields", "fields_index"
+TERM_FILENAMES = "terms",
 
 # Utility functions.
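The partition helpers changed below now treat partition labels as integers
and ignore non-numeric suffixes (such as the transient "merged" label), which
makes allocating the next partition a simple maximum. Restated here for
illustration:

    from os import listdir

    def get_partitions(pathname, prefix):
        # Collect integer labels from files named "<prefix><number>".
        partitions = set()
        for filename in listdir(pathname):
            if filename.startswith(prefix):
                label = filename[len(prefix):]
                if label.isdigit():
                    partitions.add(int(label))
        return partitions

    def get_next_partition(partitions):
        # With terms-0 and terms-1 present, the next partition is 2;
        # [-1] handles the empty case, giving 0.
        return max(partitions or [-1]) + 1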
@@ -49,7 +46,9 @@
     partitions = set()
     for filename in listdir(pathname):
         if filename.startswith(prefix):
-            partitions.add(filename[prefix_length:])
+            partition = filename[prefix_length:]
+            if partition.isdigit():
+                partitions.add(int(partition))
     return partitions
 
 def get_term_partitions(pathname):
 
@@ -61,95 +60,40 @@
 
     return get_partitions(pathname, "terms-")
 
-def get_field_partitions(pathname):
+def get_next_partition(partitions):
+    return max(partitions or [-1]) + 1
+
+def get_term_writer(pathname, partition):
 
     """
-    Return a set of field partition identifiers for partitions residing at the
-    given 'pathname'.
-    """
-
-    return get_partitions(pathname, "fields-")
-
-def get_next_partition(partitions):
-    return max([int(n) for n in partitions if n.isdigit()] or [-1]) + 1
-
-def get_term_writer(pathname, partition, interval, doc_interval):
-
-    """
-    Return a term dictionary writer using files under the given 'pathname'
-    labelled according to the given 'partition', using the given indexing
-    'interval' for terms and 'doc_interval' for document position records.
+    Return a term writer using files under the given 'pathname' labelled
+    according to the given 'partition'.
     """
 
-    tdf = open(join(pathname, "terms-%s" % partition), "wb")
-    info_writer = TermWriter(tdf)
-
-    tdif = open(join(pathname, "terms_index-%s" % partition), "wb")
-    index_writer = TermIndexWriter(tdif)
-
-    tpf = open(join(pathname, "positions-%s" % partition), "wb")
-    positions_writer = PositionWriter(tpf)
-
-    tpif = open(join(pathname, "positions_index-%s" % partition), "wb")
-    positions_index_writer = PositionIndexWriter(tpif)
-
-    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)
-
-    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
+    f = open(join(pathname, "terms-%s" % partition), "wb")
+    return TermWriter(f)
 
-def get_field_writer(pathname, partition, interval):
-
-    """
-    Return a field dictionary writer using files under the given 'pathname'
-    labelled according to the given 'partition', using the given indexing
-    'interval'.
-    """
-
-    ff = open(join(pathname, "fields-%s" % partition), "wb")
-    field_writer = FieldWriter(ff)
-
-    fif = open(join(pathname, "fields_index-%s" % partition), "wb")
-    field_index_writer = FieldIndexWriter(fif)
-
-    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
+def get_reader(pathname, name, partition, cls):
+    f = open(join(pathname, "%s-%s" % (name, partition)), "rb")
+    return cls(f)
 
 def get_term_reader(pathname, partition):
 
     """
-    Return a term dictionary reader using files under the given 'pathname'
+    Return a term reader using files under the given 'pathname' labelled
+    according to the given 'partition'.
+    """
+
+    return get_reader(pathname, "terms", partition, TermIterator)
+
+def get_term_data_reader(pathname, partition):
+
+    """
+    Return a term plus data reader using files under the given 'pathname'
     labelled according to the given 'partition'.
     """
 
-    tdf = open(join(pathname, "terms-%s" % partition), "rb")
-    info_reader = TermReader(tdf)
-
-    tdif = open(join(pathname, "terms_index-%s" % partition), "rb")
-    index_reader = TermIndexReader(tdif)
-
-    pf = open(join(pathname, "positions-%s" % partition), "rb")
-    position_reader = PositionReader(pf)
-
-    pif = open(join(pathname, "positions_index-%s" % partition), "rb")
-    position_index_reader = PositionIndexReader(pif)
-
-    position_dict_reader = PositionDictionaryReader(position_reader, position_index_reader)
-
-    return TermDictionaryReader(info_reader, index_reader, position_dict_reader)
-
-def get_field_reader(pathname, partition):
-
-    """
-    Return a field dictionary reader using files under the given 'pathname'
-    labelled according to the given 'partition'.
-    """
-
-    ff = open(join(pathname, "fields-%s" % partition), "rb")
-    field_reader = FieldReader(ff)
-
-    fif = open(join(pathname, "fields_index-%s" % partition), "rb")
-    field_index_reader = FieldIndexReader(fif)
-
-    return FieldDictionaryReader(field_reader, field_index_reader)
+    return get_reader(pathname, "terms", partition, TermDataIterator)
 
 # Renaming.
 
@@ -160,9 +104,6 @@
 def rename_term_files(pathname, from_partition, to_partition):
     rename_files(pathname, TERM_FILENAMES, from_partition, to_partition)
 
-def rename_field_files(pathname, from_partition, to_partition):
-    rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
-
 # Removal/deletion.
 
 def remove_files(pathname, names, partition):
@@ -172,9 +113,6 @@
 def remove_term_files(pathname, partition):
     remove_files(pathname, TERM_FILENAMES, partition)
 
-def remove_field_files(pathname, partition):
-    remove_files(pathname, FIELD_FILENAMES, partition)
-
 # Copying.
 
 def copy_files(source, names, partition, destination, suffix):
@@ -185,7 +123,4 @@
 def copy_term_files(source, partition, destination, suffix):
     copy_files(source, TERM_FILENAMES, partition, destination, suffix)
 
-def copy_field_files(source, partition, destination, suffix):
-    copy_files(source, FIELD_FILENAMES, partition, destination, suffix)
-
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r b75bd39cf61f -r 6542c54d115b iixr/index.py
--- a/iixr/index.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/iixr/index.py	Sun Feb 13 02:49:55 2011 +0100
@@ -19,18 +19,14 @@
 """
 
 from iixr.filesystem import *
-from iixr.merging import *
-from itertools import islice
+from itermerge import itermerge
 from os import mkdir # index discovery
 from os.path import exists
+import operator
 
 # Constants.
 
-TERM_INTERVAL = 100
-DOCUMENT_INTERVAL = 100
-FIELD_INTERVAL = 100
 FLUSH_INTERVAL = 10000
-POSITIONS_FLUSH_INTERVAL = 1000000
 OPEN_PARTITIONS = 20
 
 # High-level classes.
@@ -39,11 +35,9 @@
 
     "A container of document information."
 
-    def __init__(self, docnum, fields=None):
+    def __init__(self, docnum):
         self.docnum = docnum
-        self.fields = fields or []
         self.terms = {}
-        self.field_dict = None
 
     def add_position(self, term, position):
 
@@ -54,55 +48,18 @@
 
         self.terms.setdefault(term, []).append(position)
 
-    def add_field(self, identifier, value):
-
-        "Add a field having the given 'identifier' and 'value'."
-
-        self.fields.append((identifier, unicode(value))) # convert to string
-
-    def set_fields(self, fields):
-
-        """
-        Set the document's 'fields': a list of tuples each containing an integer
-        identifier and a string value.
-        """
-
-        self.fields = fields
-
-    def _ensure_dict(self):
-        if self.field_dict is None:
-            self.field_dict = dict(self.fields)
-
-    def keys(self):
-        self._ensure_dict()
-        return self.field_dict.keys()
-
-    def __getitem__(self, key):
-        self._ensure_dict()
-        return self.field_dict[key]
-
 class IndexWriter:
 
-    """
-    Building term information and writing it to the term and field dictionaries.
-    """
+    "Building term information and writing it to the term dictionary."
 
-    def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval, positions_flush_interval):
+    def __init__(self, pathname, flush_interval):
         self.pathname = pathname
-        self.interval = interval
-        self.doc_interval = doc_interval
-        self.field_interval = field_interval
         self.flush_interval = flush_interval
-        self.positions_flush_interval = positions_flush_interval
 
-        self.dict_partition = get_next_partition(get_term_partitions(self.pathname))
-        self.field_dict_partition = get_next_partition(get_field_partitions(self.pathname))
+        self.term_partition = get_next_partition(get_term_partitions(self.pathname))
 
         self.terms = {}
-        self.docs = []
-
         self.doc_counter = 0
-        self.position_counter = 0
 
     def add_document(self, doc):
 
@@ -115,134 +72,105 @@
 
         for term, positions in doc.terms.items():
             self.terms.setdefault(term, {})[docnum] = positions
-            self.position_counter += len(positions)
-
-        self.docs.append((docnum, doc.fields))
 
         self.doc_counter += 1
 
-        if self.flush_interval and self.doc_counter >= self.flush_interval or \
-            self.positions_flush_interval and self.position_counter >= self.positions_flush_interval:
-
+        if self.flush_interval and self.doc_counter >= self.flush_interval:
             self.flush_terms()
-            self.flush_fields()
             self.doc_counter = 0
-            self.position_counter = 0
 
     def get_term_writer(self):
 
-        "Return a term dictionary writer for the current partition."
-
-        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)
+        "Return a term writer for the current partition."
 
-    def get_field_writer(self):
-
-        "Return a field dictionary writer for the current partition."
-
-        return get_field_writer(self.pathname, self.field_dict_partition, self.field_interval)
+        return get_term_writer(self.pathname, self.term_partition)
 
     def flush_terms(self):
 
-        "Flush terms into the current term dictionary partition."
+        "Flush terms into the current term partition."
 
         # Get the terms in order.
 
-        all_terms = self.terms
-        terms = all_terms.keys()
-        terms.sort()
-
-        dict_writer = self.get_term_writer()
-
-        for term in terms:
-            doc_positions = all_terms[term].items()
-            dict_writer.write_term_positions(term, doc_positions)
-
-        dict_writer.close()
+        term_writer = self.get_term_writer()
+        try:
+            term_writer.write_terms(self.terms)
+        finally:
+            term_writer.close()
 
         self.terms = {}
-        self.dict_partition += 1
-
-    def flush_fields(self):
-
-        "Flush fields into the current term dictionary partition."
-
-        # Get the documents in order.
-
-        self.docs.sort()
-
-        field_dict_writer = self.get_field_writer()
-        for docnum, fields in self.docs:
-            field_dict_writer.write_fields(docnum, fields)
-        field_dict_writer.close()
-
-        self.docs = []
-        self.field_dict_partition += 1
+        self.term_partition += 1
 
     def close(self):
         if self.terms or not get_term_partitions(self.pathname):
            self.flush_terms()
-        if self.docs or not get_field_partitions(self.pathname):
-            self.flush_fields()
+
+class IndexReader(itermerge):
+
+    "Accessing the term dictionaries."
-class IndexReader:
+    def __init__(self, pathname, get_reader=None, combine=None):
 
-    "Accessing the term and field dictionaries."
+        # Get the partitions in order.
+
+        partitions = list(get_term_partitions(pathname))
+        partitions.sort()
 
-    def __init__(self, pathname):
-        self.dict_reader = get_term_reader(pathname, "merged")
-        self.field_dict_reader = get_field_reader(pathname, "merged")
+        # Initialise the underlying term partition readers.
 
-    # Sequential access.
+        self.readers = [(get_reader or get_term_reader)(pathname, partition) for partition in partitions]
+        self.combine = combine or operator.add
+
+        # Initialise this object as an iterator over the readers.
 
-    def read_term(self):
-        return self.dict_reader.read_term()
+        itermerge.__init__(self, self.readers)
+        self.next_value = None
 
-    def go_to_term(self, term):
-        return self.dict_reader._get_term_and_positions(*self.dict_reader.go_to_term(term))
+    def get_sizes(self):
 
-    # Query access.
+        # Readers must have compatible sizes.
 
-    def get_terms(self):
-        return self.dict_reader.get_terms()
-
-    def find_terms(self, term):
-        return self.dict_reader.find_terms(term)
+        if self.readers:
+            return self.readers[0].get_sizes()
+        else:
+            return 0, 0
 
-    def find_positions(self, term):
-        return self.dict_reader.find_positions(term)
+    def next(self):
+        if self.next_value is not None:
+            term, positions = self.next_value
+        else:
+            term, positions = itermerge.next(self)
 
-    def find_common_positions(self, terms):
-        return self.dict_reader.find_common_positions(terms)
+        # Look at the next item to see if it has positions for the current
+        # term.
 
-    def get_frequency(self, term):
-        return self.dict_reader.get_frequency(term)
-
-    def get_document_frequency(self, term):
-        return self.dict_reader.get_document_frequency(term)
+        try:
+            t, p = itermerge.next(self)
+            while t == term:
+                positions = self.combine(positions, p)
+                t, p = itermerge.next(self)
+            self.next_value = t, p
 
-    def get_fields(self, docnum):
-        return self.field_dict_reader.get_fields(docnum)
+        # Where an item could not be fetched, cause future requests to fail.
 
-    def get_document(self, docnum):
-        return Document(docnum, self.get_fields(docnum))
+        except StopIteration:
+            self.next_value = None
+
+        return term, positions
 
     def close(self):
-        self.dict_reader.close()
-        self.field_dict_reader.close()
+        for reader in self.readers:
+            reader.close()
+        self.readers = []
 
 class Index:
 
     "An inverted index solution encapsulating the various components."
 
-    def __init__(self, pathname, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, field_interval=FIELD_INTERVAL,
-        flush_interval=FLUSH_INTERVAL, positions_flush_interval=POSITIONS_FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS):
+    def __init__(self, pathname, flush_interval=FLUSH_INTERVAL,
+        open_partitions=OPEN_PARTITIONS):
 
         self.pathname = pathname
-        self.interval = interval
-        self.doc_interval = doc_interval
-        self.field_interval = field_interval
         self.flush_interval = flush_interval
-        self.positions_flush_interval = positions_flush_interval
         self.open_partitions = open_partitions
         self.reader = None
         self.writer = None
@@ -251,132 +179,60 @@
 
         "Return a writer."
 
-        self._ensure_directory()
-        self.writer = IndexWriter(self.pathname, self.interval, self.doc_interval,
-            self.field_interval, self.flush_interval, self.positions_flush_interval)
+        if self.writer is None:
+            self._ensure_directory()
+            self.writer = IndexWriter(self.pathname, self.flush_interval)
         return self.writer
 
     def _ensure_directory(self):
         if not exists(self.pathname):
             mkdir(self.pathname)
 
-    def get_reader(self, partition=0):
-
-        "Return a reader for the index."
-
-        # Ensure that only one partition exists.
-
-        self.merge()
-        return self._get_reader(partition)
-
-    def _get_reader(self, partition):
+    def get_reader(self, refresh=0):
 
         "Return a reader for the index."
 
-        if not exists(self.pathname):
-            raise OSError, "Index path %r does not exist." % self.pathname
-
-        self.reader = IndexReader(self.pathname)
-        return self.reader
-
-    def get_term_partitions(self):
+        if refresh and self.reader is not None:
+            self.reader.close()
+            self.reader = None
 
-        "Return a set of term partition identifiers."
-
-        return get_term_partitions(self.pathname)
-
-    def get_field_partitions(self):
-
-        "Return a set of field partition identifiers."
-
-        return get_field_partitions(self.pathname)
+        if self.reader is None:
+            if not exists(self.pathname):
+                raise OSError, "Index path %r does not exist." % self.pathname
+            self.reader = IndexReader(self.pathname)
+        return self.reader
 
     def merge(self):
 
-        "Merge/optimise index partitions."
-
-        self._merge_terms()
-        self._merge_fields()
-
-    def _merge_dictionaries(self, get_partitions, rename_files, remove_files, get_reader, get_writer, get_merger, intervals):
-
-        "Merge term or field dictionaries."
-
-        partitions = get_partitions()
-
-        # Ensure the correct labelling of a single partition.
-
-        if len(partitions) == 1:
-            partition = list(partitions)[0]
-            if partition != "merged":
-                rename_files(self.pathname, partition, "merged")
-            return
+        "Merge the partitions in the index."
 
-        # Merge the partitions.
-
-        old_merged_counter = 0
-
-        while len(partitions) > 1:
-
-            if "merged" in partitions:
-                rename_files(self.pathname, "merged", "old-merged-%d" % old_merged_counter)
-                partitions.remove("merged")
-                partitions.add("old-merged-%d" % old_merged_counter)
-                old_merged_counter += 1
-
-            # Process only a certain number at once, avoiding resource limits.
-
-            active_partitions = list(islice(partitions, self.open_partitions))
-
-            readers = []
-            for partition in active_partitions:
-                readers.append(get_reader(self.pathname, partition))
-
-            # Write directly to a dictionary.
+        reader = IndexReader(self.pathname, get_term_data_reader, self.merge_data)
+        writer = get_term_writer(self.pathname, "merged")
+        try:
+            writer.begin(*reader.get_sizes())
+            for term, data in reader:
+                writer.write_term_plus_remaining(term, data)
+                writer.end_record()
+        finally:
+            writer.close()
+            reader.close()
 
-            writer = get_writer(self.pathname, "merged", *intervals)
-            merger = get_merger(writer, readers)
-            merger.merge()
-            merger.close()
-
-            # Remove old files.
-
-            for partition in active_partitions:
-                remove_files(self.pathname, partition)
+        for partition in get_term_partitions(self.pathname):
+            remove_term_files(self.pathname, partition)
 
-            # Acquire the partitions to check their number again.
-
-            partitions = get_partitions()
-
-    def _merge_terms(self):
+        rename_term_files(self.pathname, "merged", 0)
 
-        "Merge term dictionaries."
-
-        self._merge_dictionaries(self.get_term_partitions, rename_term_files,
-            remove_term_files, get_term_reader, get_term_writer,
-            TermDictionaryMerger, [self.interval, self.doc_interval])
+    def merge_data(self, a, b):
 
-    def _merge_fields(self):
-
-        "Merge field dictionaries."
-
-        self._merge_dictionaries(self.get_field_partitions, rename_field_files,
-            remove_field_files, get_field_reader, get_field_writer,
-            FieldDictionaryMerger, [self.field_interval])
-
-    def update(self, other_indexes):
+        """
+        Merge 'a' and 'b', modifying the data to permit concatenation.
+        """
 
-        "Copy the content of the 'other_indexes' into this index and merge."
-
-        self._ensure_directory()
+        # Modify the record to indicate a continuation of the data.
 
-        for i, index in enumerate(other_indexes):
-            for partition in index.get_term_partitions():
-                copy_term_files(index.pathname, partition, self.pathname, "-added-%d" % i)
-            for partition in index.get_field_partitions():
-                copy_field_files(index.pathname, partition, self.pathname, "-added-%d" % i)
-
-        self.merge()
+        c = a + b
+        c[len(a) - 1] = 1
+        return c
 
     def close(self):
         if self.reader is not None:
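Both the new IndexReader and merge lean on itermerge, an external module not
shown in this patch, which yields (term, data) pairs in sorted order across
all partition readers; runs of equal terms are then combined pairwise. A
standalone sketch of the same idea, with heapq.merge standing in for
itermerge:

    from heapq import merge
    from itertools import groupby

    def merge_partitions(iterators, combine):
        # Each iterator yields (term, data) pairs in ascending term order.
        # Equal terms from different partitions are combined pairwise.
        for term, group in groupby(merge(*iterators), lambda pair: pair[0]):
            data = None
            for _, d in group:
                data = d if data is None else combine(data, d)
            yield term, data

    a = iter([("cat", [1]), ("dog", [2])])
    b = iter([("dog", [3]), ("emu", [4])])
    assert list(merge_partitions([a, b], lambda x, y: x + y)) == \
        [("cat", [1]), ("dog", [2, 3]), ("emu", [4])]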
diff -r b75bd39cf61f -r 6542c54d115b iixr/merging.py
--- a/iixr/merging.py	Sat Feb 12 01:23:58 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,89 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Dictionary merging classes.
-
-Copyright (C) 2009, 2010 Paul Boddie
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; either version 3 of the License, or (at your option) any later
-version.
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program. If not, see <http://www.gnu.org/licenses/>.
-"""
-
-from itermerge import itermerge
-
-class Merger:
-
-    "Merge files."
-
-    def __init__(self, writer, readers):
-        self.writer = writer
-        self.readers = readers
-
-    def close(self):
-        for reader in self.readers:
-            reader.close()
-        self.readers = []
-        if self.writer is not None:
-            self.writer.close()
-            self.writer = None
-
-class TermDictionaryMerger(Merger):
-
-    "Merge term and position files."
-
-    def merge(self):
-
-        """
-        Merge terms and positions from the readers, sending them to the writer.
-        """
-
-        last_term = None
-        current_readers = []
-
-        for term, frequency, doc_frequency, positions in itermerge(self.readers):
-            if term == last_term:
-                current_readers.append(positions)
-            else:
-                if current_readers:
-                    self.writer.write_term_positions(last_term, itermerge(current_readers))
-                last_term = term
-                current_readers = [positions]
-        else:
-            if current_readers:
-                self.writer.write_term_positions(last_term, itermerge(current_readers))
-
-class FieldDictionaryMerger(Merger):
-
-    "Merge field files."
-
-    def merge(self):
-
-        """
-        Merge fields from the readers, sending them to the writer.
-        """
-
-        last_docnum = None
-        current_fields = []
-
-        for docnum, fields in itermerge(self.readers):
-            if docnum == last_docnum:
-                current_fields += fields
-            else:
-                if current_fields:
-                    self.writer.write_fields(last_docnum, current_fields)
-                last_docnum = docnum
-                current_fields = fields
-        else:
-            if current_fields:
-                self.writer.write_fields(last_docnum, current_fields)
-
-# vim: tabstop=4 expandtab shiftwidth=4
diff -r b75bd39cf61f -r 6542c54d115b iixr/positions.py
--- a/iixr/positions.py	Sat Feb 12 01:23:58 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,566 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Specific classes for storing position information.
-
-Copyright (C) 2009, 2010, 2011 Paul Boddie
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; either version 3 of the License, or (at your option) any later
-version.
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program. If not, see <http://www.gnu.org/licenses/>.
-"""
-
-from iixr.data import *
-from iixr.files import *
-
-class PositionWriter(FileWriter):
-
-    "Writing position information to files."
-
-    def begin(self, docnum_size, position_size):
-        self.write_numbers((docnum_size, position_size))
-        self.end_record()
-        self.data_start = self.tell()
-        self.docnum_size = docnum_size
-        self.position_size = position_size
-
-    def reset(self):
-        self.end_record()
-        self.last_docnum = None
-        self.subtractor = None
-
-    def write_positions(self, docnum, positions):
-
-        """
-        Write for the document 'docnum' the given 'positions'.
-        """
-
-        if not positions:
-            return
-
-        # Make sure that the positions are sorted.
-
-        positions.sort()
-
-        # Calculate an ongoing delta.
-
-        if self.last_docnum is not None:
-            if docnum < self.last_docnum:
-                raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
-
-            docnum_seq = self.subtractor(docnum, self.last_docnum)
-
-        # Or preserve the document number and prepare for future deltas.
-
-        else:
-            self.subtractor = get_subtractor(docnum)
-            docnum_seq = docnum
-
-        self.write_sequence_value(docnum_seq, self.docnum_size)
-        self.write_monotonic_sequence(positions, self.position_size)
-
-        self.last_docnum = docnum
-
-class PositionReader(FileReader):
-
-    "Reading position information within term-specific regions of a file."
-
-    def begin(self):
-        self.begin_record()
-        try:
-            self.docnum_size, self.position_size = self.read_numbers(2)
-        except EOFError:
-            self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
-        self.data_start = self.tell()
-
-    def reset(self):
-        self.last_docnum = None
-        self.adder = None
-        self.begin_record()
-
-    def read_positions(self):
-
-        """
-        Read positions, returning a document number and a list of positions.
-        """
-
-        # Read the document number.
-
-        docnum = self.read_sequence_value(self.docnum_size)
-
-        # Calculate an ongoing delta.
-
-        if self.last_docnum is not None:
-            self.last_docnum = self.adder(docnum, self.last_docnum)
-
-        # Or preserve the document number and prepare for future deltas.
-
-        else:
-            self.adder = get_adder(docnum)
-            self.last_docnum = docnum
-
-        positions = self.read_monotonic_sequence(self.position_size)
-
-        return self.last_docnum, positions
-
-class PositionIndexWriter(PositionWriter):
-
-    "Writing position index information to files."
-
-    def begin(self, docnum_size):
-        PositionWriter.begin(self, docnum_size, 0)
-
-    def reset(self):
-        PositionWriter.reset(self)
-        self.last_pos_offset = 0
-
-    def write_positions(self, docnum, pos_offset, count):
-
-        """
-        Write the given 'docnum, 'pos_offset' and document 'count' to the
-        position index file.
-        """
-
-        # Find the size of document number values.
-
-        if self.last_docnum is not None:
-            docnum_seq = self.subtractor(docnum, self.last_docnum)
-        else:
-            self.subtractor = get_subtractor(docnum)
-            docnum_seq = docnum
-
-        self.write_sequence_value(docnum_seq, self.docnum_size)
-        self.write_number(pos_offset - self.last_pos_offset)
-        self.write_number(count)
-
-        self.last_docnum = docnum
-        self.last_pos_offset = pos_offset
-
-class PositionIndexReader(PositionReader):
-
-    "Reading position index information within term-specific regions of a file."
-
-    def reset(self):
-        PositionReader.reset(self)
-        self.last_pos_offset = 0
-
-    def read_positions(self):
-
-        """
-        Read a document number, a position file offset for the position index
-        file, and the number of documents in a section of that file.
-        """
-
-        # Read the document number.
-
-        docnum = self.read_sequence_value(self.docnum_size)
-
-        if self.last_docnum is not None:
-            self.last_docnum = self.adder(docnum, self.last_docnum)
-        else:
-            self.adder = get_adder(docnum)
-            self.last_docnum = docnum
-
-        # Read the offset delta.
-
-        self.last_pos_offset += self.read_number()
-
-        # Read the document count.
-
-        count = self.read_number()
-
-        return self.last_docnum, self.last_pos_offset, count
-
-# Iterators for position-related files.
-
-class IteratorBase:
-
-    "Support for iterating over results."
-
-    def __init__(self, reader):
-
-        "Initialise the iterator using the given 'reader'."
-
-        self.reader = reader
-        self.replenish(0) # no iteration initially permitted
-
-    def replenish(self, count):
-
-        "Replenish the iterator with 'count' results."
-
-        self.count = count
-        self.read_documents = 0
-
-    def __len__(self):
-
-        "Return the total number of results."
-
-        return self.count
-
-    def sort(self):
-        pass # Stored document positions are already sorted.
-
-    def __iter__(self):
-        return self
-
-class PositionIterator(IteratorBase):
-
-    "Iterating over document positions."
-
-    def replenish(self, count):
-        IteratorBase.replenish(self, count)
-
-        # Fill a cache of positions.
-
-        self.cache = []
-        n = 0
-
-        while n < self.count:
-            self.cache.append(self.reader.read_positions())
-            n += 1
-
-    def seek(self, offset, count):
-
-        """
-        Seek to 'offset' in the file, limiting the number of documents available
-        for reading to 'count'.
-        """
-
-        self.reader.seek(offset)
-        self.replenish(count)
-
-    def next(self):
-
-        "Read positions for a single document."
-
-        if self.read_documents < self.count:
-            positions = self.cache[self.read_documents]
-            self.read_documents += 1
-            return positions
-        else:
-            raise StopIteration
-
-class PositionIndexIterator(IteratorBase):
-
-    "Iterating over document positions."
-
-    def replenish(self, count):
-        IteratorBase.replenish(self, count)
-
-        # Fill a cache of offsets.
-
-        self.cache = []
-        self.current = 0
-        n = 0
-
-        while n < self.count:
-            docnum, pos_offset, section_count = t = self.reader.read_positions()
-            self.cache.append(t)
-            n += section_count
-
-    def seek(self, offset, doc_frequency):
-
-        """
-        Seek to 'offset' in the file, limiting the number of documents available
-        for reading to 'doc_frequency'.
-        """
-
-        self.reader.seek(offset)
-        self.replenish(doc_frequency)
-
-    def next(self):
-
-        "Read positions for a single document."
-
-        if self.current < len(self.cache):
-            docnum, pos_offset, self.section_count = t = self.cache[self.current]
-            self.current += 1
-            return t
-        else:
-            raise StopIteration
-
-class PositionDictionaryWriter:
-
-    "Writing position dictionaries."
-
-    def __init__(self, position_writer, position_index_writer, interval):
-        self.position_writer = position_writer
-        self.position_index_writer = position_index_writer
-        self.interval = interval
-
-    def write_term_positions(self, doc_positions):
-
-        """
-        Write all 'doc_positions' - a collection of tuples of the form (document
-        number, position list) - to the file.
-
-        Add some records to the index, making dictionary entries.
-
-        Return a tuple containing the offset of the written data, the frequency
-        (number of positions), and document frequency (number of documents) for
-        the term involved.
-        """
-
-        # Write the positions.
-
-        frequency = 0
-        count = 0
-
-        if doc_positions:
-            doc_positions.sort()
-
-            # Look ahead at the first document record.
-            # NOTE: Any iterator would need to support this.
-
-            first_docnum, first_positions = doc_positions[0]
-            first_position = first_positions[0]
-
-            # Write out size details.
-
-            docnum_size, position_size = sizeof(first_docnum), sizeof(first_position)
-            self.position_writer.begin(docnum_size, position_size)
-            self.position_index_writer.begin(docnum_size)
-
-            # Reset the writers.
-
-            self.position_writer.reset()
-            self.position_index_writer.reset()
-
-            # Remember the first index entry offset.
-
-            index_offset = self.position_index_writer.tell()
-
-            # Retain the first record offset for a subsequent index entry.
-
-            first_offset = self.position_writer.tell()
-
-            for docnum, positions in doc_positions:
-                if first_docnum is None:
-                    first_docnum = docnum
-
-                self.position_writer.write_positions(docnum, positions)
-
-                frequency += len(positions)
-                count += 1
-
-                # Every {interval} entries, write an index entry.
-
-                if count % self.interval == 0:
-
-                    self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
-
-                    # Reset the position writer so that position readers accessing
-                    # a section start with the correct document number.
-
-                    self.position_writer.reset()
-
-                    first_offset = self.position_writer.tell()
-                    first_docnum = None
-
-            # Finish writing an index entry for the remaining documents.
-
-            else:
-                if first_docnum is not None:
-                    self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
-
-        return index_offset, frequency, count
-
-    def close(self):
-        self.position_writer.close()
-        self.position_index_writer.close()
-
-class PositionDictionaryReader:
-
-    "Access to position dictionary entries through iterators."
-
-    def __init__(self, position_reader, position_index_reader):
-        self.position_reader = position_reader
-        self.position_index_reader = position_index_reader
-
-    def read_term_positions(self, offset, doc_frequency):
-        iterator = PositionDictionaryIterator(
-            PositionIterator(self.position_reader),
-            PositionIndexIterator(self.position_index_reader)
-            )
-        iterator.seek(offset, doc_frequency)
-        return iterator
-
-    def close(self):
-        self.position_reader.close()
-        self.position_index_reader.close()
-
-class PositionDictionaryIterator:
-
-    "Iteration over position dictionary entries."
-
-    def __init__(self, position_iterator, position_index_iterator):
-        self.position_iterator = position_iterator
-        self.position_index_iterator = position_index_iterator
-        self.reset()
-
-    def reset(self):
-
-        # Remember the last values.
-
-        self.found_docnum, self.found_positions = None, None
-
-        # Maintain state for the next index entry, if read.
-
-        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
-
-    def seek(self, offset, doc_frequency):
-
-        """
-        Seek to 'offset' in the index file, limiting the number of documents
-        available for reading to 'doc_frequency'.
-        """
-
-        self.reset()
-
-        # Seek to the appropriate index entry.
-
-        self.position_index_iterator.seek(offset, doc_frequency)
-
-        # Initialise the current index entry and current position file iterator.
-
-        self._next_section()
-        self._init_section()
-
-    # Sequence methods.
-
-    def __len__(self):
-        return len(self.position_index_iterator)
-
-    def sort(self):
-        pass
-
-    # Iterator methods.
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-
-        """
-        Attempt to get the next document record from the section in the
-        positions file.
-        """
-
-        # Return any visited but unrequested record.
-
-        if self.found_docnum is not None:
-            t = self.found_docnum, self.found_positions
-            self.found_docnum, self.found_positions = None, None
-            return t
-
-        # Or search for the next record.
-
-        while 1:
-
-            # Either return the next record.
-
-            try:
-                return self.position_iterator.next()
-
-            # Or, where a section is finished, get the next section and try again.
-
-            except StopIteration:
-
-                # Although, where a single iterator is in use, the file reader
-                # would be positioned appropriately, this is not guaranteed in a
-                # multiple iterator situation.
-
-                self._next_section()
-                self._init_section()
-
-    def from_document(self, docnum):
-
-        """
-        Attempt to navigate to a positions entry for the given 'docnum',
-        returning the positions for 'docnum', or None otherwise.
-        """
-
-        # Return any unrequested document positions.
-
-        if docnum == self.found_docnum:
-            return self.found_positions
-
-        # Read ahead in the index until the next entry refers to a document
-        # later than the desired document.
-
-        try:
-            if self.next_docnum is None:
-                self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
-
-            # Read until the next entry is after the desired document number,
-            # or until the end of the results.
-
-            while self.next_docnum <= docnum:
-                self._next_read_section()
-                if self.docnum < docnum:
-                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
-                else:
-                    break
-
-        except StopIteration:
-            pass
-
-        # Navigate in the position file to the document.
-
-        self._init_section()
-
-        try:
-            while 1:
-                found_docnum, found_positions = self.position_iterator.next()
-
-                # Return the desired document positions or None (retaining the
-                # positions for the document immediately after).
-
-                if docnum <= found_docnum:
-                    self.found_docnum, self.found_positions = found_docnum, found_positions
-                    if docnum == found_docnum:
-                        return found_positions
-                    elif docnum < found_docnum:
-                        return None
-
-        except StopIteration:
-            return None
-
-    # Internal methods.
-
-    def _next_section(self):
-
-        "Attempt to get the next section in the index."
-
-        if self.next_docnum is None:
-            self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()
-        else:
-            self._next_read_section()
-
-    def _next_read_section(self):
-
-        """
-        Make the next index entry the current one without reading from the
-        index.
-        """
-
-        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
-        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
-
-    def _init_section(self):
-
-        "Initialise the iterator for the section in the position file."
-
-        # Seek to the position entry.
-
-        self.position_iterator.seek(self.pos_offset, self.section_count)
-
-# vim: tabstop=4 expandtab shiftwidth=4
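The positions machinery removed above is replaced by the combined record
format introduced in terms.py below: each term's record carries a document
count, then per document a delta-coded document number and a delta-coded
position list, then a continuation byte (0 on the last page, 1 if another
page of documents for the same term follows). That terminating byte is what
lets merge_data join two terms' data by flipping the first page's 0 to 1. A
hypothetical simplification, using plain integers rather than the vint-coded
sequence values actually written:

    def delta_encode(values):
        # A monotonic sequence becomes a first value plus differences.
        return values[:1] + [b - a for a, b in zip(values, values[1:])]

    def encode_page(doc_positions, more=0):
        out = [len(doc_positions)]                 # document count
        last_docnum = 0
        for docnum, positions in sorted(doc_positions):
            out.append(docnum - last_docnum)       # document number delta
            out.append(len(positions))
            out.extend(delta_encode(sorted(positions)))
            last_docnum = docnum
        out.append(more)                           # continuation byte
        return out

    # Two pages for one term, as after a merge: the first page ends with 1
    # (more data follows), the second with the terminating 0.
    combined = encode_page([(3, [5, 9]), (7, [1])], more=1) \
             + encode_page([(12, [2, 4])], more=0)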
diff -r b75bd39cf61f -r 6542c54d115b iixr/terms.py
--- a/iixr/terms.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/iixr/terms.py	Sun Feb 13 02:49:55 2011 +0100
@@ -18,29 +18,87 @@
 with this program. If not, see <http://www.gnu.org/licenses/>.
 """
 
+from iixr.data import *
 from iixr.files import *
-from iixr.positions import *
 from iixr.phrases import PhraseIterator
 from os.path import commonprefix # to find common string prefixes
-from bisect import bisect_right # to find terms in the dictionary index
 
 class TermWriter(FileWriter):
 
     "Writing term information to files."
 
-    def reset(self):
+    def begin(self, docnum_size, position_size):
+
+        "Begin writing to the file."
+
+        self.write_numbers((docnum_size, position_size))
         self.end_record()
+
+        self.data_start = self.tell()
+        self.docnum_size = docnum_size
+        self.position_size = position_size
+        self.subtractor = get_subtractor(docnum_size)
         self.last_term = ""
-        self.last_offset = 0
 
-    def write_term(self, term, offset, frequency, doc_frequency):
+    def write_terms(self, terms):
 
         """
-        Write the given 'term', its position file 'offset', its 'frequency' and
-        its 'doc_frequency' (number of documents in which it appears) to the
-        term information file.
+        Write the 'terms' to the term information file, with each term's details
+        stored in a separate record.
         """
 
+        if hasattr(terms, "items"):
+            terms = terms.items()
+        terms.sort()
+
+        for term, doc_positions in terms:
+            if not doc_positions:
+                continue
+
+            if hasattr(doc_positions, "items"):
+                doc_positions = doc_positions.items()
+
+            docnum, positions = doc_positions[0]
+
+            if not positions:
+                continue
+
+            # Start the writing, if appropriate.
+
+            if self.data_start is None:
+                self.begin(sizeof(docnum), sizeof(positions[0]))
+
+            # Write each term and document positions.
+
+            self.write_term(term, doc_positions)
+            self.end_record()
+
+    # Methods requiring an open record.
+
+    def write_term(self, term, doc_positions):
+
+        """
+        Write the given 'term', its document frequency (number of documents in
+        which it appears), and 'doc_positions' to the term information file.
+        """
+
+        self.write_term_only(term)
+
+        # Write the document frequency and the term positions.
+
+        self.write_positions(doc_positions)
+
+    def write_term_plus_remaining(self, term, data):
+
+        "Write the given 'term' and the document position 'data'."
+
+        self.write_term_only(term)
+        self.write_remaining(data)
+
+    def write_term_only(self, term):
+
+        "Write only the given 'term'."
+
         if term <= self.last_term:
             raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
 
@@ -52,430 +110,173 @@
         self.write_number(common)
         self.write_string(suffix)
 
-        # Write the offset delta.
-        # Write the frequency.
+        self.last_term = term
+
+    def write_positions(self, doc_positions):
+
+        "Write the given 'doc_positions' to the file."
+
+        # Make sure that the positions are sorted.
+
+        doc_positions.sort()
+
         # Write the document frequency.
 
-        self.write_numbers((
-            offset - self.last_offset,
-            frequency,
-            doc_frequency
-            ))
+        self.write_number(len(doc_positions))
+
+        last_docnum = None
+
+        for docnum, positions in doc_positions:
+
+            # Store the first document number as it is.
+
+            if last_docnum is None:
+                docnum_seq = docnum
+
+            # Reject out-of-order documents.
+
+            elif docnum < last_docnum:
+                raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum)
 
-        self.last_term = term
-        self.last_offset = offset
+            # Calculate an ongoing delta.
+
+            else:
+                docnum_seq = self.subtractor(docnum, last_docnum)
+
+            # Write the document number and positions.
+
+            self.write_sequence_value(docnum_seq, self.docnum_size)
+            self.write_monotonic_sequence(positions, self.position_size)
+
+            last_docnum = docnum
+
+        # Write a terminating byte to indicate that no more document pages
+        # exist.
+
+        self.write_byte(0)
 
 class TermReader(FileReader):
 
     "Reading term information from files."
 
-    def reset(self):
+    def begin(self):
+
+        "Begin reading from the file."
+
+        self.begin_record()
+        try:
+            self.docnum_size, self.position_size = self.read_numbers(2)
+        except EOFError:
+            self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
+
+        self.data_start = self.tell()
+        self.adder = get_adder(self.docnum_size)
         self.last_term = ""
-        self.last_offset = 0
-        self.begin_record()
+
+    def get_sizes(self):
+        return self.docnum_size, self.position_size
+
+    # Methods requiring an open record.
 
     def read_term(self):
 
+        "Read a term and its document positions from the term information file."
+
+        # Read the term.
+
+        self.read_term_only()
+
+        # Read the document frequency and the term positions.
+
+        positions = self.read_positions()
+
+        return self.last_term, positions
+
+    def read_term_plus_remaining(self):
+
         """
-        Read a term, its position file offset, its frequency and its document
-        frequency from the term information file.
+        Read a term and the unprocessed document position data.
         """
 
+        self.read_term_only()
+        return self.last_term, self.read_remaining()
+
+    def read_term_only(self):
+
+        "Read a term only."
+
         # Read the prefix length and term suffix.
 
         common = self.read_number()
         suffix = self.read_string()
 
         self.last_term = self.last_term[:common] + suffix
-
-        # Read the offset delta.
-
-        self.last_offset += self.read_number()
-
-        # Read the frequency.
-
-        frequency = self.read_number()
-
-        # Read the document frequency.
-
-        doc_frequency = self.read_number()
+        return self.last_term
 
-        return self.last_term, self.last_offset, frequency, doc_frequency
-
-    def go_to_term(self, term, offset, info_offset):
-
-        """
-        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
-        permits the scanning for later terms from the specified term.
-        """
-
-        self.seek(info_offset)
-        self.last_term = term
-        self.last_offset = offset
-
-class TermIndexWriter(TermWriter):
+    def read_positions(self):
 
-    "Writing term dictionary index details to files."
-
-    def reset(self):
-        TermWriter.reset(self)
-        self.last_info_offset = 0
-
-    def write_term(self, term, offset, frequency, doc_frequency, info_offset):
-
-        """
-        Write the given 'term', its position file 'offset', its 'frequency' and
-        its 'doc_frequency' to the term dictionary index file, along with the
-        'info_offset' in the term information file.
-        """
+        "Read document positions from the term information file."
 
-        TermWriter.write_term(self, term, offset, frequency, doc_frequency)
-
-        # Write the information file offset delta.
-
-        self.write_number(info_offset - self.last_info_offset)
-
-        self.last_info_offset = info_offset
+        doc_positions = []
 
-class TermIndexReader(TermReader):
-
-    "Reading term dictionary index details from files."
-
-    def reset(self):
-        TermReader.reset(self)
-        self.last_info_offset = 0
+        while 1:
 
-    def read_term(self):
-
-        """
-        Read a term, its position file offset, its frequency, its document
-        frequency and a term information file offset from the term dictionary
-        index file.
-        """
-
-        term, offset, frequency, doc_frequency = TermReader.read_term(self)
-
-        # Read the offset delta.
-
-        self.last_info_offset += self.read_number()
+            # Read the document frequency.
 
-        return term, offset, frequency, doc_frequency, self.last_info_offset
-
-class TermDictionaryWriter:
-
-    "Writing term dictionaries."
-
-    def __init__(self, info_writer, index_writer, position_dict_writer, interval):
-        self.info_writer = info_writer
-        self.index_writer = index_writer
-        self.position_dict_writer = position_dict_writer
-        self.interval = interval
-        self.entry = 0
-
-        self.index_writer.reset()
+            npositions = self.read_number()
 
-    def _write_term(self, term, offset, frequency, doc_frequency):
-
-        """
-        Write the given 'term', its position file 'offset', its 'frequency' and
-        its 'doc_frequency' (number of documents in which it appears) to the
-        term information file. Return the offset before the term information was
-        written to the file.
-        """
-
-        if self.entry % self.interval == 0:
-            self.info_writer.reset()
-            info_offset = self.info_writer.tell()
-            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
+            last_docnum = None
+            i = 0
+            while i < npositions:
 
-        self.info_writer.write_term(term, offset, frequency, doc_frequency)
-        self.entry += 1
-
-    def write_term_positions(self, term, doc_positions):
-
-        """
-        Write the given 'term' and the 'doc_positions' recording the documents
-        and positions at which the term is found.
-        """
-
-        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
-
-        if not frequency or not doc_frequency:
-            raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions)
-
-        self._write_term(term, offset, frequency, doc_frequency)
+                # Read the document number.
 
-    def close(self):
-        self.info_writer.close()
-        self.index_writer.close()
-        self.position_dict_writer.close()
-
-class TermDictionaryReader:
-
-    "Reading term dictionaries."
+                docnum = self.read_sequence_value(self.docnum_size)
+                if last_docnum is not None:
+                    docnum = self.adder(docnum, last_docnum)
 
-    def __init__(self, info_reader, index_reader, position_dict_reader):
-        self.info_reader = info_reader
-        self.index_reader = index_reader
-        self.position_dict_reader = position_dict_reader
-
-        self.info_reader.reset()
-        self.index_reader.reset()
-
-        self.entry = 0
-        self.terms = []
-        try:
-            while 1:
-                self.terms.append(self.index_reader.read_term())
-        except EOFError:
-            pass
-
-        # Large numbers for ordering purposes.
+                # Read the positions.
 
-        if self.terms:
-            self.max_offset = self.terms[-1][1] + 1
-        else:
-            self.max_offset = None
-
-    def _find_closest_entry(self, term):
-
-        """
-        Find the offsets and frequencies of 'term' from the term dictionary or
-        the closest term starting with the value of 'term'.
-
-        Return the closest index entry consisting of a term, the position file
-        offset, the term frequency, the document frequency, and the term details
-        file offset.
-        """
+                positions = self.read_monotonic_sequence(self.position_size)
+                doc_positions.append((docnum, positions))
 
-        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
-
-        # Get the entry position providing the term or one preceding it.
-        # If no entry precedes the requested term, return the very first entry
-        # as the closest.
-
-        if i == -1:
-            self.entry = 0
-            return self.terms[0]
-        else:
-            self.entry = i
-            return self.terms[i]
-
-    def _find_closest_term(self, term):
-
-        """
-        Find the offsets and frequencies of 'term' from the term dictionary or
-        the closest term starting with the value of 'term'.
+                last_docnum = docnum
+                i += 1
 
-        Return the closest term (or the term itself), the position file offset,
-        the term frequency, the document frequency, and the term details file
-        offset (or None if the reader is already positioned).
-        """
-
-        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
-
-        # Where the term is found immediately, return the offset and
-        # frequencies. If the term does not appear, return the details of the
-        # closest entry.
-
-        if term <= found_term:
-            return found_term, offset, frequency, doc_frequency, info_offset
+            # Read a terminating byte to discover whether more document pages
+            # exist.
 
-        # Otherwise, seek past the index term's entry in the information file
-        # and scan for the desired term.
-
-        else:
-            # Reset the term and offset for the new page.
-            self.info_reader.go_to_term("", 0, info_offset)
-            try:
-                while term > found_term:
-                    found_term, offset, frequency, doc_frequency = self._read_term()
-            except EOFError:
-                pass
-
-            return found_term, offset, frequency, doc_frequency, None
-
-    def _find_term(self, term):
+            if not self.read_byte():
+                break
 
-        """
-        Find the position file offset and frequency of 'term' from the term
-        dictionary.
-        """
-
-        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
-
-        # If the term is found, return the offset and frequencies.
-
-        if term == found_term:
-            return offset, frequency, doc_frequency
-        else:
-            return None
-
-    def _get_term_and_positions(self, term, offset, frequency, doc_frequency):
+        return doc_positions
 
-        """
-        Return the term plus positions details using the given 'term', 'offset',
-        'frequency' and 'doc_frequency'.
- """ - - return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency) - - def _get_positions(self, offset, doc_frequency): +class TermIterator(TermReader): - """ - Obtain positions from the position index 'offset' expecting a number of - documents equal to the given 'doc_frequency'. - """ - - return self.position_dict_reader.read_term_positions(offset, doc_frequency) - - # Iterator convenience methods. + "An iterator over terms and positions read from a file." def __iter__(self): - self.rewind() return self def next(self): try: + self.begin_record() return self.read_term() except EOFError: raise StopIteration - # Sequential access methods. - - def rewind(self): - self.entry = 0 - self.info_reader.rewind() - - def read_term(self): - - """ - Return the next term, its frequency, its document frequency, and the - documents and positions at which the term is found. - """ - - return self._get_term_and_positions(*self._read_term()) - - def _read_term(self): - - try: - term, offset, frequency, doc_frequency = self.info_reader.read_term() - except EOFError: - self.entry += 1 - try: - term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry] - except IndexError: - raise EOFError - else: - # Reset the term and offset for the new page. - - self.info_reader.go_to_term("", 0, info_offset) - - # Skip the term in the information file. - - self.info_reader.read_term() +class TermDataIterator(TermReader): - return term, offset, frequency, doc_frequency - - def go_to_term(self, term): - - """ - Navigate to 'term' in the dictionary, returning the details from its - entry. The returned details can be augmented with position information - when presented to the _get_term_and_positions method. - """ - - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) - - # Position the reader, if necessary. - - if info_offset is not None: + "An iterator over terms and unprocessed document positions data." - # Reset the term and offset for the new page. - - self.info_reader.go_to_term("", 0, info_offset) - - # Skip the term in the information file. - - self.info_reader.read_term() - - return found_term, offset, frequency, doc_frequency - - # Query methods. - - def get_terms(self): - - "Return a list of all terms." - - return iter(self) + def __iter__(self): + return self - def find_terms(self, term): - - "Return all terms whose values start with the value of 'term'." - - terms = [] - - found_term, offset, frequency, doc_frequency = self.go_to_term(term) - - # Read and record terms. - + def next(self): try: - # Add the found term if it starts with the specified term. - - while found_term.startswith(term): - terms.append(found_term) - found_term, offset, frequency, doc_frequency = self._read_term() - + self.begin_record() + return self.read_term_plus_remaining() except EOFError: - pass - - return terms - - def find_positions(self, term): - - "Return the documents and positions at which the given 'term' is found." - - t = self._find_term(term) - if t is None: - return [] - else: - offset, frequency, doc_frequency = t - return self._get_positions(offset, doc_frequency) - - def find_common_positions(self, terms): - - """ - Return the documents and positions at which all the given 'terms' are - found, where only common documents are returned. - """ - - return PhraseIterator([self.find_positions(term) for term in terms]) - - def get_frequency(self, term): - - "Return the frequency of the given 'term'." 
-
-        t = self._find_term(term)
-        if t is None:
-            return None
-        else:
-            offset, frequency, doc_frequency = t
-            return frequency
-
-    def get_document_frequency(self, term):
-
-        "Return the document frequency of the given 'term'."
-
-        t = self._find_term(term)
-        if t is None:
-            return None
-        else:
-            offset, frequency, doc_frequency = t
-            return doc_frequency
-
-    def close(self):
-        self.info_reader.close()
-        self.index_reader.close()
-        self.position_dict_reader.close()
+            raise StopIteration
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r b75bd39cf61f -r 6542c54d115b itermerge.py
--- a/itermerge.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/itermerge.py	Sun Feb 13 02:49:55 2011 +0100
@@ -3,7 +3,7 @@
 """
 An iterator merging class similar to heapq.merge in Python 2.6.
 
-Copyright (C) 2009 Paul Boddie
+Copyright (C) 2009, 2011 Paul Boddie
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
diff -r b75bd39cf61f -r 6542c54d115b test.py
--- a/test.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/test.py	Sun Feb 13 02:49:55 2011 +0100
@@ -1,22 +1,21 @@
 #!/usr/bin/env python
+# encoding: iso-8859-1
 
 from iixr.files import *
-from iixr.fields import *
 from iixr.terms import *
-from iixr.positions import *
 from iixr.index import *
 
 import os, sys
 
 # Remove old test files.
 
-for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"):
+for filename in ("test", "testMS", "testNMS", "testP", "testP2"):
     try:
         os.remove(filename)
     except OSError:
         pass
 
 try:
-    for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"):
+    for dirname in ("test_index",):
         for filename in os.listdir(dirname):
             os.remove(os.path.join(dirname, filename))
         os.rmdir(dirname)
@@ -98,22 +97,20 @@
     ]
 
 f = open("testP", "wb")
-w = PositionWriter(f)
+w = TermWriter(f)
 w.begin(0, 0)
 for doc_positions in all_doc_positions:
-    w.reset()
-    for docnum, positions in doc_positions:
-        w.write_positions(docnum, positions)
+    w.write_positions(doc_positions)
+    w.end_record()
 w.close()
 
 f = open("testP", "rb")
-r = PositionReader(f)
+r = TermReader(f)
 for doc_positions in all_doc_positions:
-    r.reset()
-    for docnum, positions in doc_positions:
-        d, p = r.read_positions()
-        print docnum == d, docnum, d
-        print positions == p, positions, p
+    r.begin_record()
+    dp = r.read_positions()
+    print doc_positions == dp, doc_positions
+    print "    ", dp
 r.close()
 
 all_doc_positions_seq = [
@@ -131,350 +128,56 @@
     ]
 
 f = open("testP2", "wb")
-w = PositionWriter(f)
+w = TermWriter(f)
 w.begin(2, 2)
 for doc_positions in all_doc_positions_seq:
-    w.reset()
-    for docnum, positions in doc_positions:
-        w.write_positions(docnum, positions)
+    w.write_positions(doc_positions)
+    w.end_record()
 w.close()
 
 f = open("testP2", "rb")
-r = PositionReader(f)
+r = TermReader(f)
 for doc_positions in all_doc_positions_seq:
-    r.reset()
-    for docnum, positions in doc_positions:
-        d, p = r.read_positions()
-        print docnum == d, docnum, d
-        print positions == p, positions, p
-r.close()
-
-print "- Test position index files."
-
-indexed_positions = [
-    [
-        (1234, 0, 100),
-        (2345, 700, 100),
-        (3456, 1900, 50)
-        ],
-    [
-        (4567, 2800, 20)
-        ]
-    ]
-
-offsets = []
-f = open("testPI", "wb")
-w = PositionIndexWriter(f)
-w.begin(0)
-for term_positions in indexed_positions:
-    offset = None
-    doc_frequency = 0
-    w.reset()
-    for docnum, pos_offset, count in term_positions:
-        if offset is None:
-            offset = w.tell()
-        w.write_positions(docnum, pos_offset, count)
-        doc_frequency += count
-    offsets.append((offset, doc_frequency))
-w.close()
-
-r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb")))
-offsets.reverse()
-indexed_positions.reverse()
-for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
-    r.seek(offset, doc_frequency)
-    for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r):
-        print docnum == dn, docnum, dn
-        print pos_offset == po, pos_offset, po
-        print count == c, count, c
-r.reader.close()
-
-print "- Test position dictionaries."
-
-f = open("testP", "wb")
-w = PositionWriter(f)
-f2 = open("testPI", "wb")
-w2 = PositionIndexWriter(f2)
-wd = PositionDictionaryWriter(w, w2, 2)
-offsets = []
-for doc_positions in all_doc_positions:
-    offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
-    offsets.append((offset, doc_frequency))
-wd.close()
-
-r = PositionReader(open("testP", "rb"))
-r2 = PositionIndexReader(open("testPI", "rb"))
-rd = PositionDictionaryReader(r, r2)
-offsets.reverse()
-all_doc_positions.reverse()
-for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
-    it = rd.read_term_positions(offset, doc_frequency)
-    dp = list(it)
-    print doc_positions == dp, doc_positions, dp
-rd.close()
-
-print "- Test fields."
-
-doc_fields = [
-    (123, ["testing", "fields", "stored", "compressed"]),
-    (456, ["fields", "for a second", "document"]),
-    (789, ["field value"]),
-    (1234, []),
-    (2345, ["abc", "def"]),
-    (3456, ["apple", "banana", "cherry"]),
-    (4567, ["drue", "eple"])
-    ]
-
-f = open("testF", "wb")
-w = FieldWriter(f)
-w.begin(0)
-w.reset()
-for docnum, fields in doc_fields:
-    w.write_fields(docnum, list(enumerate(fields)))
-w.close()
-
-f = open("testF", "rb")
-r = FieldReader(f)
-r.reset()
-for docnum, fields in doc_fields:
-    dn, df = r.read_fields()
-    print docnum == dn, docnum, dn
-    print list(enumerate(fields)) == df, list(enumerate(fields)), df
-r.close()
-
-print "- Test field index files."
-
-indexed_docs = [
-    (123, 100000987),
-    (456, 100004321),
-    (789, 100008765)
-    ]
-
-f = open("testFI", "wb")
-w = FieldIndexWriter(f)
-w.begin(0)
-w.reset()
-for docnum, offset in indexed_docs:
-    w.write_document(docnum, offset)
-w.close()
-
-f = open("testFI", "rb")
-r = FieldIndexReader(f)
-r.reset()
-for docnum, offset in indexed_docs:
-    dn, o = r.read_document()
-    print docnum == dn, docnum, dn
-    print offset == o, offset, o
+    r.begin_record()
+    dp = r.read_positions()
+    print doc_positions == dp, doc_positions
+    print "    ", dp
 r.close()
-
-print "- Test field dictionaries."
- -f = open("testF", "wb") -w = FieldWriter(f) -f2 = open("testFI", "wb") -w2 = FieldIndexWriter(f2) -wd = FieldDictionaryWriter(w, w2, 3) -for docnum, fields in doc_fields: - wd.write_fields(docnum, list(enumerate(fields))) -wd.close() - -f = open("testF", "rb") -r = FieldReader(f) -f2 = open("testFI", "rb") -r2 = FieldIndexReader(f2) -rd = FieldDictionaryReader(r, r2) -doc_fields_reversed = doc_fields[:] -doc_fields_reversed.reverse() -for docnum, fields in doc_fields_reversed: - df = dict(rd.get_fields(docnum)) - print dict(enumerate(fields)) == df, dict(enumerate(fields)), df -for docnum in (13579, 246810): - df = rd.get_fields(docnum) - print df is None, df - -print "- (Test sequential access.)" - -rd.rewind() -for docnum, fields in doc_fields: - dn, df = rd.read_fields() - print docnum == dn, docnum, dn - print list(enumerate(fields)) == df, list(enumerate(fields)), df -rd.close() - -print "- Test terms." - -terms = [ - # term offset frequency doc_frequency - ("aardvark", 100000123, 1, 1), - ("anteater", 100000456, 2, 1), - ("badger", 100000789, 13, 7), - ("bull", 1000001234, 59, 17), - ("bulldog", 1000002345, 99, 80), - ("cat", 1000003456, 89, 28) - ] - -f = open("test", "wb") -w = TermWriter(f) -w.reset() -for term, offset, frequency, doc_frequency in terms: - w.write_term(term, offset, frequency, doc_frequency) -w.close() - -f = open("test", "rb") -r = TermReader(f) -r.reset() -for term, offset, frequency, doc_frequency in terms: - t, o, fr, df = r.read_term() - print term == t, term, t - print offset == o, offset, o - print frequency == fr, frequency, fr - print doc_frequency == df, doc_frequency, df -r.close() - -print "- Test terms in index files." - -indexed_terms = [ - # term offset frequency doc_frequency info_offset - ("aardvark", 100000123, 1, 1, 200000321), - ("anteater", 100000456, 2, 1, 200000654), - ("badger", 100000789, 13, 7, 200000987), - ("bull", 1000001234, 59, 17, 200004321), - ("bulldog", 1000002345, 99, 80, 200005432), - ("cat", 1000003456, 89, 28, 200006543) - ] - -f = open("test", "wb") -w = TermIndexWriter(f) -w.reset() -for term, offset, frequency, doc_frequency, info_offset in indexed_terms: - w.write_term(term, offset, frequency, doc_frequency, info_offset) -w.close() - -f = open("test", "rb") -r = TermIndexReader(f) -r.reset() -for term, offset, frequency, doc_frequency, info_offset in indexed_terms: - t, o, fr, df, i = r.read_term() - print term == t, term, t - print offset == o, offset, o - print frequency == fr, frequency, fr - print doc_frequency == df, doc_frequency, df - print info_offset == i, info_offset, i -r.close() - -print "- Test dictionaries with only term data." 
- -f = open("test", "wb") -w = TermWriter(f) -f2 = open("testI", "wb") -w2 = TermIndexWriter(f2) -f3 = open("testP", "wb") -w3 = PositionWriter(f3) -f4 = open("testPI", "wb") -w4 = PositionIndexWriter(f4) -wp = PositionDictionaryWriter(w3, w4, 2) -wd = TermDictionaryWriter(w, w2, wp, 3) -for term, offset, frequency, doc_frequency in terms: - wd._write_term(term, offset, frequency, doc_frequency) -wd.close() - -f = open("test", "rb") -r = TermReader(f) -f2 = open("testI", "rb") -r2 = TermIndexReader(f2) -r3 = PositionReader(open("testP", "rb")) -r4 = PositionIndexReader(open("testPI", "rb")) -rp = PositionDictionaryReader(r3, r4) -rd = TermDictionaryReader(r, r2, rp) -terms_reversed = terms[:] -terms_reversed.reverse() -for term, offset, frequency, doc_frequency in terms_reversed: - o, fr, df = rd._find_term(term) - print offset == o, offset, o - print frequency == fr, frequency, fr - print doc_frequency == df, doc_frequency, df -for term in ("dog", "dingo"): - t = rd._find_term(term) - print t is None, t - -print "- (Test term prefix searching.)" - -print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"] -print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"] -print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"] -print rd.find_terms("d") == [], rd.find_terms("d"), [] -rd.close() - print "- Test dictionaries with term and position data." terms_with_positions = [ ("aardvark", [(1, [2, 45, 96]), (20, [13])]), ("anteater", [(1, [43, 44])]), ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), + (u"bjørn", [(11, [19, 54])]), ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]), ("bulldog", [(43, [17, 19, 256, 512])]), - ("cat", [(123, [12, 145, 196]), (1200, [113])]) - ] - -position_dict_tests = [ - ("badger", 19, [55, 1333]), - ("badger", 20, None), - ("bull", 6, [128]), - ("bull", 26, [1, 3, 5, 7, 9]), - ("cat", 111, None), - ("cat", 123, [12, 145, 196]), - ("cat", 1234, None) + ("cat", [(123, [12, 145, 196]), (1200, [113])]), + (u"å", [(15, [384])]), ] f = open("test", "wb") w = TermWriter(f) -f2 = open("testI", "wb") -w2 = TermIndexWriter(f2) -f3 = open("testP", "wb") -w3 = PositionWriter(f3) -f4 = open("testPI", "wb") -w4 = PositionIndexWriter(f4) -wp = PositionDictionaryWriter(w3, w4, 2) -wd = TermDictionaryWriter(w, w2, wp, 3) -for term, doc_positions in terms_with_positions: - wd.write_term_positions(term, doc_positions) -wd.close() +w.begin(0, 0) +w.write_terms(terms_with_positions) +w.close() f = open("test", "rb") -r = TermReader(f) -f2 = open("testI", "rb") -r2 = TermIndexReader(f2) -r3 = PositionReader(open("testP", "rb")) -r4 = PositionIndexReader(open("testPI", "rb")) -rp = PositionDictionaryReader(r3, r4) -rd = TermDictionaryReader(r, r2, rp) -terms_reversed = terms_with_positions[:] -terms_reversed.reverse() -for term, doc_positions in terms_reversed: - dp = list(rd.find_positions(term)) - print doc_positions == dp, doc_positions, dp -for term in ("aaa", "dog", "dingo"): - dp = rd.find_positions(term) - print dp == [], dp +r = TermIterator(f) +for (term, doc_positions), (t, dp) in zip(terms_with_positions, r): + print term == t, term, t + print doc_positions == dp, doc_positions + print " ", dp +r.close() -print "- (Test iterators.)" - -for term, docnum, positions in position_dict_tests: - dp = rd.find_positions(term) - pos = dp.from_document(docnum) - print positions is None and pos is None or pos is not None and positions == list(pos), positions, 
-
-print "- (Test sequential access.)"
+f = open("test", "rb")
+r = TermDataIterator(f)
+for (term, doc_positions), (t, data) in zip(terms_with_positions, r):
+    print term == t, term, t, data
+r.close()
 
-rd.rewind()
-for term, doc_positions in terms_with_positions:
-    t, fr, df, dp = rd.read_term()
-    dp = list(dp)
-    print term == t, term, t
-    print doc_positions == dp, doc_positions, dp
-rd.close()
-
-print "- Test high-level index operations (including merging)."
+print "- Test high-level index operations."
 
 docs = [
     (1, "The cat sat on the mat"),
@@ -485,189 +188,26 @@
     (36, "She sells sea shells on the sea shore")
     ]
 
-doc_tests = [
-    ("Every", 2, [(2, [0]), (14, [0])]),
-    ("good", 2, [(2, [1]), (13, [1])]),
-    ("deserves", 2, [(2, [3]), (13, [3])]),
-    ("sea", 2, [(36, [2, 6])])
-    ]
-
-position_tests = [
-    ("Every", 14, [0]),
-    ("sea", 36, [2, 6]),
-    ("shells", 1, None),
-    ("shells", 37, None)
-    ]
-
-phrase_tests = [
-    (["good", "boy"], [(2, [1, 2])]),
-    (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
-    (["sea", "shore"], [(36, [6, 7])])
-    ]
-
-index = Index("test_index", 3, 2, 3, 6)
+index = Index("test_index", 3)
 wi = index.get_writer()
 for docnum, text in docs:
     doc = Document(docnum)
     for position, term in enumerate(text.split()):
         doc.add_position(term, position)
-    doc.add_field(123, text)
-    wi.add_document(doc)
-wi.close()
-
-rd = index.get_reader()
-
-print "- (Test searching.)"
-
-for term, frequency, doc_positions in doc_tests:
-    dp = list(rd.find_positions(term))
-    print doc_positions == dp, doc_positions, dp
-    fr = rd.get_frequency(term)
-    print frequency == fr, frequency, fr
-
-print "- (Test fields.)"
-
-for docnum, text in docs:
-    df = dict(rd.get_fields(docnum))
-    print df[123] == text, text, df[123]
-
-print "- (Test navigation.)"
-
-for term, docnum, positions in position_tests:
-    dp = rd.find_positions(term)
-    pos = dp.from_document(docnum)
-    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
-
-print "- (Test phrases.)"
-
-for terms, results in phrase_tests:
-    res = list(rd.find_common_positions(terms))
-    print results == res, results, res
-
-index.close()
-
-docs2 = [
-    ((1, 0), "The cat sat on the mat"),
-    ((1, 2), "Every good boy deserves football"),
-    ((13, 1), "One good turn deserves another"),
-    ((14, 0), "Every man for himself"),
-    ((14, 25), "Red sky at night shepherd's delight"),
-    ((36, 12), "She sells sea shells on the sea shore")
-    ]
-
-doc_tests2 = [
-    ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),
-    ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),
-    ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),
-    ("sea", 2, [((36, 12), [(2, 10), (6, 28)])])
-    ]
-
-position_tests2 = [
-    ("Every", (14, 0), [(0, 0)]),
-    ("sea", (36, 12), [(2, 10), (6, 28)]),
-    ("shells", (1, 0), None),
-    ("shells", (37, 0), None)
-    ]
-
-phrase_tests2 = [
-    (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),
-    (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),
-    (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])
-    ]
-
-index = Index("test_indexT", 3, 2, 3, 6)
-wi = index.get_writer()
-for docnum, text in docs2:
-    doc = Document(docnum)
-    offset = 0
-    for position, term in enumerate(text.split()):
-        doc.add_position(term, (position, offset))
-        offset += len(term) + 1 # assume one space after the term
-    doc.add_field(123, text)
     wi.add_document(doc)
 wi.close()
 
-rd = index.get_reader()
-
-print "- (Test searching.)"
-
-for term, frequency, doc_positions in doc_tests2:
-    dp = list(rd.find_positions(term))
-    print doc_positions == dp, doc_positions, dp
-    fr = rd.get_frequency(term)
-    print frequency == fr, frequency, fr
-
-print "- (Test fields.)"
+print "- Test merge."
 
-for docnum, text in docs2:
-    df = dict(rd.get_fields(docnum))
-    print df[123] == text, text, df[123]
-
-print "- (Test navigation.)"
+l1 = list(index.get_reader())
+index.merge()
+l2 = list(index.get_reader(1))
 
-for term, docnum, positions in position_tests2:
-    dp = rd.find_positions(term)
-    pos = dp.from_document(docnum)
-    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
-
-print "- (Test phrases.)"
-
-for terms, results in phrase_tests2:
-    res = list(rd.find_common_positions(terms))
-    print results == res, results, res
+for (t1, dp1), (t2, dp2) in zip(l1, l2):
+    print t1 == t2, t1, t2
+    print dp1 == dp2, dp1
+    print "    ", dp2
 
 index.close()
 
-print "- Test index updates."
-
-index = Index("test_index")
-index2 = Index("test_index2", 3, 2, 3, 6)
-wi = index2.get_writer()
-for docnum, text in docs:
-
-    # Add the same documents but with different numbers.
-
-    doc = Document(docnum + 100)
-    for position, term in enumerate(text.split()):
-        doc.add_position(term, position)
-    doc.add_field(123, text)
-    wi.add_document(doc)
-wi.close()
-
-index2.update([index])
-index.close()
-
-rd = index2.get_reader()
-for term, frequency, doc_positions in doc_tests:
-
-    # Add the extra documents to the expected result.
-
-    orig_doc_positions = doc_positions
-    doc_positions = doc_positions[:]
-
-    for docnum, positions in orig_doc_positions:
-        doc_positions.append((docnum + 100, positions))
-    frequency *= 2
-
-    dp = list(rd.find_positions(term))
-    print doc_positions == dp, doc_positions, dp
-    fr = rd.get_frequency(term)
-    print frequency == fr, frequency, fr
-index2.close()
-
-print "- (Test update of an empty index.)"
-
-index = Index("test_index")
-index3 = Index("test_index3")
-index3.update([index])
-index.close()
-
-rd = index3.get_reader()
-for term, frequency, doc_positions in doc_tests:
-    dp = list(rd.find_positions(term))
-    print doc_positions == dp, doc_positions, dp
-    fr = rd.get_frequency(term)
-    print frequency == fr, frequency, fr
-index3.close()
-
 # vim: tabstop=4 expandtab shiftwidth=4
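As an aside, the prefix compression performed by write_term_only and read_term_only can be shown with a minimal standalone sketch (the helper names below are invented for illustration and do not appear in the patched modules): each term is stored as the length of the prefix it shares with the previous term plus the remaining suffix, which is why out-of-order terms are rejected.

def common_prefix_length(a, b):
    "Return the number of leading characters shared by 'a' and 'b'."
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n

def encode_terms(terms):
    "Encode sorted 'terms' as (common, suffix) pairs."
    last = ""
    output = []
    for term in terms:
        common = common_prefix_length(term, last)
        output.append((common, term[common:]))
        last = term
    return output

def decode_terms(pairs):
    "Rebuild the original terms from (common, suffix) pairs."
    last = ""
    terms = []
    for common, suffix in pairs:
        last = last[:common] + suffix
        terms.append(last)
    return terms

terms = ["aardvark", "anteater", "badger", "bull", "bulldog", "cat"]
pairs = encode_terms(terms)
print pairs
print decode_terms(pairs) == terms, decode_terms(pairs)

Here, ("bulldog" following "bull") encodes as (4, "dog"), so long runs of related terms cost only their differing suffixes.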
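Likewise, a sketch of the document number handling in write_positions and read_positions, assuming plain integer document numbers (the sequence case goes through what get_adder and get_subtractor select instead): the first number in a record is stored as it is, and each later one as the difference from its predecessor, with out-of-order numbers rejected just as in the writer above.

def encode_docnums(docnums):
    "Encode ascending 'docnums' as deltas, keeping the first value as it is."
    last = None
    output = []
    for docnum in docnums:
        if last is None:
            output.append(docnum)
        elif docnum < last:
            raise ValueError, "Document number %r is less than previous number %r." % (docnum, last)
        else:
            output.append(docnum - last)
        last = docnum
    return output

def decode_docnums(values):
    "Rebuild the original document numbers from delta 'values'."
    last = None
    docnums = []
    for value in values:
        if last is None:
            last = value
        else:
            last = last + value
        docnums.append(last)
    return docnums

docnums = [1, 20, 43, 1200]
print encode_docnums(docnums)
print decode_docnums(encode_docnums(docnums)) == docnums

Deltas keep the stored integers small, which matters because the variable-length integer encoding uses fewer bytes for smaller values.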