1.1 --- a/iixr/data.py Sat Feb 12 01:23:58 2011 +0100
1.2 +++ b/iixr/data.py Sun Feb 13 02:49:55 2011 +0100
1.3 @@ -23,7 +23,7 @@
1.4
1.5 # High-level representations.
1.6
1.7 -def convert_sequence(values, op):
1.8 +def convert_sequence(values, op, last_from_old):
1.9 if values:
1.10 new_values = list(values)
1.11 last = new_values[0]
1.12 @@ -31,10 +31,22 @@
1.13 length = len(new_values)
1.14 while i < length:
1.15 current = new_values[i]
1.16 - new_values[i] = op(new_values[i], last)
1.17 - last = current
1.18 + new_values[i] = op(current, last)
1.19 +
1.20 + # Subtracting entries requires the old value to be used.
1.21 + # Adding entries requires the new value.
1.22 +
1.23 + if last_from_old:
1.24 + last = current
1.25 + else:
1.26 + last = new_values[i]
1.27 +
1.28 i += 1
1.29
1.30 + return new_values
1.31 + else:
1.32 + return values
1.33 +
1.34 def op_seq_monotonic(x, y, op):
1.35 return tuple([op(a, b) for a, b in zip(x, y)])
1.36
1.37 @@ -44,15 +56,6 @@
1.38 def sub_seq_monotonic(x, y):
1.39 return op_seq_monotonic(x, y, operator.sub)
1.40
1.41 -def op_first_monotonic(x, y, op):
1.42 - return (op(x[0], y[0]),) + tuple(zip(x[1:], y[1:]))
1.43 -
1.44 -def add_first_monotonic(x, y):
1.45 - return op_first_monotonic(x, y, operator.add)
1.46 -
1.47 -def sub_first_monotonic(x, y):
1.48 - return op_first_monotonic(x, y, operator.sub)
1.49 -
1.50 def add_seq(x, y):
1.51 length = min(len(x), len(y))
1.52 seq = list(x)[:length]
1.53 @@ -84,17 +87,17 @@
1.54 def sizeof(value):
1.55 return is_sequence(value) and len(value) or 0
1.56
1.57 -def get_monotonic_adder(value):
1.58 - return is_sequence(value) and add_seq_monotonic or operator.add
1.59 +def get_monotonic_adder(size):
1.60 + return size and add_seq_monotonic or operator.add
1.61
1.62 -def get_monotonic_subtractor(value):
1.63 - return is_sequence(value) and sub_seq_monotonic or operator.sub
1.64 +def get_monotonic_subtractor(size):
1.65 + return size and sub_seq_monotonic or operator.sub
1.66
1.67 -def get_adder(value):
1.68 - return is_sequence(value) and add_seq or operator.add
1.69 +def get_adder(size):
1.70 + return size and add_seq or operator.add
1.71
1.72 -def get_subtractor(value):
1.73 - return is_sequence(value) and sub_seq or operator.sub
1.74 +def get_subtractor(size):
1.75 + return size and sub_seq or operator.sub
1.76
1.77 # Low-level representations.
1.78 # Variable-length integer functions.
1.79 @@ -177,15 +180,6 @@
1.80 break
1.81 return number, start
1.82
1.83 -# String serialisation.
1.84 -
1.85 -def string_to_array(s, bytes):
1.86 -
1.87 - "Write the given string 's' to 'bytes'."
1.88 -
1.89 - vint_to_array(len(s), bytes)
1.90 - bytes.fromstring(s.encode("utf-8"))
1.91 -
1.92 # Sequence serialisation.
1.93
1.94 def sequence_to_array(value, size, bytes):
2.1 --- a/iixr/fields.py Sat Feb 12 01:23:58 2011 +0100
2.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
2.3 @@ -1,345 +0,0 @@
2.4 -#!/usr/bin/env python
2.5 -
2.6 -"""
2.7 -Specific classes for storing document information.
2.8 -
2.9 -Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
2.10 -
2.11 -This program is free software; you can redistribute it and/or modify it under
2.12 -the terms of the GNU General Public License as published by the Free Software
2.13 -Foundation; either version 3 of the License, or (at your option) any later
2.14 -version.
2.15 -
2.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
2.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
2.18 -PARTICULAR PURPOSE. See the GNU General Public License for more details.
2.19 -
2.20 -You should have received a copy of the GNU General Public License along
2.21 -with this program. If not, see <http://www.gnu.org/licenses/>.
2.22 -"""
2.23 -
2.24 -from iixr.data import *
2.25 -from iixr.files import *
2.26 -from bisect import bisect_right # to find terms in the dictionary index
2.27 -
2.28 -DOCUMENT_CACHE_LIMIT = 10000
2.29 -
2.30 -class FieldWriter(FileWriter):
2.31 -
2.32 - "Writing field data to files."
2.33 -
2.34 - def begin(self, docnum_size):
2.35 - self.write_number(docnum_size)
2.36 - self.end_record()
2.37 - self.docnum_size = docnum_size
2.38 - self.data_start = self.tell()
2.39 -
2.40 - def reset(self):
2.41 - self.end_record()
2.42 - self.last_docnum = None
2.43 - self.subtractor = None
2.44 -
2.45 - def write_fields(self, docnum, fields):
2.46 -
2.47 - """
2.48 - Write for the given 'docnum', a list of 'fields' (integer, string pairs
2.49 - representing field identifiers and values respectively).
2.50 - """
2.51 -
2.52 - # Find the size of document number values.
2.53 -
2.54 - if self.last_docnum is not None:
2.55 - docnum_seq = self.subtractor(docnum, self.last_docnum)
2.56 - else:
2.57 - self.subtractor = get_subtractor(docnum)
2.58 - docnum_seq = docnum
2.59 -
2.60 - # Write the document number.
2.61 -
2.62 - self.write_sequence_value(docnum_seq, self.docnum_size)
2.63 -
2.64 - # Write the number of fields.
2.65 -
2.66 - self.write_number(len(fields))
2.67 -
2.68 - # Write the fields themselves.
2.69 -
2.70 - for i, field in fields:
2.71 - self.write_number(i)
2.72 - self.write_string(field, 1) # compress
2.73 -
2.74 - self.last_docnum = docnum
2.75 -
2.76 -class FieldReader(FileReader):
2.77 -
2.78 - "Reading field data from files."
2.79 -
2.80 - def begin(self):
2.81 - self.begin_record()
2.82 - try:
2.83 - self.docnum_size = self.read_number()
2.84 - except EOFError:
2.85 - self.docnum_size = 0 # NOTE: No fields!
2.86 - self.data_start = self.tell()
2.87 -
2.88 - def reset(self):
2.89 - self.last_docnum = None
2.90 - self.adder = None
2.91 - self.begin_record()
2.92 -
2.93 - def read_fields(self):
2.94 -
2.95 - """
2.96 - Read fields from the file, returning a tuple containing the document
2.97 - number and a list of field (identifier, value) pairs.
2.98 - """
2.99 -
2.100 - # Read the document number.
2.101 -
2.102 - docnum = self.read_sequence_value(self.docnum_size)
2.103 -
2.104 - if self.last_docnum is not None:
2.105 - self.last_docnum = self.adder(docnum, self.last_docnum)
2.106 - else:
2.107 - self.adder = get_adder(docnum)
2.108 - self.last_docnum = docnum
2.109 -
2.110 - # Read the number of fields.
2.111 -
2.112 - nfields = self.read_number()
2.113 -
2.114 - # Collect the fields.
2.115 -
2.116 - fields = []
2.117 - i = 0
2.118 -
2.119 - while i < nfields:
2.120 - identifier = self.read_number()
2.121 - value = self.read_string(1) # decompress
2.122 - fields.append((identifier, value))
2.123 - i += 1
2.124 -
2.125 - return self.last_docnum, fields
2.126 -
2.127 - def read_document_fields(self, docnum, offset):
2.128 -
2.129 - """
2.130 - Read fields for 'docnum' at the given 'offset'. This permits the
2.131 - retrieval of details for the specified document, as well as scanning for
2.132 - later documents.
2.133 - """
2.134 -
2.135 - self.seek(offset)
2.136 - bad_docnum, fields = self.read_fields()
2.137 - self.last_docnum = docnum
2.138 - return docnum, fields
2.139 -
2.140 -class FieldIndexWriter(FieldWriter):
2.141 -
2.142 - "Writing field index details to files."
2.143 -
2.144 - def reset(self):
2.145 - FieldWriter.reset(self)
2.146 - self.last_offset = 0
2.147 -
2.148 - def write_document(self, docnum, offset):
2.149 -
2.150 - """
2.151 - Write for the given 'docnum', the 'offset' at which the fields for the
2.152 - document are stored in the fields file.
2.153 - """
2.154 -
2.155 - # Find the size of document number values.
2.156 -
2.157 - if self.last_docnum is not None:
2.158 - docnum_seq = self.subtractor(docnum, self.last_docnum)
2.159 - else:
2.160 - self.subtractor = get_subtractor(docnum)
2.161 - docnum_seq = docnum
2.162 -
2.163 - # Write the document number.
2.164 -
2.165 - self.write_sequence_value(docnum_seq, self.docnum_size)
2.166 -
2.167 - # Write the offset delta.
2.168 -
2.169 - self.write_number(offset - self.last_offset)
2.170 -
2.171 - self.last_docnum = docnum
2.172 - self.last_offset = offset
2.173 -
2.174 -class FieldIndexReader(FieldReader):
2.175 -
2.176 - "Reading field index details from files."
2.177 -
2.178 - def reset(self):
2.179 - FieldReader.reset(self)
2.180 - self.last_offset = 0
2.181 -
2.182 - def read_document(self):
2.183 -
2.184 - "Read a document number and field file offset."
2.185 -
2.186 - # Read the document number.
2.187 -
2.188 - docnum = self.read_sequence_value(self.docnum_size)
2.189 -
2.190 - if self.last_docnum is not None:
2.191 - self.last_docnum = self.adder(docnum, self.last_docnum)
2.192 - else:
2.193 - self.adder = get_adder(docnum)
2.194 - self.last_docnum = docnum
2.195 -
2.196 - # Read the offset.
2.197 -
2.198 - self.last_offset += self.read_number()
2.199 -
2.200 - return self.last_docnum, self.last_offset
2.201 -
2.202 -class FieldDictionaryWriter:
2.203 -
2.204 - "Writing field dictionary details."
2.205 -
2.206 - def __init__(self, field_writer, field_index_writer, interval):
2.207 - self.field_writer = field_writer
2.208 - self.field_index_writer = field_index_writer
2.209 - self.interval = interval
2.210 - self.entry = 0
2.211 -
2.212 - def write_fields(self, docnum, fields):
2.213 -
2.214 - "Write details of the given 'docnum' and 'fields'."
2.215 -
2.216 - if self.entry == 0:
2.217 - docnum_size = sizeof(docnum)
2.218 - self.field_writer.begin(docnum_size)
2.219 - self.field_index_writer.begin(docnum_size)
2.220 - self.field_index_writer.reset()
2.221 -
2.222 - if self.entry % self.interval == 0:
2.223 - self.field_writer.reset()
2.224 - offset = self.field_writer.tell()
2.225 - self.field_writer.write_fields(docnum, fields)
2.226 - self.field_index_writer.write_document(docnum, offset)
2.227 - else:
2.228 - self.field_writer.write_fields(docnum, fields)
2.229 -
2.230 - self.entry += 1
2.231 -
2.232 - def close(self):
2.233 - self.field_writer.close()
2.234 - self.field_index_writer.close()
2.235 -
2.236 -class FieldDictionaryReader:
2.237 -
2.238 - "Reading field dictionary details."
2.239 -
2.240 - def __init__(self, field_reader, field_index_reader):
2.241 - self.field_reader = field_reader
2.242 - self.field_index_reader = field_index_reader
2.243 -
2.244 - self.field_reader.reset()
2.245 - self.field_index_reader.reset()
2.246 -
2.247 - self.cache = {}
2.248 -
2.249 - self.entry = 0
2.250 - self.docs = []
2.251 - try:
2.252 - while 1:
2.253 - self.docs.append(self.field_index_reader.read_document())
2.254 - except EOFError:
2.255 - pass
2.256 -
2.257 - # Large numbers for ordering purposes.
2.258 -
2.259 - if self.docs:
2.260 - self.max_offset = self.docs[-1][1]
2.261 - else:
2.262 - self.max_offset = None
2.263 -
2.264 - # Iterator convenience methods.
2.265 -
2.266 - def __iter__(self):
2.267 - self.rewind()
2.268 - return self
2.269 -
2.270 - def next(self):
2.271 - try:
2.272 - return self.read_fields()
2.273 - except EOFError:
2.274 - raise StopIteration
2.275 -
2.276 - # Sequential access methods.
2.277 -
2.278 - def rewind(self):
2.279 - self.field_reader.rewind()
2.280 -
2.281 - def read_fields(self):
2.282 -
2.283 - "Return the next document number and fields."
2.284 -
2.285 - try:
2.286 - return self.field_reader.read_fields()
2.287 - except EOFError:
2.288 - self.entry += 1
2.289 - try:
2.290 - found_docnum, offset = self.docs[self.entry]
2.291 - except IndexError:
2.292 - raise EOFError
2.293 - else:
2.294 - self.field_reader.reset()
2.295 - return self.field_reader.read_fields()
2.296 -
2.297 - # Random access methods.
2.298 -
2.299 - def get_fields(self, docnum):
2.300 -
2.301 - "Read the fields of the document with the given 'docnum'."
2.302 -
2.303 - if self.cache.has_key(docnum):
2.304 - return self.cache[docnum]
2.305 -
2.306 - i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
2.307 -
2.308 - # Get the entry position providing the term or one preceding it.
2.309 -
2.310 - if i == -1:
2.311 - return None
2.312 -
2.313 - found_docnum, offset = self.docs[i]
2.314 -
2.315 - # Read from the fields file.
2.316 -
2.317 - found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
2.318 -
2.319 - # Scan for the document, if necessary.
2.320 -
2.321 - try:
2.322 - while docnum > found_docnum:
2.323 - found_docnum, fields = self.field_reader.read_fields()
2.324 - except EOFError:
2.325 - pass
2.326 -
2.327 - # If the document is found, return the fields.
2.328 -
2.329 - if docnum == found_docnum:
2.330 -
2.331 - # Store the fields in the cache, removing entries if the limit has
2.332 - # been reached.
2.333 -
2.334 - keys = self.cache.keys()
2.335 -
2.336 - if len(keys) == DOCUMENT_CACHE_LIMIT:
2.337 - del self.cache[keys[0]]
2.338 -
2.339 - self.cache[docnum] = fields
2.340 - return fields
2.341 - else:
2.342 - return None
2.343 -
2.344 - def close(self):
2.345 - self.field_reader.close()
2.346 - self.field_index_reader.close()
2.347 -
2.348 -# vim: tabstop=4 expandtab shiftwidth=4
3.1 --- a/iixr/files.py Sat Feb 12 01:23:58 2011 +0100
3.2 +++ b/iixr/files.py Sun Feb 13 02:49:55 2011 +0100
3.3 @@ -22,10 +22,6 @@
3.4 from array import array
3.5 import zlib
3.6
3.7 -# Constants.
3.8 -
3.9 -CACHE_SIZE = 100000
3.10 -
3.11 # Classes.
3.12
3.13 class File:
3.14 @@ -35,14 +31,21 @@
3.15 def __init__(self, f):
3.16 self.f = f
3.17 self.record = array('B') # record buffer
3.18 - self.cache = array('B')
3.19 + self.data_start = None
3.20 +
3.21 + def begin(self):
3.22 +
3.23 + """
3.24 + Initialise file-wide parameters. In writers, this method may require
3.25 + parameters to be specified. In readers, the parameters may be read from
3.26 + the file.
3.27 + """
3.28 +
3.29 self.data_start = 0
3.30
3.31 - def reset(self):
3.32 -
3.33 - "To be used to reset the state of the reader or writer between records."
3.34 -
3.35 - pass
3.36 + def tell(self):
3.37 + # NOTE: Will not be accurate within the current record.
3.38 + return self.f.tell()
3.39
3.40 def seek(self, offset):
3.41 self.f.seek(offset)
3.42 @@ -60,27 +63,26 @@
3.43
3.44 "Writing basic data types to files."
3.45
3.46 - def __init__(self, f):
3.47 - File.__init__(self, f)
3.48 - self.written = 0
3.49 -
3.50 - def tell(self):
3.51 - # NOTE: Will not be accurate within the current record.
3.52 - return self.written
3.53 -
3.54 def begin_record(self):
3.55 pass
3.56
3.57 def end_record(self):
3.58 if self.record:
3.59 - length = len(self.record)
3.60 - before = len(self.cache)
3.61 - vint_to_array(length, self.cache)
3.62 - length_size = len(self.cache) - before
3.63 - self.cache += self.record
3.64 - self.written += length_size + length
3.65 + self.f.write(vint(len(self.record)))
3.66 + self.record.tofile(self.f)
3.67 self.record = array('B')
3.68 - self.flush_cache()
3.69 +
3.70 + def write_remaining(self, a):
3.71 +
3.72 + "Write remaining data from the raw array 'a'."
3.73 +
3.74 + self.record += a
3.75 +
3.76 + def write_byte(self, b):
3.77 +
3.78 + "Write the given byte 'b'."
3.79 +
3.80 + self.record.append(b)
3.81
3.82 def write_number(self, number):
3.83
3.84 @@ -137,25 +139,17 @@
3.85 self.write_sequence_value(value, size)
3.86
3.87 def write_delta_sequence(self, values, size):
3.88 - convert_sequence(values, get_subtractor(values[0]))
3.89 - self.write_sequence_values(values, size)
3.90 + self.write_sequence_values(
3.91 + convert_sequence(values, get_subtractor(size), 1),
3.92 + size)
3.93
3.94 def write_monotonic_sequence(self, values, size):
3.95 - convert_sequence(values, get_monotonic_subtractor(values[0]))
3.96 - self.write_sequence_values(values, size)
3.97 -
3.98 - def flush(self, force=0):
3.99 - self.end_record()
3.100 - self.flush_cache(force)
3.101 -
3.102 - def flush_cache(self, force=0):
3.103 - if self.f is not None:
3.104 - if force or len(self.cache) > CACHE_SIZE:
3.105 - self.cache.tofile(self.f)
3.106 - self.cache = array('B')
3.107 + self.write_sequence_values(
3.108 + convert_sequence(values, get_monotonic_subtractor(size), 1),
3.109 + size)
3.110
3.111 def close(self):
3.112 - self.flush(1)
3.113 + self.end_record()
3.114 File.close(self)
3.115
3.116 class FileReader(File):
3.117 @@ -164,58 +158,33 @@
3.118
3.119 def __init__(self, f):
3.120 File.__init__(self, f)
3.121 - self.record_start = 0
3.122 - self.record_end = 0
3.123 - self.cache_start = 0
3.124 self.begin()
3.125
3.126 - def begin(self):
3.127 -
3.128 - "Initialise file-wide parameters."
3.129 -
3.130 - pass
3.131 -
3.132 def begin_record(self):
3.133 self.start = 0
3.134 + self.record = array('B')
3.135 try:
3.136 size = self.read_number_from_file()
3.137 - self.record = self.from_cache(size)
3.138 + self.record.fromfile(self.f, size)
3.139 except EOFError:
3.140 pass
3.141
3.142 def end_record(self):
3.143 pass
3.144
3.145 - def seek(self, offset):
3.146 - from_cache_start = offset - self.cache_start
3.147 - if 0 <= from_cache_start < len(self.cache):
3.148 - self.record_start = self.record_end = from_cache_start
3.149 - else:
3.150 - self.f.seek(offset)
3.151 - self.cache = array('B')
3.152 - self.cache_start = offset
3.153 - self.record_start = self.record_end = 0
3.154 - self.reset()
3.155 + def read_remaining(self):
3.156
3.157 - def tell(self):
3.158 - return self.cache_start + self.record_start + self.start
3.159 + "Read remaining data as a raw array."
3.160 +
3.161 + return self.record[self.start:]
3.162
3.163 - def ensure_cache(self, size):
3.164 - if size > len(self.cache) - self.record_end:
3.165 - self.cache = self.cache[self.record_end:]
3.166 - self.cache_start += self.record_end
3.167 - s = self.f.read(CACHE_SIZE)
3.168 - self.cache.fromstring(s)
3.169 - self.record_start = 0
3.170 - if not s:
3.171 - raise EOFError
3.172 - else:
3.173 - self.record_start = self.record_end
3.174 - self.record_end = self.record_start + size
3.175 + def read_byte(self):
3.176 +
3.177 + "Read a byte from the record."
3.178
3.179 - def from_cache(self, size):
3.180 - self.ensure_cache(size)
3.181 - return self.cache[self.record_start:self.record_end]
3.182 + b = self.record[self.start]
3.183 + self.start += 1
3.184 + return b
3.185
3.186 def read_number_from_file(self):
3.187
3.188 @@ -224,13 +193,13 @@
3.189 # Read each byte, adding it to the number.
3.190
3.191 a = array('B')
3.192 - a += self.from_cache(1)
3.193 + a.fromfile(self.f, 1)
3.194 csd = a[-1]
3.195 if csd < 128:
3.196 return csd
3.197 else:
3.198 while csd & 128:
3.199 - a += self.from_cache(1)
3.200 + a.fromfile(self.f, 1)
3.201 csd = a[-1]
3.202 return vint_from_array(a)
3.203
3.204 @@ -292,13 +261,9 @@
3.205 return values
3.206
3.207 def read_delta_sequence(self, size):
3.208 - values = self.read_sequences(size)
3.209 - convert_sequence(values, get_adder(values[0]))
3.210 - return values
3.211 + return convert_sequence(self.read_sequences(size), get_adder(size), 0)
3.212
3.213 def read_monotonic_sequence(self, size):
3.214 - values = self.read_sequences(size)
3.215 - convert_sequence(values, get_monotonic_adder(values[0]))
3.216 - return values
3.217 + return convert_sequence(self.read_sequences(size), get_monotonic_adder(size), 0)
3.218
3.219 # vim: tabstop=4 expandtab shiftwidth=4
4.1 --- a/iixr/filesystem.py Sat Feb 12 01:23:58 2011 +0100
4.2 +++ b/iixr/filesystem.py Sun Feb 13 02:49:55 2011 +0100
4.3 @@ -3,7 +3,7 @@
4.4 """
4.5 File access.
4.6
4.7 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
4.8 +Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
4.9
4.10 This program is free software; you can redistribute it and/or modify it under
4.11 the terms of the GNU General Public License as published by the Free Software
4.12 @@ -18,9 +18,7 @@
4.13 with this program. If not, see <http://www.gnu.org/licenses/>.
4.14 """
4.15
4.16 -from iixr.fields import *
4.17 from iixr.terms import *
4.18 -from iixr.positions import *
4.19 from os import listdir, remove, rename # partition manipulation
4.20 from shutil import copy # index updating
4.21 from os.path import join
4.22 @@ -32,8 +30,7 @@
4.23
4.24 # Constants.
4.25
4.26 -TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
4.27 -FIELD_FILENAMES = "fields", "fields_index"
4.28 +TERM_FILENAMES = "terms",
4.29
4.30 # Utility functions.
4.31
4.32 @@ -49,7 +46,9 @@
4.33 partitions = set()
4.34 for filename in listdir(pathname):
4.35 if filename.startswith(prefix):
4.36 - partitions.add(filename[prefix_length:])
4.37 + partition = filename[prefix_length:]
4.38 + if partition.isdigit():
4.39 + partitions.add(int(partition))
4.40 return partitions
4.41
4.42 def get_term_partitions(pathname):
4.43 @@ -61,95 +60,40 @@
4.44
4.45 return get_partitions(pathname, "terms-")
4.46
4.47 -def get_field_partitions(pathname):
4.48 +def get_next_partition(partitions):
4.49 + return max(partitions or [-1]) + 1
4.50 +
4.51 +def get_term_writer(pathname, partition):
4.52
4.53 """
4.54 - Return a set of field partition identifiers for partitions residing at the
4.55 - given 'pathname'.
4.56 - """
4.57 -
4.58 - return get_partitions(pathname, "fields-")
4.59 -
4.60 -def get_next_partition(partitions):
4.61 - return max([int(n) for n in partitions if n.isdigit()] or [-1]) + 1
4.62 -
4.63 -def get_term_writer(pathname, partition, interval, doc_interval):
4.64 -
4.65 - """
4.66 - Return a term dictionary writer using files under the given 'pathname'
4.67 - labelled according to the given 'partition', using the given indexing
4.68 - 'interval' for terms and 'doc_interval' for document position records.
4.69 + Return a term writer using files under the given 'pathname' labelled
4.70 + according to the given 'partition'.
4.71 """
4.72
4.73 - tdf = open(join(pathname, "terms-%s" % partition), "wb")
4.74 - info_writer = TermWriter(tdf)
4.75 -
4.76 - tdif = open(join(pathname, "terms_index-%s" % partition), "wb")
4.77 - index_writer = TermIndexWriter(tdif)
4.78 -
4.79 - tpf = open(join(pathname, "positions-%s" % partition), "wb")
4.80 - positions_writer = PositionWriter(tpf)
4.81 -
4.82 - tpif = open(join(pathname, "positions_index-%s" % partition), "wb")
4.83 - positions_index_writer = PositionIndexWriter(tpif)
4.84 -
4.85 - positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)
4.86 -
4.87 - return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
4.88 + f = open(join(pathname, "terms-%s" % partition), "wb")
4.89 + return TermWriter(f)
4.90
4.91 -def get_field_writer(pathname, partition, interval):
4.92 -
4.93 - """
4.94 - Return a field dictionary writer using files under the given 'pathname'
4.95 - labelled according to the given 'partition', using the given indexing
4.96 - 'interval'.
4.97 - """
4.98 -
4.99 - ff = open(join(pathname, "fields-%s" % partition), "wb")
4.100 - field_writer = FieldWriter(ff)
4.101 -
4.102 - fif = open(join(pathname, "fields_index-%s" % partition), "wb")
4.103 - field_index_writer = FieldIndexWriter(fif)
4.104 -
4.105 - return FieldDictionaryWriter(field_writer, field_index_writer, interval)
4.106 +def get_reader(pathname, name, partition, cls):
4.107 + f = open(join(pathname, "%s-%s" % (name, partition)), "rb")
4.108 + return cls(f)
4.109
4.110 def get_term_reader(pathname, partition):
4.111
4.112 """
4.113 - Return a term dictionary reader using files under the given 'pathname'
4.114 + Return a term reader using files under the given 'pathname' labelled
4.115 + according to the given 'partition'.
4.116 + """
4.117 +
4.118 + return get_reader(pathname, "terms", partition, TermIterator)
4.119 +
4.120 +def get_term_data_reader(pathname, partition):
4.121 +
4.122 + """
4.123 + Return a term plus data reader using files under the given 'pathname'
4.124 labelled according to the given 'partition'.
4.125 """
4.126
4.127 - tdf = open(join(pathname, "terms-%s" % partition), "rb")
4.128 - info_reader = TermReader(tdf)
4.129 -
4.130 - tdif = open(join(pathname, "terms_index-%s" % partition), "rb")
4.131 - index_reader = TermIndexReader(tdif)
4.132 -
4.133 - pf = open(join(pathname, "positions-%s" % partition), "rb")
4.134 - position_reader = PositionReader(pf)
4.135 -
4.136 - pif = open(join(pathname, "positions_index-%s" % partition), "rb")
4.137 - position_index_reader = PositionIndexReader(pif)
4.138 -
4.139 - position_dict_reader = PositionDictionaryReader(position_reader, position_index_reader)
4.140 -
4.141 - return TermDictionaryReader(info_reader, index_reader, position_dict_reader)
4.142 -
4.143 -def get_field_reader(pathname, partition):
4.144 -
4.145 - """
4.146 - Return a field dictionary reader using files under the given 'pathname'
4.147 - labelled according to the given 'partition'.
4.148 - """
4.149 -
4.150 - ff = open(join(pathname, "fields-%s" % partition), "rb")
4.151 - field_reader = FieldReader(ff)
4.152 -
4.153 - fif = open(join(pathname, "fields_index-%s" % partition), "rb")
4.154 - field_index_reader = FieldIndexReader(fif)
4.155 -
4.156 - return FieldDictionaryReader(field_reader, field_index_reader)
4.157 + return get_reader(pathname, "terms", partition, TermDataIterator)
4.158
4.159 # Renaming.
4.160
4.161 @@ -160,9 +104,6 @@
4.162 def rename_term_files(pathname, from_partition, to_partition):
4.163 rename_files(pathname, TERM_FILENAMES, from_partition, to_partition)
4.164
4.165 -def rename_field_files(pathname, from_partition, to_partition):
4.166 - rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
4.167 -
4.168 # Removal/deletion.
4.169
4.170 def remove_files(pathname, names, partition):
4.171 @@ -172,9 +113,6 @@
4.172 def remove_term_files(pathname, partition):
4.173 remove_files(pathname, TERM_FILENAMES, partition)
4.174
4.175 -def remove_field_files(pathname, partition):
4.176 - remove_files(pathname, FIELD_FILENAMES, partition)
4.177 -
4.178 # Copying.
4.179
4.180 def copy_files(source, names, partition, destination, suffix):
4.181 @@ -185,7 +123,4 @@
4.182 def copy_term_files(source, partition, destination, suffix):
4.183 copy_files(source, TERM_FILENAMES, partition, destination, suffix)
4.184
4.185 -def copy_field_files(source, partition, destination, suffix):
4.186 - copy_files(source, FIELD_FILENAMES, partition, destination, suffix)
4.187 -
4.188 # vim: tabstop=4 expandtab shiftwidth=4
5.1 --- a/iixr/index.py Sat Feb 12 01:23:58 2011 +0100
5.2 +++ b/iixr/index.py Sun Feb 13 02:49:55 2011 +0100
5.3 @@ -19,18 +19,14 @@
5.4 """
5.5
5.6 from iixr.filesystem import *
5.7 -from iixr.merging import *
5.8 -from itertools import islice
5.9 +from itermerge import itermerge
5.10 from os import mkdir # index discovery
5.11 from os.path import exists
5.12 +import operator
5.13
5.14 # Constants.
5.15
5.16 -TERM_INTERVAL = 100
5.17 -DOCUMENT_INTERVAL = 100
5.18 -FIELD_INTERVAL = 100
5.19 FLUSH_INTERVAL = 10000
5.20 -POSITIONS_FLUSH_INTERVAL = 1000000
5.21 OPEN_PARTITIONS = 20
5.22
5.23 # High-level classes.
5.24 @@ -39,11 +35,9 @@
5.25
5.26 "A container of document information."
5.27
5.28 - def __init__(self, docnum, fields=None):
5.29 + def __init__(self, docnum):
5.30 self.docnum = docnum
5.31 - self.fields = fields or []
5.32 self.terms = {}
5.33 - self.field_dict = None
5.34
5.35 def add_position(self, term, position):
5.36
5.37 @@ -54,55 +48,18 @@
5.38
5.39 self.terms.setdefault(term, []).append(position)
5.40
5.41 - def add_field(self, identifier, value):
5.42 -
5.43 - "Add a field having the given 'identifier' and 'value'."
5.44 -
5.45 - self.fields.append((identifier, unicode(value))) # convert to string
5.46 -
5.47 - def set_fields(self, fields):
5.48 -
5.49 - """
5.50 - Set the document's 'fields': a list of tuples each containing an integer
5.51 - identifier and a string value.
5.52 - """
5.53 -
5.54 - self.fields = fields
5.55 -
5.56 - def _ensure_dict(self):
5.57 - if self.field_dict is None:
5.58 - self.field_dict = dict(self.fields)
5.59 -
5.60 - def keys(self):
5.61 - self._ensure_dict()
5.62 - return self.field_dict.keys()
5.63 -
5.64 - def __getitem__(self, key):
5.65 - self._ensure_dict()
5.66 - return self.field_dict[key]
5.67 -
5.68 class IndexWriter:
5.69
5.70 - """
5.71 - Building term information and writing it to the term and field dictionaries.
5.72 - """
5.73 + "Building term information and writing it to the term dictionary."
5.74
5.75 - def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval, positions_flush_interval):
5.76 + def __init__(self, pathname, flush_interval):
5.77 self.pathname = pathname
5.78 - self.interval = interval
5.79 - self.doc_interval = doc_interval
5.80 - self.field_interval = field_interval
5.81 self.flush_interval = flush_interval
5.82 - self.positions_flush_interval = positions_flush_interval
5.83
5.84 - self.dict_partition = get_next_partition(get_term_partitions(self.pathname))
5.85 - self.field_dict_partition = get_next_partition(get_field_partitions(self.pathname))
5.86 + self.term_partition = get_next_partition(get_term_partitions(self.pathname))
5.87
5.88 self.terms = {}
5.89 - self.docs = []
5.90 -
5.91 self.doc_counter = 0
5.92 - self.position_counter = 0
5.93
5.94 def add_document(self, doc):
5.95
5.96 @@ -115,134 +72,105 @@
5.97
5.98 for term, positions in doc.terms.items():
5.99 self.terms.setdefault(term, {})[docnum] = positions
5.100 - self.position_counter += len(positions)
5.101 -
5.102 - self.docs.append((docnum, doc.fields))
5.103
5.104 self.doc_counter += 1
5.105
5.106 - if self.flush_interval and self.doc_counter >= self.flush_interval or \
5.107 - self.positions_flush_interval and self.position_counter >= self.positions_flush_interval:
5.108 -
5.109 + if self.flush_interval and self.doc_counter >= self.flush_interval:
5.110 self.flush_terms()
5.111 - self.flush_fields()
5.112 self.doc_counter = 0
5.113 - self.position_counter = 0
5.114
5.115 def get_term_writer(self):
5.116
5.117 - "Return a term dictionary writer for the current partition."
5.118 -
5.119 - return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)
5.120 + "Return a term writer for the current partition."
5.121
5.122 - def get_field_writer(self):
5.123 -
5.124 - "Return a field dictionary writer for the current partition."
5.125 -
5.126 - return get_field_writer(self.pathname, self.field_dict_partition, self.field_interval)
5.127 + return get_term_writer(self.pathname, self.term_partition)
5.128
5.129 def flush_terms(self):
5.130
5.131 - "Flush terms into the current term dictionary partition."
5.132 + "Flush terms into the current term partition."
5.133
5.134 # Get the terms in order.
5.135
5.136 - all_terms = self.terms
5.137 - terms = all_terms.keys()
5.138 - terms.sort()
5.139 -
5.140 - dict_writer = self.get_term_writer()
5.141 -
5.142 - for term in terms:
5.143 - doc_positions = all_terms[term].items()
5.144 - dict_writer.write_term_positions(term, doc_positions)
5.145 -
5.146 - dict_writer.close()
5.147 + term_writer = self.get_term_writer()
5.148 + try:
5.149 + term_writer.write_terms(self.terms)
5.150 + finally:
5.151 + term_writer.close()
5.152
5.153 self.terms = {}
5.154 - self.dict_partition += 1
5.155 -
5.156 - def flush_fields(self):
5.157 -
5.158 - "Flush fields into the current term dictionary partition."
5.159 -
5.160 - # Get the documents in order.
5.161 -
5.162 - self.docs.sort()
5.163 -
5.164 - field_dict_writer = self.get_field_writer()
5.165 - for docnum, fields in self.docs:
5.166 - field_dict_writer.write_fields(docnum, fields)
5.167 - field_dict_writer.close()
5.168 -
5.169 - self.docs = []
5.170 - self.field_dict_partition += 1
5.171 + self.term_partition += 1
5.172
5.173 def close(self):
5.174 if self.terms or not get_term_partitions(self.pathname):
5.175 self.flush_terms()
5.176 - if self.docs or not get_field_partitions(self.pathname):
5.177 - self.flush_fields()
5.178 +
5.179 +class IndexReader(itermerge):
5.180 +
5.181 + "Accessing the term dictionaries."
5.182
5.183 -class IndexReader:
5.184 + def __init__(self, pathname, get_reader=None, combine=None):
5.185
5.186 - "Accessing the term and field dictionaries."
5.187 + # Get the partitions in order.
5.188 +
5.189 + partitions = list(get_term_partitions(pathname))
5.190 + partitions.sort()
5.191
5.192 - def __init__(self, pathname):
5.193 - self.dict_reader = get_term_reader(pathname, "merged")
5.194 - self.field_dict_reader = get_field_reader(pathname, "merged")
5.195 + # Initialise the underlying term partition readers.
5.196
5.197 - # Sequential access.
5.198 + self.readers = [(get_reader or get_term_reader)(pathname, partition) for partition in partitions]
5.199 + self.combine = combine or operator.add
5.200 +
5.201 + # Initialise this object as an iterator over the readers.
5.202
5.203 - def read_term(self):
5.204 - return self.dict_reader.read_term()
5.205 + itermerge.__init__(self, self.readers)
5.206 + self.next_value = None
5.207
5.208 - def go_to_term(self, term):
5.209 - return self.dict_reader._get_term_and_positions(*self.dict_reader.go_to_term(term))
5.210 + def get_sizes(self):
5.211
5.212 - # Query access.
5.213 + # Readers must have compatible sizes.
5.214
5.215 - def get_terms(self):
5.216 - return self.dict_reader.get_terms()
5.217 -
5.218 - def find_terms(self, term):
5.219 - return self.dict_reader.find_terms(term)
5.220 + if self.readers:
5.221 + return self.readers[0].get_sizes()
5.222 + else:
5.223 + return 0, 0
5.224
5.225 - def find_positions(self, term):
5.226 - return self.dict_reader.find_positions(term)
5.227 + def next(self):
5.228 + if self.next_value is not None:
5.229 + term, positions = self.next_value
5.230 + else:
5.231 + term, positions = itermerge.next(self)
5.232
5.233 - def find_common_positions(self, terms):
5.234 - return self.dict_reader.find_common_positions(terms)
5.235 +        # Look at the next item to see if it has positions for the current
5.236 + # term.
5.237
5.238 - def get_frequency(self, term):
5.239 - return self.dict_reader.get_frequency(term)
5.240 -
5.241 - def get_document_frequency(self, term):
5.242 - return self.dict_reader.get_document_frequency(term)
5.243 + try:
5.244 + t, p = itermerge.next(self)
5.245 + while t == term:
5.246 + positions = self.combine(positions, p)
5.247 + t, p = itermerge.next(self)
5.248 + self.next_value = t, p
5.249
5.250 - def get_fields(self, docnum):
5.251 - return self.field_dict_reader.get_fields(docnum)
5.252 +        # Where no item could be fetched, clear the lookahead so that a
5.253
5.254 - def get_document(self, docnum):
5.255 - return Document(docnum, self.get_fields(docnum))
5.256 + except StopIteration:
5.257 + self.next_value = None
5.258 +
5.259 + return term, positions
5.260
5.261 def close(self):
5.262 - self.dict_reader.close()
5.263 - self.field_dict_reader.close()
5.264 + for reader in self.readers:
5.265 + reader.close()
5.266 + self.readers = []
5.267
5.268 class Index:
5.269
5.270 "An inverted index solution encapsulating the various components."
5.271
5.272 - def __init__(self, pathname, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, field_interval=FIELD_INTERVAL,
5.273 - flush_interval=FLUSH_INTERVAL, positions_flush_interval=POSITIONS_FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS):
5.274 + def __init__(self, pathname, flush_interval=FLUSH_INTERVAL,
5.275 + open_partitions=OPEN_PARTITIONS):
5.276
5.277 self.pathname = pathname
5.278 - self.interval = interval
5.279 - self.doc_interval = doc_interval
5.280 - self.field_interval = field_interval
5.281 self.flush_interval = flush_interval
5.282 - self.positions_flush_interval = positions_flush_interval
5.283 self.open_partitions = open_partitions
5.284 self.reader = None
5.285 self.writer = None
5.286 @@ -251,132 +179,60 @@
5.287
5.288 "Return a writer."
5.289
5.290 - self._ensure_directory()
5.291 - self.writer = IndexWriter(self.pathname, self.interval, self.doc_interval,
5.292 - self.field_interval, self.flush_interval, self.positions_flush_interval)
5.293 + if self.writer is None:
5.294 + self._ensure_directory()
5.295 + self.writer = IndexWriter(self.pathname, self.flush_interval)
5.296 return self.writer
5.297
5.298 def _ensure_directory(self):
5.299 if not exists(self.pathname):
5.300 mkdir(self.pathname)
5.301
5.302 - def get_reader(self, partition=0):
5.303 -
5.304 - "Return a reader for the index."
5.305 -
5.306 - # Ensure that only one partition exists.
5.307 -
5.308 - self.merge()
5.309 - return self._get_reader(partition)
5.310 -
5.311 - def _get_reader(self, partition):
5.312 + def get_reader(self, refresh=0):
5.313
5.314 "Return a reader for the index."
5.315
5.316 - if not exists(self.pathname):
5.317 - raise OSError, "Index path %r does not exist." % self.pathname
5.318 -
5.319 - self.reader = IndexReader(self.pathname)
5.320 - return self.reader
5.321 -
5.322 - def get_term_partitions(self):
5.323 + if refresh and self.reader is not None:
5.324 + self.reader.close()
5.325 + self.reader = None
5.326
5.327 - "Return a set of term partition identifiers."
5.328 -
5.329 - return get_term_partitions(self.pathname)
5.330 -
5.331 - def get_field_partitions(self):
5.332 -
5.333 - "Return a set of field partition identifiers."
5.334 -
5.335 - return get_field_partitions(self.pathname)
5.336 + if self.reader is None:
5.337 + if not exists(self.pathname):
5.338 + raise OSError, "Index path %r does not exist." % self.pathname
5.339 + self.reader = IndexReader(self.pathname)
5.340 + return self.reader
5.341
5.342 def merge(self):
5.343
5.344 - "Merge/optimise index partitions."
5.345 -
5.346 - self._merge_terms()
5.347 - self._merge_fields()
5.348 -
5.349 - def _merge_dictionaries(self, get_partitions, rename_files, remove_files, get_reader, get_writer, get_merger, intervals):
5.350 -
5.351 - "Merge term or field dictionaries."
5.352 -
5.353 - partitions = get_partitions()
5.354 -
5.355 - # Ensure the correct labelling of a single partition.
5.356 -
5.357 - if len(partitions) == 1:
5.358 - partition = list(partitions)[0]
5.359 - if partition != "merged":
5.360 - rename_files(self.pathname, partition, "merged")
5.361 - return
5.362 + "Merge the partitions in the index."
5.363
5.364 - # Merge the partitions.
5.365 -
5.366 - old_merged_counter = 0
5.367 -
5.368 - while len(partitions) > 1:
5.369 -
5.370 - if "merged" in partitions:
5.371 - rename_files(self.pathname, "merged", "old-merged-%d" % old_merged_counter)
5.372 - partitions.remove("merged")
5.373 - partitions.add("old-merged-%d" % old_merged_counter)
5.374 - old_merged_counter += 1
5.375 -
5.376 - # Process only a certain number at once, avoiding resource limits.
5.377 -
5.378 - active_partitions = list(islice(partitions, self.open_partitions))
5.379 -
5.380 - readers = []
5.381 - for partition in active_partitions:
5.382 - readers.append(get_reader(self.pathname, partition))
5.383 -
5.384 - # Write directly to a dictionary.
5.385 + reader = IndexReader(self.pathname, get_term_data_reader, self.merge_data)
5.386 + writer = get_term_writer(self.pathname, "merged")
5.387 + try:
5.388 + writer.begin(*reader.get_sizes())
5.389 + for term, data in reader:
5.390 + writer.write_term_plus_remaining(term, data)
5.391 + writer.end_record()
5.392 + finally:
5.393 + writer.close()
5.394 + reader.close()
5.395
5.396 - writer = get_writer(self.pathname, "merged", *intervals)
5.397 - merger = get_merger(writer, readers)
5.398 - merger.merge()
5.399 - merger.close()
5.400 -
5.401 - # Remove old files.
5.402 -
5.403 - for partition in active_partitions:
5.404 - remove_files(self.pathname, partition)
5.405 + for partition in get_term_partitions(self.pathname):
5.406 + remove_term_files(self.pathname, partition)
5.407
5.408 - # Acquire the partitions to check their number again.
5.409 -
5.410 - partitions = get_partitions()
5.411 -
5.412 - def _merge_terms(self):
5.413 + rename_term_files(self.pathname, "merged", 0)
5.414
5.415 - "Merge term dictionaries."
5.416 -
5.417 - self._merge_dictionaries(self.get_term_partitions, rename_term_files,
5.418 - remove_term_files, get_term_reader, get_term_writer,
5.419 - TermDictionaryMerger, [self.interval, self.doc_interval])
5.420 + def merge_data(self, a, b):
5.421
5.422 - def _merge_fields(self):
5.423 -
5.424 - "Merge field dictionaries."
5.425 -
5.426 - self._merge_dictionaries(self.get_field_partitions, rename_field_files,
5.427 - remove_field_files, get_field_reader, get_field_writer,
5.428 - FieldDictionaryMerger, [self.field_interval])
5.429 -
5.430 - def update(self, other_indexes):
5.431 + """
5.432 + Merge 'a' and 'b', modifying the data to permit concatenation.
5.433 + """
5.434
5.435 - "Copy the content of the 'other_indexes' into this index and merge."
5.436 -
5.437 - self._ensure_directory()
5.438 + # Modify the record to indicate a continuation of the data.
5.439
5.440 - for i, index in enumerate(other_indexes):
5.441 - for partition in index.get_term_partitions():
5.442 - copy_term_files(index.pathname, partition, self.pathname, "-added-%d" % i)
5.443 - for partition in index.get_field_partitions():
5.444 - copy_field_files(index.pathname, partition, self.pathname, "-added-%d" % i)
5.445 -
5.446 - self.merge()
5.447 + c = a + b
5.448 + c[len(a) - 1] = 1
5.449 + return c
5.450
5.451 def close(self):
5.452 if self.reader is not None:
6.1 --- a/iixr/merging.py Sat Feb 12 01:23:58 2011 +0100
6.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
6.3 @@ -1,89 +0,0 @@
6.4 -#!/usr/bin/env python
6.5 -
6.6 -"""
6.7 -Dictionary merging classes.
6.8 -
6.9 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
6.10 -
6.11 -This program is free software; you can redistribute it and/or modify it under
6.12 -the terms of the GNU General Public License as published by the Free Software
6.13 -Foundation; either version 3 of the License, or (at your option) any later
6.14 -version.
6.15 -
6.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
6.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6.18 -PARTICULAR PURPOSE. See the GNU General Public License for more details.
6.19 -
6.20 -You should have received a copy of the GNU General Public License along
6.21 -with this program. If not, see <http://www.gnu.org/licenses/>.
6.22 -"""
6.23 -
6.24 -from itermerge import itermerge
6.25 -
6.26 -class Merger:
6.27 -
6.28 - "Merge files."
6.29 -
6.30 - def __init__(self, writer, readers):
6.31 - self.writer = writer
6.32 - self.readers = readers
6.33 -
6.34 - def close(self):
6.35 - for reader in self.readers:
6.36 - reader.close()
6.37 - self.readers = []
6.38 - if self.writer is not None:
6.39 - self.writer.close()
6.40 - self.writer = None
6.41 -
6.42 -class TermDictionaryMerger(Merger):
6.43 -
6.44 - "Merge term and position files."
6.45 -
6.46 - def merge(self):
6.47 -
6.48 - """
6.49 - Merge terms and positions from the readers, sending them to the writer.
6.50 - """
6.51 -
6.52 - last_term = None
6.53 - current_readers = []
6.54 -
6.55 - for term, frequency, doc_frequency, positions in itermerge(self.readers):
6.56 - if term == last_term:
6.57 - current_readers.append(positions)
6.58 - else:
6.59 - if current_readers:
6.60 - self.writer.write_term_positions(last_term, itermerge(current_readers))
6.61 - last_term = term
6.62 - current_readers = [positions]
6.63 - else:
6.64 - if current_readers:
6.65 - self.writer.write_term_positions(last_term, itermerge(current_readers))
6.66 -
6.67 -class FieldDictionaryMerger(Merger):
6.68 -
6.69 - "Merge field files."
6.70 -
6.71 - def merge(self):
6.72 -
6.73 - """
6.74 - Merge fields from the readers, sending them to the writer.
6.75 - """
6.76 -
6.77 - last_docnum = None
6.78 - current_fields = []
6.79 -
6.80 - for docnum, fields in itermerge(self.readers):
6.81 - if docnum == last_docnum:
6.82 - current_fields += fields
6.83 - else:
6.84 - if current_fields:
6.85 - self.writer.write_fields(last_docnum, current_fields)
6.86 - last_docnum = docnum
6.87 - current_fields = fields
6.88 - else:
6.89 - if current_fields:
6.90 - self.writer.write_fields(last_docnum, current_fields)
6.91 -
6.92 -# vim: tabstop=4 expandtab shiftwidth=4
7.1 --- a/iixr/positions.py Sat Feb 12 01:23:58 2011 +0100
7.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
7.3 @@ -1,566 +0,0 @@
7.4 -#!/usr/bin/env python
7.5 -
7.6 -"""
7.7 -Specific classes for storing position information.
7.8 -
7.9 -Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
7.10 -
7.11 -This program is free software; you can redistribute it and/or modify it under
7.12 -the terms of the GNU General Public License as published by the Free Software
7.13 -Foundation; either version 3 of the License, or (at your option) any later
7.14 -version.
7.15 -
7.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
7.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
7.18 -PARTICULAR PURPOSE. See the GNU General Public License for more details.
7.19 -
7.20 -You should have received a copy of the GNU General Public License along
7.21 -with this program. If not, see <http://www.gnu.org/licenses/>.
7.22 -"""
7.23 -
7.24 -from iixr.data import *
7.25 -from iixr.files import *
7.26 -
7.27 -class PositionWriter(FileWriter):
7.28 -
7.29 - "Writing position information to files."
7.30 -
7.31 - def begin(self, docnum_size, position_size):
7.32 - self.write_numbers((docnum_size, position_size))
7.33 - self.end_record()
7.34 - self.data_start = self.tell()
7.35 - self.docnum_size = docnum_size
7.36 - self.position_size = position_size
7.37 -
7.38 - def reset(self):
7.39 - self.end_record()
7.40 - self.last_docnum = None
7.41 - self.subtractor = None
7.42 -
7.43 - def write_positions(self, docnum, positions):
7.44 -
7.45 - """
7.46 - Write for the document 'docnum' the given 'positions'.
7.47 - """
7.48 -
7.49 - if not positions:
7.50 - return
7.51 -
7.52 - # Make sure that the positions are sorted.
7.53 -
7.54 - positions.sort()
7.55 -
7.56 - # Calculate an ongoing delta.
7.57 -
7.58 - if self.last_docnum is not None:
7.59 - if docnum < self.last_docnum:
7.60 - raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
7.61 -
7.62 - docnum_seq = self.subtractor(docnum, self.last_docnum)
7.63 -
7.64 - # Or preserve the document number and prepare for future deltas.
7.65 -
7.66 - else:
7.67 - self.subtractor = get_subtractor(docnum)
7.68 - docnum_seq = docnum
7.69 -
7.70 - self.write_sequence_value(docnum_seq, self.docnum_size)
7.71 - self.write_monotonic_sequence(positions, self.position_size)
7.72 -
7.73 - self.last_docnum = docnum
7.74 -
7.75 -class PositionReader(FileReader):
7.76 -
7.77 - "Reading position information within term-specific regions of a file."
7.78 -
7.79 - def begin(self):
7.80 - self.begin_record()
7.81 - try:
7.82 - self.docnum_size, self.position_size = self.read_numbers(2)
7.83 - except EOFError:
7.84 - self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
7.85 - self.data_start = self.tell()
7.86 -
7.87 - def reset(self):
7.88 - self.last_docnum = None
7.89 - self.adder = None
7.90 - self.begin_record()
7.91 -
7.92 - def read_positions(self):
7.93 -
7.94 - """
7.95 - Read positions, returning a document number and a list of positions.
7.96 - """
7.97 -
7.98 - # Read the document number.
7.99 -
7.100 - docnum = self.read_sequence_value(self.docnum_size)
7.101 -
7.102 - # Calculate an ongoing delta.
7.103 -
7.104 - if self.last_docnum is not None:
7.105 - self.last_docnum = self.adder(docnum, self.last_docnum)
7.106 -
7.107 - # Or preserve the document number and prepare for future deltas.
7.108 -
7.109 - else:
7.110 - self.adder = get_adder(docnum)
7.111 - self.last_docnum = docnum
7.112 -
7.113 - positions = self.read_monotonic_sequence(self.position_size)
7.114 -
7.115 - return self.last_docnum, positions
7.116 -
7.117 -class PositionIndexWriter(PositionWriter):
7.118 -
7.119 - "Writing position index information to files."
7.120 -
7.121 - def begin(self, docnum_size):
7.122 - PositionWriter.begin(self, docnum_size, 0)
7.123 -
7.124 - def reset(self):
7.125 - PositionWriter.reset(self)
7.126 - self.last_pos_offset = 0
7.127 -
7.128 - def write_positions(self, docnum, pos_offset, count):
7.129 -
7.130 - """
7.131 - Write the given 'docnum, 'pos_offset' and document 'count' to the
7.132 - position index file.
7.133 - """
7.134 -
7.135 - # Find the size of document number values.
7.136 -
7.137 - if self.last_docnum is not None:
7.138 - docnum_seq = self.subtractor(docnum, self.last_docnum)
7.139 - else:
7.140 - self.subtractor = get_subtractor(docnum)
7.141 - docnum_seq = docnum
7.142 -
7.143 - self.write_sequence_value(docnum_seq, self.docnum_size)
7.144 - self.write_number(pos_offset - self.last_pos_offset)
7.145 - self.write_number(count)
7.146 -
7.147 - self.last_docnum = docnum
7.148 - self.last_pos_offset = pos_offset
7.149 -
7.150 -class PositionIndexReader(PositionReader):
7.151 -
7.152 - "Reading position index information within term-specific regions of a file."
7.153 -
7.154 - def reset(self):
7.155 - PositionReader.reset(self)
7.156 - self.last_pos_offset = 0
7.157 -
7.158 - def read_positions(self):
7.159 -
7.160 - """
7.161 - Read a document number, a position file offset for the position index
7.162 - file, and the number of documents in a section of that file.
7.163 - """
7.164 -
7.165 - # Read the document number.
7.166 -
7.167 - docnum = self.read_sequence_value(self.docnum_size)
7.168 -
7.169 - if self.last_docnum is not None:
7.170 - self.last_docnum = self.adder(docnum, self.last_docnum)
7.171 - else:
7.172 - self.adder = get_adder(docnum)
7.173 - self.last_docnum = docnum
7.174 -
7.175 - # Read the offset delta.
7.176 -
7.177 - self.last_pos_offset += self.read_number()
7.178 -
7.179 - # Read the document count.
7.180 -
7.181 - count = self.read_number()
7.182 -
7.183 - return self.last_docnum, self.last_pos_offset, count
7.184 -
7.185 -# Iterators for position-related files.
7.186 -
7.187 -class IteratorBase:
7.188 -
7.189 - "Support for iterating over results."
7.190 -
7.191 - def __init__(self, reader):
7.192 -
7.193 - "Initialise the iterator using the given 'reader'."
7.194 -
7.195 - self.reader = reader
7.196 - self.replenish(0) # no iteration initially permitted
7.197 -
7.198 - def replenish(self, count):
7.199 -
7.200 - "Replenish the iterator with 'count' results."
7.201 -
7.202 - self.count = count
7.203 - self.read_documents = 0
7.204 -
7.205 - def __len__(self):
7.206 -
7.207 - "Return the total number of results."
7.208 -
7.209 - return self.count
7.210 -
7.211 - def sort(self):
7.212 - pass # Stored document positions are already sorted.
7.213 -
7.214 - def __iter__(self):
7.215 - return self
7.216 -
7.217 -class PositionIterator(IteratorBase):
7.218 -
7.219 - "Iterating over document positions."
7.220 -
7.221 - def replenish(self, count):
7.222 - IteratorBase.replenish(self, count)
7.223 -
7.224 - # Fill a cache of positions.
7.225 -
7.226 - self.cache = []
7.227 - n = 0
7.228 -
7.229 - while n < self.count:
7.230 - self.cache.append(self.reader.read_positions())
7.231 - n += 1
7.232 -
7.233 - def seek(self, offset, count):
7.234 -
7.235 - """
7.236 - Seek to 'offset' in the file, limiting the number of documents available
7.237 - for reading to 'count'.
7.238 - """
7.239 -
7.240 - self.reader.seek(offset)
7.241 - self.replenish(count)
7.242 -
7.243 - def next(self):
7.244 -
7.245 - "Read positions for a single document."
7.246 -
7.247 - if self.read_documents < self.count:
7.248 - positions = self.cache[self.read_documents]
7.249 - self.read_documents += 1
7.250 - return positions
7.251 - else:
7.252 - raise StopIteration
7.253 -
7.254 -class PositionIndexIterator(IteratorBase):
7.255 -
7.256 - "Iterating over document positions."
7.257 -
7.258 - def replenish(self, count):
7.259 - IteratorBase.replenish(self, count)
7.260 -
7.261 - # Fill a cache of offsets.
7.262 -
7.263 - self.cache = []
7.264 - self.current = 0
7.265 - n = 0
7.266 -
7.267 - while n < self.count:
7.268 - docnum, pos_offset, section_count = t = self.reader.read_positions()
7.269 - self.cache.append(t)
7.270 - n += section_count
7.271 -
7.272 - def seek(self, offset, doc_frequency):
7.273 -
7.274 - """
7.275 - Seek to 'offset' in the file, limiting the number of documents available
7.276 - for reading to 'doc_frequency'.
7.277 - """
7.278 -
7.279 - self.reader.seek(offset)
7.280 - self.replenish(doc_frequency)
7.281 -
7.282 - def next(self):
7.283 -
7.284 - "Read positions for a single document."
7.285 -
7.286 - if self.current < len(self.cache):
7.287 - docnum, pos_offset, self.section_count = t = self.cache[self.current]
7.288 - self.current += 1
7.289 - return t
7.290 - else:
7.291 - raise StopIteration
7.292 -
7.293 -class PositionDictionaryWriter:
7.294 -
7.295 - "Writing position dictionaries."
7.296 -
7.297 - def __init__(self, position_writer, position_index_writer, interval):
7.298 - self.position_writer = position_writer
7.299 - self.position_index_writer = position_index_writer
7.300 - self.interval = interval
7.301 -
7.302 - def write_term_positions(self, doc_positions):
7.303 -
7.304 - """
7.305 - Write all 'doc_positions' - a collection of tuples of the form (document
7.306 - number, position list) - to the file.
7.307 -
7.308 - Add some records to the index, making dictionary entries.
7.309 -
7.310 - Return a tuple containing the offset of the written data, the frequency
7.311 - (number of positions), and document frequency (number of documents) for
7.312 - the term involved.
7.313 - """
7.314 -
7.315 - # Write the positions.
7.316 -
7.317 - frequency = 0
7.318 - count = 0
7.319 -
7.320 - if doc_positions:
7.321 - doc_positions.sort()
7.322 -
7.323 - # Look ahead at the first document record.
7.324 - # NOTE: Any iterator would need to support this.
7.325 -
7.326 - first_docnum, first_positions = doc_positions[0]
7.327 - first_position = first_positions[0]
7.328 -
7.329 - # Write out size details.
7.330 -
7.331 - docnum_size, position_size = sizeof(first_docnum), sizeof(first_position)
7.332 - self.position_writer.begin(docnum_size, position_size)
7.333 - self.position_index_writer.begin(docnum_size)
7.334 -
7.335 - # Reset the writers.
7.336 -
7.337 - self.position_writer.reset()
7.338 - self.position_index_writer.reset()
7.339 -
7.340 - # Remember the first index entry offset.
7.341 -
7.342 - index_offset = self.position_index_writer.tell()
7.343 -
7.344 - # Retain the first record offset for a subsequent index entry.
7.345 -
7.346 - first_offset = self.position_writer.tell()
7.347 -
7.348 - for docnum, positions in doc_positions:
7.349 - if first_docnum is None:
7.350 - first_docnum = docnum
7.351 -
7.352 - self.position_writer.write_positions(docnum, positions)
7.353 -
7.354 - frequency += len(positions)
7.355 - count += 1
7.356 -
7.357 - # Every {interval} entries, write an index entry.
7.358 -
7.359 - if count % self.interval == 0:
7.360 -
7.361 - self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
7.362 -
7.363 - # Reset the position writer so that position readers accessing
7.364 - # a section start with the correct document number.
7.365 -
7.366 - self.position_writer.reset()
7.367 -
7.368 - first_offset = self.position_writer.tell()
7.369 - first_docnum = None
7.370 -
7.371 - # Finish writing an index entry for the remaining documents.
7.372 -
7.373 - else:
7.374 - if first_docnum is not None:
7.375 - self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
7.376 -
7.377 - return index_offset, frequency, count
7.378 -
7.379 - def close(self):
7.380 - self.position_writer.close()
7.381 - self.position_index_writer.close()
7.382 -
7.383 -class PositionDictionaryReader:
7.384 -
7.385 - "Access to position dictionary entries through iterators."
7.386 -
7.387 - def __init__(self, position_reader, position_index_reader):
7.388 - self.position_reader = position_reader
7.389 - self.position_index_reader = position_index_reader
7.390 -
7.391 - def read_term_positions(self, offset, doc_frequency):
7.392 - iterator = PositionDictionaryIterator(
7.393 - PositionIterator(self.position_reader),
7.394 - PositionIndexIterator(self.position_index_reader)
7.395 - )
7.396 - iterator.seek(offset, doc_frequency)
7.397 - return iterator
7.398 -
7.399 - def close(self):
7.400 - self.position_reader.close()
7.401 - self.position_index_reader.close()
7.402 -
7.403 -class PositionDictionaryIterator:
7.404 -
7.405 - "Iteration over position dictionary entries."
7.406 -
7.407 - def __init__(self, position_iterator, position_index_iterator):
7.408 - self.position_iterator = position_iterator
7.409 - self.position_index_iterator = position_index_iterator
7.410 - self.reset()
7.411 -
7.412 - def reset(self):
7.413 -
7.414 - # Remember the last values.
7.415 -
7.416 - self.found_docnum, self.found_positions = None, None
7.417 -
7.418 - # Maintain state for the next index entry, if read.
7.419 -
7.420 - self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
7.421 -
7.422 - def seek(self, offset, doc_frequency):
7.423 -
7.424 - """
7.425 - Seek to 'offset' in the index file, limiting the number of documents
7.426 - available for reading to 'doc_frequency'.
7.427 - """
7.428 -
7.429 - self.reset()
7.430 -
7.431 - # Seek to the appropriate index entry.
7.432 -
7.433 - self.position_index_iterator.seek(offset, doc_frequency)
7.434 -
7.435 - # Initialise the current index entry and current position file iterator.
7.436 -
7.437 - self._next_section()
7.438 - self._init_section()
7.439 -
7.440 - # Sequence methods.
7.441 -
7.442 - def __len__(self):
7.443 - return len(self.position_index_iterator)
7.444 -
7.445 - def sort(self):
7.446 - pass
7.447 -
7.448 - # Iterator methods.
7.449 -
7.450 - def __iter__(self):
7.451 - return self
7.452 -
7.453 - def next(self):
7.454 -
7.455 - """
7.456 - Attempt to get the next document record from the section in the
7.457 - positions file.
7.458 - """
7.459 -
7.460 - # Return any visited but unrequested record.
7.461 -
7.462 - if self.found_docnum is not None:
7.463 - t = self.found_docnum, self.found_positions
7.464 - self.found_docnum, self.found_positions = None, None
7.465 - return t
7.466 -
7.467 - # Or search for the next record.
7.468 -
7.469 - while 1:
7.470 -
7.471 - # Either return the next record.
7.472 -
7.473 - try:
7.474 - return self.position_iterator.next()
7.475 -
7.476 - # Or, where a section is finished, get the next section and try again.
7.477 -
7.478 - except StopIteration:
7.479 -
7.480 - # Although, where a single iterator is in use, the file reader
7.481 - # would be positioned appropriately, this is not guaranteed in a
7.482 - # multiple iterator situation.
7.483 -
7.484 - self._next_section()
7.485 - self._init_section()
7.486 -
7.487 - def from_document(self, docnum):
7.488 -
7.489 - """
7.490 - Attempt to navigate to a positions entry for the given 'docnum',
7.491 - returning the positions for 'docnum', or None otherwise.
7.492 - """
7.493 -
7.494 - # Return any unrequested document positions.
7.495 -
7.496 - if docnum == self.found_docnum:
7.497 - return self.found_positions
7.498 -
7.499 - # Read ahead in the index until the next entry refers to a document
7.500 - # later than the desired document.
7.501 -
7.502 - try:
7.503 - if self.next_docnum is None:
7.504 - self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
7.505 -
7.506 - # Read until the next entry is after the desired document number,
7.507 - # or until the end of the results.
7.508 -
7.509 - while self.next_docnum <= docnum:
7.510 - self._next_read_section()
7.511 - if self.docnum < docnum:
7.512 - self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
7.513 - else:
7.514 - break
7.515 -
7.516 - except StopIteration:
7.517 - pass
7.518 -
7.519 - # Navigate in the position file to the document.
7.520 -
7.521 - self._init_section()
7.522 -
7.523 - try:
7.524 - while 1:
7.525 - found_docnum, found_positions = self.position_iterator.next()
7.526 -
7.527 - # Return the desired document positions or None (retaining the
7.528 - # positions for the document immediately after).
7.529 -
7.530 - if docnum <= found_docnum:
7.531 - self.found_docnum, self.found_positions = found_docnum, found_positions
7.532 - if docnum == found_docnum:
7.533 - return found_positions
7.534 - elif docnum < found_docnum:
7.535 - return None
7.536 -
7.537 - except StopIteration:
7.538 - return None
7.539 -
7.540 - # Internal methods.
7.541 -
7.542 - def _next_section(self):
7.543 -
7.544 - "Attempt to get the next section in the index."
7.545 -
7.546 - if self.next_docnum is None:
7.547 - self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()
7.548 - else:
7.549 - self._next_read_section()
7.550 -
7.551 - def _next_read_section(self):
7.552 -
7.553 - """
7.554 - Make the next index entry the current one without reading from the
7.555 - index.
7.556 - """
7.557 -
7.558 - self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
7.559 - self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
7.560 -
7.561 - def _init_section(self):
7.562 -
7.563 - "Initialise the iterator for the section in the position file."
7.564 -
7.565 - # Seek to the position entry.
7.566 -
7.567 - self.position_iterator.seek(self.pos_offset, self.section_count)
7.568 -
7.569 -# vim: tabstop=4 expandtab shiftwidth=4
8.1 --- a/iixr/terms.py Sat Feb 12 01:23:58 2011 +0100
8.2 +++ b/iixr/terms.py Sun Feb 13 02:49:55 2011 +0100
8.3 @@ -18,29 +18,87 @@
8.4 with this program. If not, see <http://www.gnu.org/licenses/>.
8.5 """
8.6
8.7 +from iixr.data import *
8.8 from iixr.files import *
8.9 -from iixr.positions import *
8.10 from iixr.phrases import PhraseIterator
8.11 from os.path import commonprefix # to find common string prefixes
8.12 -from bisect import bisect_right # to find terms in the dictionary index
8.13
8.14 class TermWriter(FileWriter):
8.15
8.16 "Writing term information to files."
8.17
8.18 - def reset(self):
8.19 + def begin(self, docnum_size, position_size):
8.20 +
8.21 + "Begin writing to the file."
8.22 +
8.23 + self.write_numbers((docnum_size, position_size))
8.24 self.end_record()
8.25 +
8.26 + self.data_start = self.tell()
8.27 + self.docnum_size = docnum_size
8.28 + self.position_size = position_size
8.29 + self.subtractor = get_subtractor(docnum_size)
8.30 self.last_term = ""
8.31 - self.last_offset = 0
8.32
8.33 - def write_term(self, term, offset, frequency, doc_frequency):
8.34 + def write_terms(self, terms):
8.35
8.36 """
8.37 - Write the given 'term', its position file 'offset', its 'frequency' and
8.38 - its 'doc_frequency' (number of documents in which it appears) to the
8.39 - term information file.
8.40 + Write the 'terms' to the term information file, with each term's details
8.41 + stored in a separate record.
8.42 """
8.43
8.44 + if hasattr(terms, "items"):
8.45 + terms = terms.items()
8.46 + terms.sort()
8.47 +
8.48 + for term, doc_positions in terms:
8.49 + if not doc_positions:
8.50 + continue
8.51 +
8.52 + if hasattr(doc_positions, "items"):
8.53 + doc_positions = doc_positions.items()
8.54 +
8.55 + docnum, positions = doc_positions[0]
8.56 +
8.57 + if not positions:
8.58 + continue
8.59 +
8.60 + # Start the writing, if appropriate.
8.61 +
8.62 + if self.data_start is None:
8.63 + self.begin(sizeof(docnum), sizeof(positions[0]))
8.64 +
8.65 + # Write each term and document positions.
8.66 +
8.67 + self.write_term(term, doc_positions)
8.68 + self.end_record()
8.69 +
8.70 + # Methods requiring an open record.
8.71 +
8.72 + def write_term(self, term, doc_positions):
8.73 +
8.74 + """
8.75 + Write the given 'term', its document frequency (number of documents in
8.76 + which it appears), and 'doc_positions' to the term information file.
8.77 + """
8.78 +
8.79 + self.write_term_only(term)
8.80 +
8.81 + # Write the document frequency and the term positions.
8.82 +
8.83 + self.write_positions(doc_positions)
8.84 +
8.85 + def write_term_plus_remaining(self, term, data):
8.86 +
8.87 + "Write the given 'term' and the document position 'data'."
8.88 +
8.89 + self.write_term_only(term)
8.90 + self.write_remaining(data)
8.91 +
8.92 + def write_term_only(self, term):
8.93 +
8.94 + "Write only the given 'term'."
8.95 +
8.96 if term <= self.last_term:
8.97 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
8.98
8.99 @@ -52,430 +110,173 @@
8.100 self.write_number(common)
8.101 self.write_string(suffix)
8.102
8.103 - # Write the offset delta.
8.104 - # Write the frequency.
8.105 + self.last_term = term
8.106 +
8.107 + def write_positions(self, doc_positions):
8.108 +
8.109 + "Write the given 'doc_positions' to the file."
8.110 +
8.111 + # Make sure that the positions are sorted.
8.112 +
8.113 + doc_positions.sort()
8.114 +
8.115 # Write the document frequency.
8.116
8.117 - self.write_numbers((
8.118 - offset - self.last_offset,
8.119 - frequency,
8.120 - doc_frequency
8.121 - ))
8.122 + self.write_number(len(doc_positions))
8.123 +
8.124 + last_docnum = None
8.125 +
8.126 + for docnum, positions in doc_positions:
8.127 +
8.128 + # Store the first document number as it is.
8.129 +
8.130 + if last_docnum is None:
8.131 + docnum_seq = docnum
8.132 +
8.133 + # Reject out-of-order documents.
8.134 +
8.135 + elif docnum < last_docnum:
8.136 + raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum)
8.137
8.138 - self.last_term = term
8.139 - self.last_offset = offset
8.140 + # Calculate an ongoing delta.
8.141 +
8.142 + else:
8.143 + docnum_seq = self.subtractor(docnum, last_docnum)
8.144 +
8.145 + # Write the document number and positions.
8.146 +
8.147 + self.write_sequence_value(docnum_seq, self.docnum_size)
8.148 + self.write_monotonic_sequence(positions, self.position_size)
8.149 +
8.150 + last_docnum = docnum
8.151 +
8.152 + # Write a terminating byte to indicate that no more document pages
8.153 + # exist.
8.154 +
8.155 + self.write_byte(0)
8.156
8.157 class TermReader(FileReader):
8.158
8.159 "Reading term information from files."
8.160
8.161 - def reset(self):
8.162 + def begin(self):
8.163 +
8.164 + "Begin reading from the file."
8.165 +
8.166 + self.begin_record()
8.167 + try:
8.168 + self.docnum_size, self.position_size = self.read_numbers(2)
8.169 + except EOFError:
8.170 + self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
8.171 +
8.172 + self.data_start = self.tell()
8.173 + self.adder = get_adder(self.docnum_size)
8.174 self.last_term = ""
8.175 - self.last_offset = 0
8.176 - self.begin_record()
8.177 +
8.178 + def get_sizes(self):
8.179 + return self.docnum_size, self.position_size
8.180 +
8.181 + # Methods requiring an open record.
8.182
8.183 def read_term(self):
8.184
8.185 + "Read a term and its document positions from the term information file."
8.186 +
8.187 + # Read the term.
8.188 +
8.189 + self.read_term_only()
8.190 +
8.191 + # Read the document frequency and the term positions.
8.192 +
8.193 + positions = self.read_positions()
8.194 +
8.195 + return self.last_term, positions
8.196 +
8.197 + def read_term_plus_remaining(self):
8.198 +
8.199 """
8.200 - Read a term, its position file offset, its frequency and its document
8.201 - frequency from the term information file.
8.202 + Read a term and the unprocessed document position data.
8.203 """
8.204
8.205 + self.read_term_only()
8.206 + return self.last_term, self.read_remaining()
8.207 +
8.208 + def read_term_only(self):
8.209 +
8.210 + "Read a term only."
8.211 +
8.212 # Read the prefix length and term suffix.
8.213
8.214 common = self.read_number()
8.215 suffix = self.read_string()
8.216
8.217 self.last_term = self.last_term[:common] + suffix
8.218 -
8.219 - # Read the offset delta.
8.220 -
8.221 - self.last_offset += self.read_number()
8.222 -
8.223 - # Read the frequency.
8.224 -
8.225 - frequency = self.read_number()
8.226 -
8.227 - # Read the document frequency.
8.228 -
8.229 - doc_frequency = self.read_number()
8.230 + return self.last_term
8.231
8.232 - return self.last_term, self.last_offset, frequency, doc_frequency
8.233 -
8.234 - def go_to_term(self, term, offset, info_offset):
8.235 -
8.236 - """
8.237 - Seek past the entry for 'term' having 'offset' to 'info_offset'. This
8.238 - permits the scanning for later terms from the specified term.
8.239 - """
8.240 -
8.241 - self.seek(info_offset)
8.242 - self.last_term = term
8.243 - self.last_offset = offset
8.244 -
8.245 -class TermIndexWriter(TermWriter):
8.246 + def read_positions(self):
8.247
8.248 - "Writing term dictionary index details to files."
8.249 -
8.250 - def reset(self):
8.251 - TermWriter.reset(self)
8.252 - self.last_info_offset = 0
8.253 -
8.254 - def write_term(self, term, offset, frequency, doc_frequency, info_offset):
8.255 -
8.256 - """
8.257 - Write the given 'term', its position file 'offset', its 'frequency' and
8.258 - its 'doc_frequency' to the term dictionary index file, along with the
8.259 - 'info_offset' in the term information file.
8.260 - """
8.261 + "Read document positions from the term information file."
8.262
8.263 - TermWriter.write_term(self, term, offset, frequency, doc_frequency)
8.264 -
8.265 - # Write the information file offset delta.
8.266 -
8.267 - self.write_number(info_offset - self.last_info_offset)
8.268 -
8.269 - self.last_info_offset = info_offset
8.270 + doc_positions = []
8.271
8.272 -class TermIndexReader(TermReader):
8.273 -
8.274 - "Reading term dictionary index details from files."
8.275 -
8.276 - def reset(self):
8.277 - TermReader.reset(self)
8.278 - self.last_info_offset = 0
8.279 + while 1:
8.280
8.281 - def read_term(self):
8.282 -
8.283 - """
8.284 - Read a term, its position file offset, its frequency, its document
8.285 - frequency and a term information file offset from the term dictionary
8.286 - index file.
8.287 - """
8.288 -
8.289 - term, offset, frequency, doc_frequency = TermReader.read_term(self)
8.290 -
8.291 - # Read the offset delta.
8.292 -
8.293 - self.last_info_offset += self.read_number()
8.294 + # Read the document frequency.
8.295
8.296 - return term, offset, frequency, doc_frequency, self.last_info_offset
8.297 -
8.298 -class TermDictionaryWriter:
8.299 -
8.300 - "Writing term dictionaries."
8.301 -
8.302 - def __init__(self, info_writer, index_writer, position_dict_writer, interval):
8.303 - self.info_writer = info_writer
8.304 - self.index_writer = index_writer
8.305 - self.position_dict_writer = position_dict_writer
8.306 - self.interval = interval
8.307 - self.entry = 0
8.308 -
8.309 - self.index_writer.reset()
8.310 + npositions = self.read_number()
8.311
8.312 - def _write_term(self, term, offset, frequency, doc_frequency):
8.313 -
8.314 - """
8.315 - Write the given 'term', its position file 'offset', its 'frequency' and
8.316 - its 'doc_frequency' (number of documents in which it appears) to the
8.317 - term information file. Return the offset before the term information was
8.318 - written to the file.
8.319 - """
8.320 -
8.321 - if self.entry % self.interval == 0:
8.322 - self.info_writer.reset()
8.323 - info_offset = self.info_writer.tell()
8.324 - self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
8.325 + last_docnum = None
8.326 + i = 0
8.327 + while i < npositions:
8.328
8.329 - self.info_writer.write_term(term, offset, frequency, doc_frequency)
8.330 - self.entry += 1
8.331 -
8.332 - def write_term_positions(self, term, doc_positions):
8.333 -
8.334 - """
8.335 - Write the given 'term' and the 'doc_positions' recording the documents
8.336 - and positions at which the term is found.
8.337 - """
8.338 -
8.339 - offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
8.340 -
8.341 - if not frequency or not doc_frequency:
8.342 - raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions)
8.343 -
8.344 - self._write_term(term, offset, frequency, doc_frequency)
8.345 + # Read the document number.
8.346
8.347 - def close(self):
8.348 - self.info_writer.close()
8.349 - self.index_writer.close()
8.350 - self.position_dict_writer.close()
8.351 -
8.352 -class TermDictionaryReader:
8.353 -
8.354 - "Reading term dictionaries."
8.355 + docnum = self.read_sequence_value(self.docnum_size)
8.356 + if last_docnum is not None:
8.357 + docnum = self.adder(docnum, last_docnum)
8.358
8.359 - def __init__(self, info_reader, index_reader, position_dict_reader):
8.360 - self.info_reader = info_reader
8.361 - self.index_reader = index_reader
8.362 - self.position_dict_reader = position_dict_reader
8.363 -
8.364 - self.info_reader.reset()
8.365 - self.index_reader.reset()
8.366 -
8.367 - self.entry = 0
8.368 - self.terms = []
8.369 - try:
8.370 - while 1:
8.371 - self.terms.append(self.index_reader.read_term())
8.372 - except EOFError:
8.373 - pass
8.374 -
8.375 - # Large numbers for ordering purposes.
8.376 + # Read the positions.
8.377
8.378 - if self.terms:
8.379 - self.max_offset = self.terms[-1][1] + 1
8.380 - else:
8.381 - self.max_offset = None
8.382 -
8.383 - def _find_closest_entry(self, term):
8.384 -
8.385 - """
8.386 - Find the offsets and frequencies of 'term' from the term dictionary or
8.387 - the closest term starting with the value of 'term'.
8.388 -
8.389 - Return the closest index entry consisting of a term, the position file
8.390 - offset, the term frequency, the document frequency, and the term details
8.391 - file offset.
8.392 - """
8.393 + positions = self.read_monotonic_sequence(self.position_size)
8.394 + doc_positions.append((docnum, positions))
8.395
8.396 - i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
8.397 -
8.398 - # Get the entry position providing the term or one preceding it.
8.399 - # If no entry precedes the requested term, return the very first entry
8.400 - # as the closest.
8.401 -
8.402 - if i == -1:
8.403 - self.entry = 0
8.404 - return self.terms[0]
8.405 - else:
8.406 - self.entry = i
8.407 - return self.terms[i]
8.408 -
8.409 - def _find_closest_term(self, term):
8.410 -
8.411 - """
8.412 - Find the offsets and frequencies of 'term' from the term dictionary or
8.413 - the closest term starting with the value of 'term'.
8.414 + last_docnum = docnum
8.415 + i += 1
8.416
8.417 - Return the closest term (or the term itself), the position file offset,
8.418 - the term frequency, the document frequency, and the term details file
8.419 - offset (or None if the reader is already positioned).
8.420 - """
8.421 -
8.422 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
8.423 -
8.424 - # Where the term is found immediately, return the offset and
8.425 - # frequencies. If the term does not appear, return the details of the
8.426 - # closest entry.
8.427 -
8.428 - if term <= found_term:
8.429 - return found_term, offset, frequency, doc_frequency, info_offset
8.430 + # Read a terminating byte to discover whether more document pages
8.431 + # exist.
8.432
8.433 - # Otherwise, seek past the index term's entry in the information file
8.434 - # and scan for the desired term.
8.435 -
8.436 - else:
8.437 - # Reset the term and offset for the new page.
8.438 - self.info_reader.go_to_term("", 0, info_offset)
8.439 - try:
8.440 - while term > found_term:
8.441 - found_term, offset, frequency, doc_frequency = self._read_term()
8.442 - except EOFError:
8.443 - pass
8.444 -
8.445 - return found_term, offset, frequency, doc_frequency, None
8.446 -
8.447 - def _find_term(self, term):
8.448 + if not self.read_byte():
8.449 + break
8.450
8.451 - """
8.452 - Find the position file offset and frequency of 'term' from the term
8.453 - dictionary.
8.454 - """
8.455 -
8.456 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
8.457 -
8.458 - # If the term is found, return the offset and frequencies.
8.459 -
8.460 - if term == found_term:
8.461 - return offset, frequency, doc_frequency
8.462 - else:
8.463 - return None
8.464 -
8.465 - def _get_term_and_positions(self, term, offset, frequency, doc_frequency):
8.466 + return doc_positions
8.467
8.468 - """
8.469 - Return the term plus positions details using the given 'term', 'offset',
8.470 - 'frequency' and 'doc_frequency'.
8.471 - """
8.472 -
8.473 - return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency)
8.474 -
8.475 - def _get_positions(self, offset, doc_frequency):
8.476 +class TermIterator(TermReader):
8.477
8.478 - """
8.479 - Obtain positions from the position index 'offset' expecting a number of
8.480 - documents equal to the given 'doc_frequency'.
8.481 - """
8.482 -
8.483 - return self.position_dict_reader.read_term_positions(offset, doc_frequency)
8.484 -
8.485 - # Iterator convenience methods.
8.486 + "An iterator over terms and positions read from a file."
8.487
8.488 def __iter__(self):
8.489 - self.rewind()
8.490 return self
8.491
8.492 def next(self):
8.493 try:
8.494 + self.begin_record()
8.495 return self.read_term()
8.496 except EOFError:
8.497 raise StopIteration
8.498
8.499 - # Sequential access methods.
8.500 -
8.501 - def rewind(self):
8.502 - self.entry = 0
8.503 - self.info_reader.rewind()
8.504 -
8.505 - def read_term(self):
8.506 -
8.507 - """
8.508 - Return the next term, its frequency, its document frequency, and the
8.509 - documents and positions at which the term is found.
8.510 - """
8.511 -
8.512 - return self._get_term_and_positions(*self._read_term())
8.513 -
8.514 - def _read_term(self):
8.515 -
8.516 - try:
8.517 - term, offset, frequency, doc_frequency = self.info_reader.read_term()
8.518 - except EOFError:
8.519 - self.entry += 1
8.520 - try:
8.521 - term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry]
8.522 - except IndexError:
8.523 - raise EOFError
8.524 - else:
8.525 - # Reset the term and offset for the new page.
8.526 -
8.527 - self.info_reader.go_to_term("", 0, info_offset)
8.528 -
8.529 - # Skip the term in the information file.
8.530 -
8.531 - self.info_reader.read_term()
8.532 +class TermDataIterator(TermReader):
8.533
8.534 - return term, offset, frequency, doc_frequency
8.535 -
8.536 - def go_to_term(self, term):
8.537 -
8.538 - """
8.539 - Navigate to 'term' in the dictionary, returning the details from its
8.540 - entry. The returned details can be augmented with position information
8.541 - when presented to the _get_term_and_positions method.
8.542 - """
8.543 -
8.544 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
8.545 -
8.546 - # Position the reader, if necessary.
8.547 -
8.548 - if info_offset is not None:
8.549 + "An iterator over terms and unprocessed document positions data."
8.550
8.551 - # Reset the term and offset for the new page.
8.552 -
8.553 - self.info_reader.go_to_term("", 0, info_offset)
8.554 -
8.555 - # Skip the term in the information file.
8.556 -
8.557 - self.info_reader.read_term()
8.558 -
8.559 - return found_term, offset, frequency, doc_frequency
8.560 -
8.561 - # Query methods.
8.562 -
8.563 - def get_terms(self):
8.564 -
8.565 - "Return a list of all terms."
8.566 -
8.567 - return iter(self)
8.568 + def __iter__(self):
8.569 + return self
8.570
8.571 - def find_terms(self, term):
8.572 -
8.573 - "Return all terms whose values start with the value of 'term'."
8.574 -
8.575 - terms = []
8.576 -
8.577 - found_term, offset, frequency, doc_frequency = self.go_to_term(term)
8.578 -
8.579 - # Read and record terms.
8.580 -
8.581 + def next(self):
8.582 try:
8.583 - # Add the found term if it starts with the specified term.
8.584 -
8.585 - while found_term.startswith(term):
8.586 - terms.append(found_term)
8.587 - found_term, offset, frequency, doc_frequency = self._read_term()
8.588 -
8.589 + self.begin_record()
8.590 + return self.read_term_plus_remaining()
8.591 except EOFError:
8.592 - pass
8.593 -
8.594 - return terms
8.595 -
8.596 - def find_positions(self, term):
8.597 -
8.598 - "Return the documents and positions at which the given 'term' is found."
8.599 -
8.600 - t = self._find_term(term)
8.601 - if t is None:
8.602 - return []
8.603 - else:
8.604 - offset, frequency, doc_frequency = t
8.605 - return self._get_positions(offset, doc_frequency)
8.606 -
8.607 - def find_common_positions(self, terms):
8.608 -
8.609 - """
8.610 - Return the documents and positions at which all the given 'terms' are
8.611 - found, where only common documents are returned.
8.612 - """
8.613 -
8.614 - return PhraseIterator([self.find_positions(term) for term in terms])
8.615 -
8.616 - def get_frequency(self, term):
8.617 -
8.618 - "Return the frequency of the given 'term'."
8.619 -
8.620 - t = self._find_term(term)
8.621 - if t is None:
8.622 - return None
8.623 - else:
8.624 - offset, frequency, doc_frequency = t
8.625 - return frequency
8.626 -
8.627 - def get_document_frequency(self, term):
8.628 -
8.629 - "Return the document frequency of the given 'term'."
8.630 -
8.631 - t = self._find_term(term)
8.632 - if t is None:
8.633 - return None
8.634 - else:
8.635 - offset, frequency, doc_frequency = t
8.636 - return doc_frequency
8.637 -
8.638 - def close(self):
8.639 - self.info_reader.close()
8.640 - self.index_reader.close()
8.641 - self.position_dict_reader.close()
8.642 + raise StopIteration
8.643
8.644 # vim: tabstop=4 expandtab shiftwidth=4
9.1 --- a/itermerge.py Sat Feb 12 01:23:58 2011 +0100
9.2 +++ b/itermerge.py Sun Feb 13 02:49:55 2011 +0100
9.3 @@ -3,7 +3,7 @@
9.4 """
9.5 An iterator merging class similar to heapq.merge in Python 2.6.
9.6
9.7 -Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
9.8 +Copyright (C) 2009, 2011 Paul Boddie <paul@boddie.org.uk>
9.9
9.10 This program is free software; you can redistribute it and/or modify it under
9.11 the terms of the GNU General Public License as published by the Free Software
10.1 --- a/test.py Sat Feb 12 01:23:58 2011 +0100
10.2 +++ b/test.py Sun Feb 13 02:49:55 2011 +0100
10.3 @@ -1,22 +1,21 @@
10.4 #!/usr/bin/env python
10.5 +# encoding: iso-8859-1
10.6
10.7 from iixr.files import *
10.8 -from iixr.fields import *
10.9 from iixr.terms import *
10.10 -from iixr.positions import *
10.11 from iixr.index import *
10.12 import os, sys
10.13
10.14 # Remove old test files.
10.15
10.16 -for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"):
10.17 +for filename in ("test", "testMS", "testNMS", "testP", "testP2"):
10.18 try:
10.19 os.remove(filename)
10.20 except OSError:
10.21 pass
10.22
10.23 try:
10.24 - for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"):
10.25 + for dirname in ("test_index",):
10.26 for filename in os.listdir(dirname):
10.27 os.remove(os.path.join(dirname, filename))
10.28 os.rmdir(dirname)
10.29 @@ -98,22 +97,20 @@
10.30 ]
10.31
10.32 f = open("testP", "wb")
10.33 -w = PositionWriter(f)
10.34 +w = TermWriter(f)
10.35 w.begin(0, 0)
10.36 for doc_positions in all_doc_positions:
10.37 - w.reset()
10.38 - for docnum, positions in doc_positions:
10.39 - w.write_positions(docnum, positions)
10.40 + w.write_positions(doc_positions)
10.41 + w.end_record()
10.42 w.close()
10.43
10.44 f = open("testP", "rb")
10.45 -r = PositionReader(f)
10.46 +r = TermReader(f)
10.47 for doc_positions in all_doc_positions:
10.48 - r.reset()
10.49 - for docnum, positions in doc_positions:
10.50 - d, p = r.read_positions()
10.51 - print docnum == d, docnum, d
10.52 - print positions == p, positions, p
10.53 + r.begin_record()
10.54 + dp = r.read_positions()
10.55 + print doc_positions == dp, doc_positions
10.56 + print " ", dp
10.57 r.close()
10.58
10.59 all_doc_positions_seq = [
10.60 @@ -131,350 +128,56 @@
10.61 ]
10.62
10.63 f = open("testP2", "wb")
10.64 -w = PositionWriter(f)
10.65 +w = TermWriter(f)
10.66 w.begin(2, 2)
10.67 for doc_positions in all_doc_positions_seq:
10.68 - w.reset()
10.69 - for docnum, positions in doc_positions:
10.70 - w.write_positions(docnum, positions)
10.71 + w.write_positions(doc_positions)
10.72 + w.end_record()
10.73 w.close()
10.74
10.75 f = open("testP2", "rb")
10.76 -r = PositionReader(f)
10.77 +r = TermReader(f)
10.78 for doc_positions in all_doc_positions_seq:
10.79 - r.reset()
10.80 - for docnum, positions in doc_positions:
10.81 - d, p = r.read_positions()
10.82 - print docnum == d, docnum, d
10.83 - print positions == p, positions, p
10.84 -r.close()
10.85 -
10.86 -print "- Test position index files."
10.87 -
10.88 -indexed_positions = [
10.89 - [
10.90 - (1234, 0, 100),
10.91 - (2345, 700, 100),
10.92 - (3456, 1900, 50)
10.93 - ],
10.94 - [
10.95 - (4567, 2800, 20)
10.96 - ]
10.97 - ]
10.98 -
10.99 -offsets = []
10.100 -f = open("testPI", "wb")
10.101 -w = PositionIndexWriter(f)
10.102 -w.begin(0)
10.103 -for term_positions in indexed_positions:
10.104 - offset = None
10.105 - doc_frequency = 0
10.106 - w.reset()
10.107 - for docnum, pos_offset, count in term_positions:
10.108 - if offset is None:
10.109 - offset = w.tell()
10.110 - w.write_positions(docnum, pos_offset, count)
10.111 - doc_frequency += count
10.112 - offsets.append((offset, doc_frequency))
10.113 -w.close()
10.114 -
10.115 -r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb")))
10.116 -offsets.reverse()
10.117 -indexed_positions.reverse()
10.118 -for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
10.119 - r.seek(offset, doc_frequency)
10.120 - for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r):
10.121 - print docnum == dn, docnum, dn
10.122 - print pos_offset == po, pos_offset, po
10.123 - print count == c, count, c
10.124 -r.reader.close()
10.125 -
10.126 -print "- Test position dictionaries."
10.127 -
10.128 -f = open("testP", "wb")
10.129 -w = PositionWriter(f)
10.130 -f2 = open("testPI", "wb")
10.131 -w2 = PositionIndexWriter(f2)
10.132 -wd = PositionDictionaryWriter(w, w2, 2)
10.133 -offsets = []
10.134 -for doc_positions in all_doc_positions:
10.135 - offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
10.136 - offsets.append((offset, doc_frequency))
10.137 -wd.close()
10.138 -
10.139 -r = PositionReader(open("testP", "rb"))
10.140 -r2 = PositionIndexReader(open("testPI", "rb"))
10.141 -rd = PositionDictionaryReader(r, r2)
10.142 -offsets.reverse()
10.143 -all_doc_positions.reverse()
10.144 -for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
10.145 - it = rd.read_term_positions(offset, doc_frequency)
10.146 - dp = list(it)
10.147 - print doc_positions == dp, doc_positions, dp
10.148 -rd.close()
10.149 -
10.150 -print "- Test fields."
10.151 -
10.152 -doc_fields = [
10.153 - (123, ["testing", "fields", "stored", "compressed"]),
10.154 - (456, ["fields", "for a second", "document"]),
10.155 - (789, ["field value"]),
10.156 - (1234, []),
10.157 - (2345, ["abc", "def"]),
10.158 - (3456, ["apple", "banana", "cherry"]),
10.159 - (4567, ["drue", "eple"])
10.160 - ]
10.161 -
10.162 -f = open("testF", "wb")
10.163 -w = FieldWriter(f)
10.164 -w.begin(0)
10.165 -w.reset()
10.166 -for docnum, fields in doc_fields:
10.167 - w.write_fields(docnum, list(enumerate(fields)))
10.168 -w.close()
10.169 -
10.170 -f = open("testF", "rb")
10.171 -r = FieldReader(f)
10.172 -r.reset()
10.173 -for docnum, fields in doc_fields:
10.174 - dn, df = r.read_fields()
10.175 - print docnum == dn, docnum, dn
10.176 - print list(enumerate(fields)) == df, list(enumerate(fields)), df
10.177 -r.close()
10.178 -
10.179 -print "- Test field index files."
10.180 -
10.181 -indexed_docs = [
10.182 - (123, 100000987),
10.183 - (456, 100004321),
10.184 - (789, 100008765)
10.185 - ]
10.186 -
10.187 -f = open("testFI", "wb")
10.188 -w = FieldIndexWriter(f)
10.189 -w.begin(0)
10.190 -w.reset()
10.191 -for docnum, offset in indexed_docs:
10.192 - w.write_document(docnum, offset)
10.193 -w.close()
10.194 -
10.195 -f = open("testFI", "rb")
10.196 -r = FieldIndexReader(f)
10.197 -r.reset()
10.198 -for docnum, offset in indexed_docs:
10.199 - dn, o = r.read_document()
10.200 - print docnum == dn, docnum, dn
10.201 - print offset == o, offset, o
10.202 + r.begin_record()
10.203 + dp = r.read_positions()
10.204 + print doc_positions == dp, doc_positions
10.205 + print " ", dp
10.206 r.close()
10.207
10.208 -print "- Test field dictionaries."
10.209 -
10.210 -f = open("testF", "wb")
10.211 -w = FieldWriter(f)
10.212 -f2 = open("testFI", "wb")
10.213 -w2 = FieldIndexWriter(f2)
10.214 -wd = FieldDictionaryWriter(w, w2, 3)
10.215 -for docnum, fields in doc_fields:
10.216 - wd.write_fields(docnum, list(enumerate(fields)))
10.217 -wd.close()
10.218 -
10.219 -f = open("testF", "rb")
10.220 -r = FieldReader(f)
10.221 -f2 = open("testFI", "rb")
10.222 -r2 = FieldIndexReader(f2)
10.223 -rd = FieldDictionaryReader(r, r2)
10.224 -doc_fields_reversed = doc_fields[:]
10.225 -doc_fields_reversed.reverse()
10.226 -for docnum, fields in doc_fields_reversed:
10.227 - df = dict(rd.get_fields(docnum))
10.228 - print dict(enumerate(fields)) == df, dict(enumerate(fields)), df
10.229 -for docnum in (13579, 246810):
10.230 - df = rd.get_fields(docnum)
10.231 - print df is None, df
10.232 -
10.233 -print "- (Test sequential access.)"
10.234 -
10.235 -rd.rewind()
10.236 -for docnum, fields in doc_fields:
10.237 - dn, df = rd.read_fields()
10.238 - print docnum == dn, docnum, dn
10.239 - print list(enumerate(fields)) == df, list(enumerate(fields)), df
10.240 -rd.close()
10.241 -
10.242 -print "- Test terms."
10.243 -
10.244 -terms = [
10.245 - # term offset frequency doc_frequency
10.246 - ("aardvark", 100000123, 1, 1),
10.247 - ("anteater", 100000456, 2, 1),
10.248 - ("badger", 100000789, 13, 7),
10.249 - ("bull", 1000001234, 59, 17),
10.250 - ("bulldog", 1000002345, 99, 80),
10.251 - ("cat", 1000003456, 89, 28)
10.252 - ]
10.253 -
10.254 -f = open("test", "wb")
10.255 -w = TermWriter(f)
10.256 -w.reset()
10.257 -for term, offset, frequency, doc_frequency in terms:
10.258 - w.write_term(term, offset, frequency, doc_frequency)
10.259 -w.close()
10.260 -
10.261 -f = open("test", "rb")
10.262 -r = TermReader(f)
10.263 -r.reset()
10.264 -for term, offset, frequency, doc_frequency in terms:
10.265 - t, o, fr, df = r.read_term()
10.266 - print term == t, term, t
10.267 - print offset == o, offset, o
10.268 - print frequency == fr, frequency, fr
10.269 - print doc_frequency == df, doc_frequency, df
10.270 -r.close()
10.271 -
10.272 -print "- Test terms in index files."
10.273 -
10.274 -indexed_terms = [
10.275 - # term offset frequency doc_frequency info_offset
10.276 - ("aardvark", 100000123, 1, 1, 200000321),
10.277 - ("anteater", 100000456, 2, 1, 200000654),
10.278 - ("badger", 100000789, 13, 7, 200000987),
10.279 - ("bull", 1000001234, 59, 17, 200004321),
10.280 - ("bulldog", 1000002345, 99, 80, 200005432),
10.281 - ("cat", 1000003456, 89, 28, 200006543)
10.282 - ]
10.283 -
10.284 -f = open("test", "wb")
10.285 -w = TermIndexWriter(f)
10.286 -w.reset()
10.287 -for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
10.288 - w.write_term(term, offset, frequency, doc_frequency, info_offset)
10.289 -w.close()
10.290 -
10.291 -f = open("test", "rb")
10.292 -r = TermIndexReader(f)
10.293 -r.reset()
10.294 -for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
10.295 - t, o, fr, df, i = r.read_term()
10.296 - print term == t, term, t
10.297 - print offset == o, offset, o
10.298 - print frequency == fr, frequency, fr
10.299 - print doc_frequency == df, doc_frequency, df
10.300 - print info_offset == i, info_offset, i
10.301 -r.close()
10.302 -
10.303 -print "- Test dictionaries with only term data."
10.304 -
10.305 -f = open("test", "wb")
10.306 -w = TermWriter(f)
10.307 -f2 = open("testI", "wb")
10.308 -w2 = TermIndexWriter(f2)
10.309 -f3 = open("testP", "wb")
10.310 -w3 = PositionWriter(f3)
10.311 -f4 = open("testPI", "wb")
10.312 -w4 = PositionIndexWriter(f4)
10.313 -wp = PositionDictionaryWriter(w3, w4, 2)
10.314 -wd = TermDictionaryWriter(w, w2, wp, 3)
10.315 -for term, offset, frequency, doc_frequency in terms:
10.316 - wd._write_term(term, offset, frequency, doc_frequency)
10.317 -wd.close()
10.318 -
10.319 -f = open("test", "rb")
10.320 -r = TermReader(f)
10.321 -f2 = open("testI", "rb")
10.322 -r2 = TermIndexReader(f2)
10.323 -r3 = PositionReader(open("testP", "rb"))
10.324 -r4 = PositionIndexReader(open("testPI", "rb"))
10.325 -rp = PositionDictionaryReader(r3, r4)
10.326 -rd = TermDictionaryReader(r, r2, rp)
10.327 -terms_reversed = terms[:]
10.328 -terms_reversed.reverse()
10.329 -for term, offset, frequency, doc_frequency in terms_reversed:
10.330 - o, fr, df = rd._find_term(term)
10.331 - print offset == o, offset, o
10.332 - print frequency == fr, frequency, fr
10.333 - print doc_frequency == df, doc_frequency, df
10.334 -for term in ("dog", "dingo"):
10.335 - t = rd._find_term(term)
10.336 - print t is None, t
10.337 -
10.338 -print "- (Test term prefix searching.)"
10.339 -
10.340 -print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]
10.341 -print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]
10.342 -print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]
10.343 -print rd.find_terms("d") == [], rd.find_terms("d"), []
10.344 -rd.close()
10.345 -
10.346 print "- Test dictionaries with term and position data."
10.347
10.348 terms_with_positions = [
10.349 ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
10.350 ("anteater", [(1, [43, 44])]),
10.351 ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
10.352 + (u"bjørn", [(11, [19, 54])]),
10.353 ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
10.354 ("bulldog", [(43, [17, 19, 256, 512])]),
10.355 - ("cat", [(123, [12, 145, 196]), (1200, [113])])
10.356 - ]
10.357 -
10.358 -position_dict_tests = [
10.359 - ("badger", 19, [55, 1333]),
10.360 - ("badger", 20, None),
10.361 - ("bull", 6, [128]),
10.362 - ("bull", 26, [1, 3, 5, 7, 9]),
10.363 - ("cat", 111, None),
10.364 - ("cat", 123, [12, 145, 196]),
10.365 - ("cat", 1234, None)
10.366 + ("cat", [(123, [12, 145, 196]), (1200, [113])]),
10.367 + (u"å", [(15, [384])]),
10.368 ]
10.369
10.370 f = open("test", "wb")
10.371 w = TermWriter(f)
10.372 -f2 = open("testI", "wb")
10.373 -w2 = TermIndexWriter(f2)
10.374 -f3 = open("testP", "wb")
10.375 -w3 = PositionWriter(f3)
10.376 -f4 = open("testPI", "wb")
10.377 -w4 = PositionIndexWriter(f4)
10.378 -wp = PositionDictionaryWriter(w3, w4, 2)
10.379 -wd = TermDictionaryWriter(w, w2, wp, 3)
10.380 -for term, doc_positions in terms_with_positions:
10.381 - wd.write_term_positions(term, doc_positions)
10.382 -wd.close()
10.383 +w.begin(0, 0)
10.384 +w.write_terms(terms_with_positions)
10.385 +w.close()
10.386
10.387 f = open("test", "rb")
10.388 -r = TermReader(f)
10.389 -f2 = open("testI", "rb")
10.390 -r2 = TermIndexReader(f2)
10.391 -r3 = PositionReader(open("testP", "rb"))
10.392 -r4 = PositionIndexReader(open("testPI", "rb"))
10.393 -rp = PositionDictionaryReader(r3, r4)
10.394 -rd = TermDictionaryReader(r, r2, rp)
10.395 -terms_reversed = terms_with_positions[:]
10.396 -terms_reversed.reverse()
10.397 -for term, doc_positions in terms_reversed:
10.398 - dp = list(rd.find_positions(term))
10.399 - print doc_positions == dp, doc_positions, dp
10.400 -for term in ("aaa", "dog", "dingo"):
10.401 - dp = rd.find_positions(term)
10.402 - print dp == [], dp
10.403 +r = TermIterator(f)
10.404 +for (term, doc_positions), (t, dp) in zip(terms_with_positions, r):
10.405 + print term == t, term, t
10.406 + print doc_positions == dp, doc_positions
10.407 + print " ", dp
10.408 +r.close()
10.409
10.410 -print "- (Test iterators.)"
10.411 -
10.412 -for term, docnum, positions in position_dict_tests:
10.413 - dp = rd.find_positions(term)
10.414 - pos = dp.from_document(docnum)
10.415 - print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
10.416 -
10.417 -print "- (Test sequential access.)"
10.418 +f = open("test", "rb")
10.419 +r = TermDataIterator(f)
10.420 +for (term, doc_positions), (t, data) in zip(terms_with_positions, r):
10.421 + print term == t, term, t, data
10.422 +r.close()
10.423
10.424 -rd.rewind()
10.425 -for term, doc_positions in terms_with_positions:
10.426 - t, fr, df, dp = rd.read_term()
10.427 - dp = list(dp)
10.428 - print term == t, term, t
10.429 - print doc_positions == dp, doc_positions, dp
10.430 -rd.close()
10.431 -
10.432 -print "- Test high-level index operations (including merging)."
10.433 +print "- Test high-level index operations."
10.434
10.435 docs = [
10.436 (1, "The cat sat on the mat"),
10.437 @@ -485,189 +188,26 @@
10.438 (36, "She sells sea shells on the sea shore")
10.439 ]
10.440
10.441 -doc_tests = [
10.442 - ("Every", 2, [(2, [0]), (14, [0])]),
10.443 - ("good", 2, [(2, [1]), (13, [1])]),
10.444 - ("deserves", 2, [(2, [3]), (13, [3])]),
10.445 - ("sea", 2, [(36, [2, 6])])
10.446 - ]
10.447 -
10.448 -position_tests = [
10.449 - ("Every", 14, [0]),
10.450 - ("sea", 36, [2, 6]),
10.451 - ("shells", 1, None),
10.452 - ("shells", 37, None)
10.453 - ]
10.454 -
10.455 -phrase_tests = [
10.456 - (["good", "boy"], [(2, [1, 2])]),
10.457 - (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
10.458 - (["sea", "shore"], [(36, [6, 7])])
10.459 - ]
10.460 -
10.461 -index = Index("test_index", 3, 2, 3, 6)
10.462 +index = Index("test_index", 3)
10.463 wi = index.get_writer()
10.464 for docnum, text in docs:
10.465 doc = Document(docnum)
10.466 for position, term in enumerate(text.split()):
10.467 doc.add_position(term, position)
10.468 - doc.add_field(123, text)
10.469 - wi.add_document(doc)
10.470 -wi.close()
10.471 -
10.472 -rd = index.get_reader()
10.473 -
10.474 -print "- (Test searching.)"
10.475 -
10.476 -for term, frequency, doc_positions in doc_tests:
10.477 - dp = list(rd.find_positions(term))
10.478 - print doc_positions == dp, doc_positions, dp
10.479 - fr = rd.get_frequency(term)
10.480 - print frequency == fr, frequency, fr
10.481 -
10.482 -print "- (Test fields.)"
10.483 -
10.484 -for docnum, text in docs:
10.485 - df = dict(rd.get_fields(docnum))
10.486 - print df[123] == text, text, df[123]
10.487 -
10.488 -print "- (Test navigation.)"
10.489 -
10.490 -for term, docnum, positions in position_tests:
10.491 - dp = rd.find_positions(term)
10.492 - pos = dp.from_document(docnum)
10.493 - print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
10.494 -
10.495 -print "- (Test phrases.)"
10.496 -
10.497 -for terms, results in phrase_tests:
10.498 - res = list(rd.find_common_positions(terms))
10.499 - print results == res, results, res
10.500 -
10.501 -index.close()
10.502 -
10.503 -docs2 = [
10.504 - ((1, 0), "The cat sat on the mat"),
10.505 - ((1, 2), "Every good boy deserves football"),
10.506 - ((13, 1), "One good turn deserves another"),
10.507 - ((14, 0), "Every man for himself"),
10.508 - ((14, 25), "Red sky at night shepherd's delight"),
10.509 - ((36, 12), "She sells sea shells on the sea shore")
10.510 - ]
10.511 -
10.512 -doc_tests2 = [
10.513 - ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),
10.514 - ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),
10.515 - ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),
10.516 - ("sea", 2, [((36, 12), [(2, 10), (6, 28)])])
10.517 - ]
10.518 -
10.519 -position_tests2 = [
10.520 - ("Every", (14, 0), [(0, 0)]),
10.521 - ("sea", (36, 12), [(2, 10), (6, 28)]),
10.522 - ("shells", (1, 0), None),
10.523 - ("shells", (37, 0), None)
10.524 - ]
10.525 -
10.526 -phrase_tests2 = [
10.527 - (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),
10.528 - (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),
10.529 - (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])
10.530 - ]
10.531 -
10.532 -index = Index("test_indexT", 3, 2, 3, 6)
10.533 -wi = index.get_writer()
10.534 -for docnum, text in docs2:
10.535 - doc = Document(docnum)
10.536 - offset = 0
10.537 - for position, term in enumerate(text.split()):
10.538 - doc.add_position(term, (position, offset))
10.539 - offset += len(term) + 1 # assume one space after the term
10.540 - doc.add_field(123, text)
10.541 wi.add_document(doc)
10.542 wi.close()
10.543
10.544 -rd = index.get_reader()
10.545 -
10.546 -print "- (Test searching.)"
10.547 -
10.548 -for term, frequency, doc_positions in doc_tests2:
10.549 - dp = list(rd.find_positions(term))
10.550 - print doc_positions == dp, doc_positions, dp
10.551 - fr = rd.get_frequency(term)
10.552 - print frequency == fr, frequency, fr
10.553 -
10.554 -print "- (Test fields.)"
10.555 +print "- Test merge."
10.556
10.557 -for docnum, text in docs2:
10.558 - df = dict(rd.get_fields(docnum))
10.559 - print df[123] == text, text, df[123]
10.560 -
10.561 -print "- (Test navigation.)"
10.562 +l1 = list(index.get_reader())
10.563 +index.merge()
10.564 +l2 = list(index.get_reader(1))
10.565
10.566 -for term, docnum, positions in position_tests2:
10.567 - dp = rd.find_positions(term)
10.568 - pos = dp.from_document(docnum)
10.569 - print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
10.570 -
10.571 -print "- (Test phrases.)"
10.572 -
10.573 -for terms, results in phrase_tests2:
10.574 - res = list(rd.find_common_positions(terms))
10.575 - print results == res, results, res
10.576 +for (t1, dp1), (t2, dp2) in zip(l1, l2):
10.577 + print t1 == t2, t1, t2
10.578 + print dp1 == dp2, dp1
10.579 + print " ", dp2
10.580
10.581 index.close()
10.582
10.583 -print "- Test index updates."
10.584 -
10.585 -index = Index("test_index")
10.586 -index2 = Index("test_index2", 3, 2, 3, 6)
10.587 -wi = index2.get_writer()
10.588 -for docnum, text in docs:
10.589 -
10.590 - # Add the same documents but with different numbers.
10.591 -
10.592 - doc = Document(docnum + 100)
10.593 - for position, term in enumerate(text.split()):
10.594 - doc.add_position(term, position)
10.595 - doc.add_field(123, text)
10.596 - wi.add_document(doc)
10.597 -wi.close()
10.598 -
10.599 -index2.update([index])
10.600 -index.close()
10.601 -
10.602 -rd = index2.get_reader()
10.603 -for term, frequency, doc_positions in doc_tests:
10.604 -
10.605 - # Add the extra documents to the expected result.
10.606 -
10.607 - orig_doc_positions = doc_positions
10.608 - doc_positions = doc_positions[:]
10.609 -
10.610 - for docnum, positions in orig_doc_positions:
10.611 - doc_positions.append((docnum + 100, positions))
10.612 - frequency *= 2
10.613 -
10.614 - dp = list(rd.find_positions(term))
10.615 - print doc_positions == dp, doc_positions, dp
10.616 - fr = rd.get_frequency(term)
10.617 - print frequency == fr, frequency, fr
10.618 -index2.close()
10.619 -
10.620 -print "- (Test update of an empty index.)"
10.621 -
10.622 -index = Index("test_index")
10.623 -index3 = Index("test_index3")
10.624 -index3.update([index])
10.625 -index.close()
10.626 -
10.627 -rd = index3.get_reader()
10.628 -for term, frequency, doc_positions in doc_tests:
10.629 - dp = list(rd.find_positions(term))
10.630 - print doc_positions == dp, doc_positions, dp
10.631 - fr = rd.get_frequency(term)
10.632 - print frequency == fr, frequency, fr
10.633 -index3.close()
10.634 -
10.635 # vim: tabstop=4 expandtab shiftwidth=4