1.1 --- a/iixr/data.py Sat Feb 12 01:23:58 2011 +0100
1.2 +++ b/iixr/data.py Sun Feb 13 02:49:55 2011 +0100
1.3 @@ -23,7 +23,7 @@
1.4
1.5 # High-level representations.
1.6
1.7 -def convert_sequence(values, op):
1.8 +def convert_sequence(values, op, last_from_old):
1.9 if values:
1.10 new_values = list(values)
1.11 last = new_values[0]
1.12 @@ -31,10 +31,22 @@
1.13 length = len(new_values)
1.14 while i < length:
1.15 current = new_values[i]
1.16 - new_values[i] = op(new_values[i], last)
1.17 - last = current
1.18 + new_values[i] = op(current, last)
1.19 +
1.20 + # Subtracting entries requires the old value to be used.
1.21 + # Adding entries requires the new value.
1.22 +
1.23 + if last_from_old:
1.24 + last = current
1.25 + else:
1.26 + last = new_values[i]
1.27 +
1.28 i += 1
1.29
1.30 + return new_values
1.31 + else:
1.32 + return values
1.33 +
1.34 def op_seq_monotonic(x, y, op):
1.35 return tuple([op(a, b) for a, b in zip(x, y)])
1.36
1.37 @@ -44,15 +56,6 @@
1.38 def sub_seq_monotonic(x, y):
1.39 return op_seq_monotonic(x, y, operator.sub)
1.40
1.41 -def op_first_monotonic(x, y, op):
1.42 - return (op(x[0], y[0]),) + tuple(zip(x[1:], y[1:]))
1.43 -
1.44 -def add_first_monotonic(x, y):
1.45 - return op_first_monotonic(x, y, operator.add)
1.46 -
1.47 -def sub_first_monotonic(x, y):
1.48 - return op_first_monotonic(x, y, operator.sub)
1.49 -
1.50 def add_seq(x, y):
1.51 length = min(len(x), len(y))
1.52 seq = list(x)[:length]
1.53 @@ -84,17 +87,17 @@
1.54 def sizeof(value):
1.55 return is_sequence(value) and len(value) or 0
1.56
1.57 -def get_monotonic_adder(value):
1.58 - return is_sequence(value) and add_seq_monotonic or operator.add
1.59 +def get_monotonic_adder(size):
1.60 + return size and add_seq_monotonic or operator.add
1.61
1.62 -def get_monotonic_subtractor(value):
1.63 - return is_sequence(value) and sub_seq_monotonic or operator.sub
1.64 +def get_monotonic_subtractor(size):
1.65 + return size and sub_seq_monotonic or operator.sub
1.66
1.67 -def get_adder(value):
1.68 - return is_sequence(value) and add_seq or operator.add
1.69 +def get_adder(size):
1.70 + return size and add_seq or operator.add
1.71
1.72 -def get_subtractor(value):
1.73 - return is_sequence(value) and sub_seq or operator.sub
1.74 +def get_subtractor(size):
1.75 + return size and sub_seq or operator.sub
1.76
1.77 # Low-level representations.
1.78 # Variable-length integer functions.
1.79 @@ -177,15 +180,6 @@
1.80 break
1.81 return number, start
1.82
1.83 -# String serialisation.
1.84 -
1.85 -def string_to_array(s, bytes):
1.86 -
1.87 - "Write the given string 's' to 'bytes'."
1.88 -
1.89 - vint_to_array(len(s), bytes)
1.90 - bytes.fromstring(s.encode("utf-8"))
1.91 -
1.92 # Sequence serialisation.
1.93
1.94 def sequence_to_array(value, size, bytes):
2.1 --- a/iixr/fields.py Sat Feb 12 01:23:58 2011 +0100
2.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
2.3 @@ -1,345 +0,0 @@
2.4 -#!/usr/bin/env python
2.5 -
2.6 -"""
2.7 -Specific classes for storing document information.
2.8 -
2.9 -Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
2.10 -
2.11 -This program is free software; you can redistribute it and/or modify it under
2.12 -the terms of the GNU General Public License as published by the Free Software
2.13 -Foundation; either version 3 of the License, or (at your option) any later
2.14 -version.
2.15 -
2.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
2.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
2.18 -PARTICULAR PURPOSE. See the GNU General Public License for more details.
2.19 -
2.20 -You should have received a copy of the GNU General Public License along
2.21 -with this program. If not, see <http://www.gnu.org/licenses/>.
2.22 -"""
2.23 -
2.24 -from iixr.data import *
2.25 -from iixr.files import *
2.26 -from bisect import bisect_right # to find terms in the dictionary index
2.27 -
2.28 -DOCUMENT_CACHE_LIMIT = 10000
2.29 -
2.30 -class FieldWriter(FileWriter):
2.31 -
2.32 - "Writing field data to files."
2.33 -
2.34 - def begin(self, docnum_size):
2.35 - self.write_number(docnum_size)
2.36 - self.end_record()
2.37 - self.docnum_size = docnum_size
2.38 - self.data_start = self.tell()
2.39 -
2.40 - def reset(self):
2.41 - self.end_record()
2.42 - self.last_docnum = None
2.43 - self.subtractor = None
2.44 -
2.45 - def write_fields(self, docnum, fields):
2.46 -
2.47 - """
2.48 - Write for the given 'docnum', a list of 'fields' (integer, string pairs
2.49 - representing field identifiers and values respectively).
2.50 - """
2.51 -
2.52 - # Find the size of document number values.
2.53 -
2.54 - if self.last_docnum is not None:
2.55 - docnum_seq = self.subtractor(docnum, self.last_docnum)
2.56 - else:
2.57 - self.subtractor = get_subtractor(docnum)
2.58 - docnum_seq = docnum
2.59 -
2.60 - # Write the document number.
2.61 -
2.62 - self.write_sequence_value(docnum_seq, self.docnum_size)
2.63 -
2.64 - # Write the number of fields.
2.65 -
2.66 - self.write_number(len(fields))
2.67 -
2.68 - # Write the fields themselves.
2.69 -
2.70 - for i, field in fields:
2.71 - self.write_number(i)
2.72 - self.write_string(field, 1) # compress
2.73 -
2.74 - self.last_docnum = docnum
2.75 -
2.76 -class FieldReader(FileReader):
2.77 -
2.78 - "Reading field data from files."
2.79 -
2.80 - def begin(self):
2.81 - self.begin_record()
2.82 - try:
2.83 - self.docnum_size = self.read_number()
2.84 - except EOFError:
2.85 - self.docnum_size = 0 # NOTE: No fields!
2.86 - self.data_start = self.tell()
2.87 -
2.88 - def reset(self):
2.89 - self.last_docnum = None
2.90 - self.adder = None
2.91 - self.begin_record()
2.92 -
2.93 - def read_fields(self):
2.94 -
2.95 - """
2.96 - Read fields from the file, returning a tuple containing the document
2.97 - number and a list of field (identifier, value) pairs.
2.98 - """
2.99 -
2.100 - # Read the document number.
2.101 -
2.102 - docnum = self.read_sequence_value(self.docnum_size)
2.103 -
2.104 - if self.last_docnum is not None:
2.105 - self.last_docnum = self.adder(docnum, self.last_docnum)
2.106 - else:
2.107 - self.adder = get_adder(docnum)
2.108 - self.last_docnum = docnum
2.109 -
2.110 - # Read the number of fields.
2.111 -
2.112 - nfields = self.read_number()
2.113 -
2.114 - # Collect the fields.
2.115 -
2.116 - fields = []
2.117 - i = 0
2.118 -
2.119 - while i < nfields:
2.120 - identifier = self.read_number()
2.121 - value = self.read_string(1) # decompress
2.122 - fields.append((identifier, value))
2.123 - i += 1
2.124 -
2.125 - return self.last_docnum, fields
2.126 -
2.127 - def read_document_fields(self, docnum, offset):
2.128 -
2.129 - """
2.130 - Read fields for 'docnum' at the given 'offset'. This permits the
2.131 - retrieval of details for the specified document, as well as scanning for
2.132 - later documents.
2.133 - """
2.134 -
2.135 - self.seek(offset)
2.136 - bad_docnum, fields = self.read_fields()
2.137 - self.last_docnum = docnum
2.138 - return docnum, fields
2.139 -
2.140 -class FieldIndexWriter(FieldWriter):
2.141 -
2.142 - "Writing field index details to files."
2.143 -
2.144 - def reset(self):
2.145 - FieldWriter.reset(self)
2.146 - self.last_offset = 0
2.147 -
2.148 - def write_document(self, docnum, offset):
2.149 -
2.150 - """
2.151 - Write for the given 'docnum', the 'offset' at which the fields for the
2.152 - document are stored in the fields file.
2.153 - """
2.154 -
2.155 - # Find the size of document number values.
2.156 -
2.157 - if self.last_docnum is not None:
2.158 - docnum_seq = self.subtractor(docnum, self.last_docnum)
2.159 - else:
2.160 - self.subtractor = get_subtractor(docnum)
2.161 - docnum_seq = docnum
2.162 -
2.163 - # Write the document number.
2.164 -
2.165 - self.write_sequence_value(docnum_seq, self.docnum_size)
2.166 -
2.167 - # Write the offset delta.
2.168 -
2.169 - self.write_number(offset - self.last_offset)
2.170 -
2.171 - self.last_docnum = docnum
2.172 - self.last_offset = offset
2.173 -
2.174 -class FieldIndexReader(FieldReader):
2.175 -
2.176 - "Reading field index details from files."
2.177 -
2.178 - def reset(self):
2.179 - FieldReader.reset(self)
2.180 - self.last_offset = 0
2.181 -
2.182 - def read_document(self):
2.183 -
2.184 - "Read a document number and field file offset."
2.185 -
2.186 - # Read the document number.
2.187 -
2.188 - docnum = self.read_sequence_value(self.docnum_size)
2.189 -
2.190 - if self.last_docnum is not None:
2.191 - self.last_docnum = self.adder(docnum, self.last_docnum)
2.192 - else:
2.193 - self.adder = get_adder(docnum)
2.194 - self.last_docnum = docnum
2.195 -
2.196 - # Read the offset.
2.197 -
2.198 - self.last_offset += self.read_number()
2.199 -
2.200 - return self.last_docnum, self.last_offset
2.201 -
2.202 -class FieldDictionaryWriter:
2.203 -
2.204 - "Writing field dictionary details."
2.205 -
2.206 - def __init__(self, field_writer, field_index_writer, interval):
2.207 - self.field_writer = field_writer
2.208 - self.field_index_writer = field_index_writer
2.209 - self.interval = interval
2.210 - self.entry = 0
2.211 -
2.212 - def write_fields(self, docnum, fields):
2.213 -
2.214 - "Write details of the given 'docnum' and 'fields'."
2.215 -
2.216 - if self.entry == 0:
2.217 - docnum_size = sizeof(docnum)
2.218 - self.field_writer.begin(docnum_size)
2.219 - self.field_index_writer.begin(docnum_size)
2.220 - self.field_index_writer.reset()
2.221 -
2.222 - if self.entry % self.interval == 0:
2.223 - self.field_writer.reset()
2.224 - offset = self.field_writer.tell()
2.225 - self.field_writer.write_fields(docnum, fields)
2.226 - self.field_index_writer.write_document(docnum, offset)
2.227 - else:
2.228 - self.field_writer.write_fields(docnum, fields)
2.229 -
2.230 - self.entry += 1
2.231 -
2.232 - def close(self):
2.233 - self.field_writer.close()
2.234 - self.field_index_writer.close()
2.235 -
2.236 -class FieldDictionaryReader:
2.237 -
2.238 - "Reading field dictionary details."
2.239 -
2.240 - def __init__(self, field_reader, field_index_reader):
2.241 - self.field_reader = field_reader
2.242 - self.field_index_reader = field_index_reader
2.243 -
2.244 - self.field_reader.reset()
2.245 - self.field_index_reader.reset()
2.246 -
2.247 - self.cache = {}
2.248 -
2.249 - self.entry = 0
2.250 - self.docs = []
2.251 - try:
2.252 - while 1:
2.253 - self.docs.append(self.field_index_reader.read_document())
2.254 - except EOFError:
2.255 - pass
2.256 -
2.257 - # Large numbers for ordering purposes.
2.258 -
2.259 - if self.docs:
2.260 - self.max_offset = self.docs[-1][1]
2.261 - else:
2.262 - self.max_offset = None
2.263 -
2.264 - # Iterator convenience methods.
2.265 -
2.266 - def __iter__(self):
2.267 - self.rewind()
2.268 - return self
2.269 -
2.270 - def next(self):
2.271 - try:
2.272 - return self.read_fields()
2.273 - except EOFError:
2.274 - raise StopIteration
2.275 -
2.276 - # Sequential access methods.
2.277 -
2.278 - def rewind(self):
2.279 - self.field_reader.rewind()
2.280 -
2.281 - def read_fields(self):
2.282 -
2.283 - "Return the next document number and fields."
2.284 -
2.285 - try:
2.286 - return self.field_reader.read_fields()
2.287 - except EOFError:
2.288 - self.entry += 1
2.289 - try:
2.290 - found_docnum, offset = self.docs[self.entry]
2.291 - except IndexError:
2.292 - raise EOFError
2.293 - else:
2.294 - self.field_reader.reset()
2.295 - return self.field_reader.read_fields()
2.296 -
2.297 - # Random access methods.
2.298 -
2.299 - def get_fields(self, docnum):
2.300 -
2.301 - "Read the fields of the document with the given 'docnum'."
2.302 -
2.303 - if self.cache.has_key(docnum):
2.304 - return self.cache[docnum]
2.305 -
2.306 - i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
2.307 -
2.308 - # Get the entry position providing the term or one preceding it.
2.309 -
2.310 - if i == -1:
2.311 - return None
2.312 -
2.313 - found_docnum, offset = self.docs[i]
2.314 -
2.315 - # Read from the fields file.
2.316 -
2.317 - found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
2.318 -
2.319 - # Scan for the document, if necessary.
2.320 -
2.321 - try:
2.322 - while docnum > found_docnum:
2.323 - found_docnum, fields = self.field_reader.read_fields()
2.324 - except EOFError:
2.325 - pass
2.326 -
2.327 - # If the document is found, return the fields.
2.328 -
2.329 - if docnum == found_docnum:
2.330 -
2.331 - # Store the fields in the cache, removing entries if the limit has
2.332 - # been reached.
2.333 -
2.334 - keys = self.cache.keys()
2.335 -
2.336 - if len(keys) == DOCUMENT_CACHE_LIMIT:
2.337 - del self.cache[keys[0]]
2.338 -
2.339 - self.cache[docnum] = fields
2.340 - return fields
2.341 - else:
2.342 - return None
2.343 -
2.344 - def close(self):
2.345 - self.field_reader.close()
2.346 - self.field_index_reader.close()
2.347 -
2.348 -# vim: tabstop=4 expandtab shiftwidth=4
3.1 --- a/iixr/files.py Sat Feb 12 01:23:58 2011 +0100
3.2 +++ b/iixr/files.py Sun Feb 13 02:49:55 2011 +0100
3.3 @@ -22,10 +22,6 @@
3.4 from array import array
3.5 import zlib
3.6
3.7 -# Constants.
3.8 -
3.9 -CACHE_SIZE = 100000
3.10 -
3.11 # Classes.
3.12
3.13 class File:
3.14 @@ -35,14 +31,21 @@
3.15 def __init__(self, f):
3.16 self.f = f
3.17 self.record = array('B') # record buffer
3.18 - self.cache = array('B')
3.19 + self.data_start = None
3.20 +
3.21 + def begin(self):
3.22 +
3.23 + """
3.24 + Initialise file-wide parameters. In writers, this method may require
3.25 + parameters to be specified. In readers, the parameters may be read from
3.26 + the file.
3.27 + """
3.28 +
3.29 self.data_start = 0
3.30
3.31 - def reset(self):
3.32 -
3.33 - "To be used to reset the state of the reader or writer between records."
3.34 -
3.35 - pass
3.36 + def tell(self):
3.37 + # NOTE: Will not be accurate within the current record.
3.38 + return self.f.tell()
3.39
3.40 def seek(self, offset):
3.41 self.f.seek(offset)
3.42 @@ -60,27 +63,26 @@
3.43
3.44 "Writing basic data types to files."
3.45
3.46 - def __init__(self, f):
3.47 - File.__init__(self, f)
3.48 - self.written = 0
3.49 -
3.50 - def tell(self):
3.51 - # NOTE: Will not be accurate within the current record.
3.52 - return self.written
3.53 -
3.54 def begin_record(self):
3.55 pass
3.56
3.57 def end_record(self):
3.58 if self.record:
3.59 - length = len(self.record)
3.60 - before = len(self.cache)
3.61 - vint_to_array(length, self.cache)
3.62 - length_size = len(self.cache) - before
3.63 - self.cache += self.record
3.64 - self.written += length_size + length
3.65 + self.f.write(vint(len(self.record)))
3.66 + self.record.tofile(self.f)
3.67 self.record = array('B')
3.68 - self.flush_cache()
3.69 +
3.70 + def write_remaining(self, a):
3.71 +
3.72 + "Write remaining data from the raw array 'a'."
3.73 +
3.74 + self.record += a
3.75 +
3.76 + def write_byte(self, b):
3.77 +
3.78 + "Write the given byte 'b'."
3.79 +
3.80 + self.record.append(b)
3.81
3.82 def write_number(self, number):
3.83
3.84 @@ -137,25 +139,17 @@
3.85 self.write_sequence_value(value, size)
3.86
3.87 def write_delta_sequence(self, values, size):
3.88 - convert_sequence(values, get_subtractor(values[0]))
3.89 - self.write_sequence_values(values, size)
3.90 + self.write_sequence_values(
3.91 + convert_sequence(values, get_subtractor(size), 1),
3.92 + size)
3.93
3.94 def write_monotonic_sequence(self, values, size):
3.95 - convert_sequence(values, get_monotonic_subtractor(values[0]))
3.96 - self.write_sequence_values(values, size)
3.97 -
3.98 - def flush(self, force=0):
3.99 - self.end_record()
3.100 - self.flush_cache(force)
3.101 -
3.102 - def flush_cache(self, force=0):
3.103 - if self.f is not None:
3.104 - if force or len(self.cache) > CACHE_SIZE:
3.105 - self.cache.tofile(self.f)
3.106 - self.cache = array('B')
3.107 + self.write_sequence_values(
3.108 + convert_sequence(values, get_monotonic_subtractor(size), 1),
3.109 + size)
3.110
3.111 def close(self):
3.112 - self.flush(1)
3.113 + self.end_record()
3.114 File.close(self)
3.115
3.116 class FileReader(File):
3.117 @@ -164,58 +158,33 @@
3.118
3.119 def __init__(self, f):
3.120 File.__init__(self, f)
3.121 - self.record_start = 0
3.122 - self.record_end = 0
3.123 - self.cache_start = 0
3.124 self.begin()
3.125
3.126 - def begin(self):
3.127 -
3.128 - "Initialise file-wide parameters."
3.129 -
3.130 - pass
3.131 -
3.132 def begin_record(self):
3.133 self.start = 0
3.134 + self.record = array('B')
3.135 try:
3.136 size = self.read_number_from_file()
3.137 - self.record = self.from_cache(size)
3.138 + self.record.fromfile(self.f, size)
3.139 except EOFError:
3.140 pass
3.141
3.142 def end_record(self):
3.143 pass
3.144
3.145 - def seek(self, offset):
3.146 - from_cache_start = offset - self.cache_start
3.147 - if 0 <= from_cache_start < len(self.cache):
3.148 - self.record_start = self.record_end = from_cache_start
3.149 - else:
3.150 - self.f.seek(offset)
3.151 - self.cache = array('B')
3.152 - self.cache_start = offset
3.153 - self.record_start = self.record_end = 0
3.154 - self.reset()
3.155 + def read_remaining(self):
3.156
3.157 - def tell(self):
3.158 - return self.cache_start + self.record_start + self.start
3.159 + "Read remaining data as a raw array."
3.160 +
3.161 + return self.record[self.start:]
3.162
3.163 - def ensure_cache(self, size):
3.164 - if size > len(self.cache) - self.record_end:
3.165 - self.cache = self.cache[self.record_end:]
3.166 - self.cache_start += self.record_end
3.167 - s = self.f.read(CACHE_SIZE)
3.168 - self.cache.fromstring(s)
3.169 - self.record_start = 0
3.170 - if not s:
3.171 - raise EOFError
3.172 - else:
3.173 - self.record_start = self.record_end
3.174 - self.record_end = self.record_start + size
3.175 + def read_byte(self):
3.176 +
3.177 + "Read a byte from the record."
3.178
3.179 - def from_cache(self, size):
3.180 - self.ensure_cache(size)
3.181 - return self.cache[self.record_start:self.record_end]
3.182 + b = self.record[self.start]
3.183 + self.start += 1
3.184 + return b
3.185
3.186 def read_number_from_file(self):
3.187
3.188 @@ -224,13 +193,13 @@
3.189 # Read each byte, adding it to the number.
3.190
3.191 a = array('B')
3.192 - a += self.from_cache(1)
3.193 + a.fromfile(self.f, 1)
3.194 csd = a[-1]
3.195 if csd < 128:
3.196 return csd
3.197 else:
3.198 while csd & 128:
3.199 - a += self.from_cache(1)
3.200 + a.fromfile(self.f, 1)
3.201 csd = a[-1]
3.202 return vint_from_array(a)
3.203
3.204 @@ -292,13 +261,9 @@
3.205 return values
3.206
3.207 def read_delta_sequence(self, size):
3.208 - values = self.read_sequences(size)
3.209 - convert_sequence(values, get_adder(values[0]))
3.210 - return values
3.211 + return convert_sequence(self.read_sequences(size), get_adder(size), 0)
3.212
3.213 def read_monotonic_sequence(self, size):
3.214 - values = self.read_sequences(size)
3.215 - convert_sequence(values, get_monotonic_adder(values[0]))
3.216 - return values
3.217 + return convert_sequence(self.read_sequences(size), get_monotonic_adder(size), 0)
3.218
3.219 # vim: tabstop=4 expandtab shiftwidth=4
4.1 --- a/iixr/filesystem.py Sat Feb 12 01:23:58 2011 +0100
4.2 +++ b/iixr/filesystem.py Sun Feb 13 02:49:55 2011 +0100
4.3 @@ -3,7 +3,7 @@
4.4 """
4.5 File access.
4.6
4.7 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
4.8 +Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
4.9
4.10 This program is free software; you can redistribute it and/or modify it under
4.11 the terms of the GNU General Public License as published by the Free Software
4.12 @@ -18,9 +18,7 @@
4.13 with this program. If not, see <http://www.gnu.org/licenses/>.
4.14 """
4.15
4.16 -from iixr.fields import *
4.17 from iixr.terms import *
4.18 -from iixr.positions import *
4.19 from os import listdir, remove, rename # partition manipulation
4.20 from shutil import copy # index updating
4.21 from os.path import join
4.22 @@ -32,8 +30,7 @@
4.23
4.24 # Constants.
4.25
4.26 -TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
4.27 -FIELD_FILENAMES = "fields", "fields_index"
4.28 +TERM_FILENAMES = "terms",
4.29
4.30 # Utility functions.
4.31
4.32 @@ -49,7 +46,9 @@
4.33 partitions = set()
4.34 for filename in listdir(pathname):
4.35 if filename.startswith(prefix):
4.36 - partitions.add(filename[prefix_length:])
4.37 + partition = filename[prefix_length:]
4.38 + if partition.isdigit():
4.39 + partitions.add(int(partition))
4.40 return partitions
4.41
4.42 def get_term_partitions(pathname):
4.43 @@ -61,95 +60,40 @@
4.44
4.45 return get_partitions(pathname, "terms-")
4.46
4.47 -def get_field_partitions(pathname):
4.48 +def get_next_partition(partitions):
4.49 + return max(partitions or [-1]) + 1
4.50 +
4.51 +def get_term_writer(pathname, partition):
4.52
4.53 """
4.54 - Return a set of field partition identifiers for partitions residing at the
4.55 - given 'pathname'.
4.56 - """
4.57 -
4.58 - return get_partitions(pathname, "fields-")
4.59 -
4.60 -def get_next_partition(partitions):
4.61 - return max([int(n) for n in partitions if n.isdigit()] or [-1]) + 1
4.62 -
4.63 -def get_term_writer(pathname, partition, interval, doc_interval):
4.64 -
4.65 - """
4.66 - Return a term dictionary writer using files under the given 'pathname'
4.67 - labelled according to the given 'partition', using the given indexing
4.68 - 'interval' for terms and 'doc_interval' for document position records.
4.69 + Return a term writer using files under the given 'pathname' labelled
4.70 + according to the given 'partition'.
4.71 """
4.72
4.73 - tdf = open(join(pathname, "terms-%s" % partition), "wb")
4.74 - info_writer = TermWriter(tdf)
4.75 -
4.76 - tdif = open(join(pathname, "terms_index-%s" % partition), "wb")
4.77 - index_writer = TermIndexWriter(tdif)
4.78 -
4.79 - tpf = open(join(pathname, "positions-%s" % partition), "wb")
4.80 - positions_writer = PositionWriter(tpf)
4.81 -
4.82 - tpif = open(join(pathname, "positions_index-%s" % partition), "wb")
4.83 - positions_index_writer = PositionIndexWriter(tpif)
4.84 -
4.85 - positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)
4.86 -
4.87 - return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
4.88 + f = open(join(pathname, "terms-%s" % partition), "wb")
4.89 + return TermWriter(f)
4.90
4.91 -def get_field_writer(pathname, partition, interval):
4.92 -
4.93 - """
4.94 - Return a field dictionary writer using files under the given 'pathname'
4.95 - labelled according to the given 'partition', using the given indexing
4.96 - 'interval'.
4.97 - """
4.98 -
4.99 - ff = open(join(pathname, "fields-%s" % partition), "wb")
4.100 - field_writer = FieldWriter(ff)
4.101 -
4.102 - fif = open(join(pathname, "fields_index-%s" % partition), "wb")
4.103 - field_index_writer = FieldIndexWriter(fif)
4.104 -
4.105 - return FieldDictionaryWriter(field_writer, field_index_writer, interval)
4.106 +def get_reader(pathname, name, partition, cls):
4.107 + f = open(join(pathname, "%s-%s" % (name, partition)), "rb")
4.108 + return cls(f)
4.109
4.110 def get_term_reader(pathname, partition):
4.111
4.112 """
4.113 - Return a term dictionary reader using files under the given 'pathname'
4.114 + Return a term reader using files under the given 'pathname' labelled
4.115 + according to the given 'partition'.
4.116 + """
4.117 +
4.118 + return get_reader(pathname, "terms", partition, TermIterator)
4.119 +
4.120 +def get_term_data_reader(pathname, partition):
4.121 +
4.122 + """
4.123 + Return a term plus data reader using files under the given 'pathname'
4.124 labelled according to the given 'partition'.
4.125 """
4.126
4.127 - tdf = open(join(pathname, "terms-%s" % partition), "rb")
4.128 - info_reader = TermReader(tdf)
4.129 -
4.130 - tdif = open(join(pathname, "terms_index-%s" % partition), "rb")
4.131 - index_reader = TermIndexReader(tdif)
4.132 -
4.133 - pf = open(join(pathname, "positions-%s" % partition), "rb")
4.134 - position_reader = PositionReader(pf)
4.135 -
4.136 - pif = open(join(pathname, "positions_index-%s" % partition), "rb")
4.137 - position_index_reader = PositionIndexReader(pif)
4.138 -
4.139 - position_dict_reader = PositionDictionaryReader(position_reader, position_index_reader)
4.140 -
4.141 - return TermDictionaryReader(info_reader, index_reader, position_dict_reader)
4.142 -
4.143 -def get_field_reader(pathname, partition):
4.144 -
4.145 - """
4.146 - Return a field dictionary reader using files under the given 'pathname'
4.147 - labelled according to the given 'partition'.
4.148 - """
4.149 -
4.150 - ff = open(join(pathname, "fields-%s" % partition), "rb")
4.151 - field_reader = FieldReader(ff)
4.152 -
4.153 - fif = open(join(pathname, "fields_index-%s" % partition), "rb")
4.154 - field_index_reader = FieldIndexReader(fif)
4.155 -
4.156 - return FieldDictionaryReader(field_reader, field_index_reader)
4.157 + return get_reader(pathname, "terms", partition, TermDataIterator)
4.158
4.159 # Renaming.
4.160
4.161 @@ -160,9 +104,6 @@
4.162 def rename_term_files(pathname, from_partition, to_partition):
4.163 rename_files(pathname, TERM_FILENAMES, from_partition, to_partition)
4.164
4.165 -def rename_field_files(pathname, from_partition, to_partition):
4.166 - rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
4.167 -
4.168 # Removal/deletion.
4.169
4.170 def remove_files(pathname, names, partition):
4.171 @@ -172,9 +113,6 @@
4.172 def remove_term_files(pathname, partition):
4.173 remove_files(pathname, TERM_FILENAMES, partition)
4.174
4.175 -def remove_field_files(pathname, partition):
4.176 - remove_files(pathname, FIELD_FILENAMES, partition)
4.177 -
4.178 # Copying.
4.179
4.180 def copy_files(source, names, partition, destination, suffix):
4.181 @@ -185,7 +123,4 @@
4.182 def copy_term_files(source, partition, destination, suffix):
4.183 copy_files(source, TERM_FILENAMES, partition, destination, suffix)
4.184
4.185 -def copy_field_files(source, partition, destination, suffix):
4.186 - copy_files(source, FIELD_FILENAMES, partition, destination, suffix)
4.187 -
4.188 # vim: tabstop=4 expandtab shiftwidth=4
5.1 --- a/iixr/index.py Sat Feb 12 01:23:58 2011 +0100
5.2 +++ b/iixr/index.py Sun Feb 13 02:49:55 2011 +0100
5.3 @@ -19,18 +19,14 @@
5.4 """
5.5
5.6 from iixr.filesystem import *
5.7 -from iixr.merging import *
5.8 -from itertools import islice
5.9 +from itermerge import itermerge
5.10 from os import mkdir # index discovery
5.11 from os.path import exists
5.12 +import operator
5.13
5.14 # Constants.
5.15
5.16 -TERM_INTERVAL = 100
5.17 -DOCUMENT_INTERVAL = 100
5.18 -FIELD_INTERVAL = 100
5.19 FLUSH_INTERVAL = 10000
5.20 -POSITIONS_FLUSH_INTERVAL = 1000000
5.21 OPEN_PARTITIONS = 20
5.22
5.23 # High-level classes.
5.24 @@ -39,11 +35,9 @@
5.25
5.26 "A container of document information."
5.27
5.28 - def __init__(self, docnum, fields=None):
5.29 + def __init__(self, docnum):
5.30 self.docnum = docnum
5.31 - self.fields = fields or []
5.32 self.terms = {}
5.33 - self.field_dict = None
5.34
5.35 def add_position(self, term, position):
5.36
5.37 @@ -54,55 +48,18 @@
5.38
5.39 self.terms.setdefault(term, []).append(position)
5.40
5.41 - def add_field(self, identifier, value):
5.42 -
5.43 - "Add a field having the given 'identifier' and 'value'."
5.44 -
5.45 - self.fields.append((identifier, unicode(value))) # convert to string
5.46 -
5.47 - def set_fields(self, fields):
5.48 -
5.49 - """
5.50 - Set the document's 'fields': a list of tuples each containing an integer
5.51 - identifier and a string value.
5.52 - """
5.53 -
5.54 - self.fields = fields
5.55 -
5.56 - def _ensure_dict(self):
5.57 - if self.field_dict is None:
5.58 - self.field_dict = dict(self.fields)
5.59 -
5.60 - def keys(self):
5.61 - self._ensure_dict()
5.62 - return self.field_dict.keys()
5.63 -
5.64 - def __getitem__(self, key):
5.65 - self._ensure_dict()
5.66 - return self.field_dict[key]
5.67 -
5.68 class IndexWriter:
5.69
5.70 - """
5.71 - Building term information and writing it to the term and field dictionaries.
5.72 - """
5.73 + "Building term information and writing it to the term dictionary."
5.74
5.75 - def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval, positions_flush_interval):
5.76 + def __init__(self, pathname, flush_interval):
5.77 self.pathname = pathname
5.78 - self.interval = interval
5.79 - self.doc_interval = doc_interval
5.80 - self.field_interval = field_interval
5.81 self.flush_interval = flush_interval
5.82 - self.positions_flush_interval = positions_flush_interval
5.83
5.84 - self.dict_partition = get_next_partition(get_term_partitions(self.pathname))
5.85 - self.field_dict_partition = get_next_partition(get_field_partitions(self.pathname))
5.86 + self.term_partition = get_next_partition(get_term_partitions(self.pathname))
5.87
5.88 self.terms = {}
5.89 - self.docs = []
5.90 -
5.91 self.doc_counter = 0
5.92 - self.position_counter = 0
5.93
5.94 def add_document(self, doc):
5.95
5.96 @@ -115,134 +72,105 @@
5.97
5.98 for term, positions in doc.terms.items():
5.99 self.terms.setdefault(term, {})[docnum] = positions
5.100 - self.position_counter += len(positions)
5.101 -
5.102 - self.docs.append((docnum, doc.fields))
5.103
5.104 self.doc_counter += 1
5.105
5.106 - if self.flush_interval and self.doc_counter >= self.flush_interval or \
5.107 - self.positions_flush_interval and self.position_counter >= self.positions_flush_interval:
5.108 -
5.109 + if self.flush_interval and self.doc_counter >= self.flush_interval:
5.110 self.flush_terms()
5.111 - self.flush_fields()
5.112 self.doc_counter = 0
5.113 - self.position_counter = 0
5.114
5.115 def get_term_writer(self):
5.116
5.117 - "Return a term dictionary writer for the current partition."
5.118 -
5.119 - return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)
5.120 + "Return a term writer for the current partition."
5.121
5.122 - def get_field_writer(self):
5.123 -
5.124 - "Return a field dictionary writer for the current partition."
5.125 -
5.126 - return get_field_writer(self.pathname, self.field_dict_partition, self.field_interval)
5.127 + return get_term_writer(self.pathname, self.term_partition)
5.128
5.129 def flush_terms(self):
5.130
5.131 - "Flush terms into the current term dictionary partition."
5.132 + "Flush terms into the current term partition."
5.133
5.134 # Get the terms in order.
5.135
5.136 - all_terms = self.terms
5.137 - terms = all_terms.keys()
5.138 - terms.sort()
5.139 -
5.140 - dict_writer = self.get_term_writer()
5.141 -
5.142 - for term in terms:
5.143 - doc_positions = all_terms[term].items()
5.144 - dict_writer.write_term_positions(term, doc_positions)
5.145 -
5.146 - dict_writer.close()
5.147 + term_writer = self.get_term_writer()
5.148 + try:
5.149 + term_writer.write_terms(self.terms)
5.150 + finally:
5.151 + term_writer.close()
5.152
5.153 self.terms = {}
5.154 - self.dict_partition += 1
5.155 -
5.156 - def flush_fields(self):
5.157 -
5.158 - "Flush fields into the current term dictionary partition."
5.159 -
5.160 - # Get the documents in order.
5.161 -
5.162 - self.docs.sort()
5.163 -
5.164 - field_dict_writer = self.get_field_writer()
5.165 - for docnum, fields in self.docs:
5.166 - field_dict_writer.write_fields(docnum, fields)
5.167 - field_dict_writer.close()
5.168 -
5.169 - self.docs = []
5.170 - self.field_dict_partition += 1
5.171 + self.term_partition += 1
5.172
5.173 def close(self):
5.174 if self.terms or not get_term_partitions(self.pathname):
5.175 self.flush_terms()
5.176 - if self.docs or not get_field_partitions(self.pathname):
5.177 - self.flush_fields()
5.178 +
5.179 +class IndexReader(itermerge):
5.180 +
5.181 + "Accessing the term dictionaries."
5.182
5.183 -class IndexReader:
5.184 + def __init__(self, pathname, get_reader=None, combine=None):
5.185
5.186 - "Accessing the term and field dictionaries."
5.187 + # Get the partitions in order.
5.188 +
5.189 + partitions = list(get_term_partitions(pathname))
5.190 + partitions.sort()
5.191
5.192 - def __init__(self, pathname):
5.193 - self.dict_reader = get_term_reader(pathname, "merged")
5.194 - self.field_dict_reader = get_field_reader(pathname, "merged")
5.195 + # Initialise the underlying term partition readers.
5.196
5.197 - # Sequential access.
5.198 + self.readers = [(get_reader or get_term_reader)(pathname, partition) for partition in partitions]
5.199 + self.combine = combine or operator.add
5.200 +
5.201 + # Initialise this object as an iterator over the readers.
5.202
5.203 - def read_term(self):
5.204 - return self.dict_reader.read_term()
5.205 + itermerge.__init__(self, self.readers)
5.206 + self.next_value = None
5.207
5.208 - def go_to_term(self, term):
5.209 - return self.dict_reader._get_term_and_positions(*self.dict_reader.go_to_term(term))
5.210 + def get_sizes(self):
5.211
5.212 - # Query access.
5.213 + # Readers must have compatible sizes.
5.214
5.215 - def get_terms(self):
5.216 - return self.dict_reader.get_terms()
5.217 -
5.218 - def find_terms(self, term):
5.219 - return self.dict_reader.find_terms(term)
5.220 + if self.readers:
5.221 + return self.readers[0].get_sizes()
5.222 + else:
5.223 + return 0, 0
5.224
5.225 - def find_positions(self, term):
5.226 - return self.dict_reader.find_positions(term)
5.227 + def next(self):
5.228 + if self.next_value is not None:
5.229 + term, positions = self.next_value
5.230 + else:
5.231 + term, positions = itermerge.next(self)
5.232
5.233 - def find_common_positions(self, terms):
5.234 - return self.dict_reader.find_common_positions(terms)
5.235 +        # Look at the next item to see if it has positions for the current
5.236 + # term.
5.237
5.238 - def get_frequency(self, term):
5.239 - return self.dict_reader.get_frequency(term)
5.240 -
5.241 - def get_document_frequency(self, term):
5.242 - return self.dict_reader.get_document_frequency(term)
5.243 + try:
5.244 + t, p = itermerge.next(self)
5.245 + while t == term:
5.246 + positions = self.combine(positions, p)
5.247 + t, p = itermerge.next(self)
5.248 + self.next_value = t, p
5.249
5.250 - def get_fields(self, docnum):
5.251 - return self.field_dict_reader.get_fields(docnum)
5.252 +        # Where no item could be fetched, clear the lookahead so that a
5.253
5.254 - def get_document(self, docnum):
5.255 - return Document(docnum, self.get_fields(docnum))
5.256 + except StopIteration:
5.257 + self.next_value = None
5.258 +
5.259 + return term, positions
5.260
5.261 def close(self):
5.262 - self.dict_reader.close()
5.263 - self.field_dict_reader.close()
5.264 + for reader in self.readers:
5.265 + reader.close()
5.266 + self.readers = []
5.267
5.268 class Index:
5.269
5.270 "An inverted index solution encapsulating the various components."
5.271
5.272 - def __init__(self, pathname, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, field_interval=FIELD_INTERVAL,
5.273 - flush_interval=FLUSH_INTERVAL, positions_flush_interval=POSITIONS_FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS):
5.274 + def __init__(self, pathname, flush_interval=FLUSH_INTERVAL,
5.275 + open_partitions=OPEN_PARTITIONS):
5.276
5.277 self.pathname = pathname
5.278 - self.interval = interval
5.279 - self.doc_interval = doc_interval
5.280 - self.field_interval = field_interval
5.281 self.flush_interval = flush_interval
5.282 - self.positions_flush_interval = positions_flush_interval
5.283 self.open_partitions = open_partitions
5.284 self.reader = None
5.285 self.writer = None
5.286 @@ -251,132 +179,60 @@
5.287
5.288 "Return a writer."
5.289
5.290 - self._ensure_directory()
5.291 - self.writer = IndexWriter(self.pathname, self.interval, self.doc_interval,
5.292 - self.field_interval, self.flush_interval, self.positions_flush_interval)
5.293 + if self.writer is None:
5.294 + self._ensure_directory()
5.295 + self.writer = IndexWriter(self.pathname, self.flush_interval)
5.296 return self.writer
5.297
5.298 def _ensure_directory(self):
5.299 if not exists(self.pathname):
5.300 mkdir(self.pathname)
5.301
5.302 - def get_reader(self, partition=0):
5.303 -
5.304 - "Return a reader for the index."
5.305 -
5.306 - # Ensure that only one partition exists.
5.307 -
5.308 - self.merge()
5.309 - return self._get_reader(partition)
5.310 -
5.311 - def _get_reader(self, partition):
5.312 + def get_reader(self, refresh=0):
5.313
5.314 "Return a reader for the index."
5.315
5.316 - if not exists(self.pathname):
5.317 - raise OSError, "Index path %r does not exist." % self.pathname
5.318 -
5.319 - self.reader = IndexReader(self.pathname)
5.320 - return self.reader
5.321 -
5.322 - def get_term_partitions(self):
5.323 + if refresh and self.reader is not None:
5.324 + self.reader.close()
5.325 + self.reader = None
5.326
5.327 - "Return a set of term partition identifiers."
5.328 -
5.329 - return get_term_partitions(self.pathname)
5.330 -
5.331 - def get_field_partitions(self):
5.332 -
5.333 - "Return a set of field partition identifiers."
5.334 -
5.335 - return get_field_partitions(self.pathname)
5.336 + if self.reader is None:
5.337 + if not exists(self.pathname):
5.338 + raise OSError, "Index path %r does not exist." % self.pathname
5.339 + self.reader = IndexReader(self.pathname)
5.340 + return self.reader
5.341
5.342 def merge(self):
5.343
5.344 - "Merge/optimise index partitions."
5.345 -
5.346 - self._merge_terms()
5.347 - self._merge_fields()
5.348 -
5.349 - def _merge_dictionaries(self, get_partitions, rename_files, remove_files, get_reader, get_writer, get_merger, intervals):
5.350 -
5.351 - "Merge term or field dictionaries."
5.352 -
5.353 - partitions = get_partitions()
5.354 -
5.355 - # Ensure the correct labelling of a single partition.
5.356 -
5.357 - if len(partitions) == 1:
5.358 - partition = list(partitions)[0]
5.359 - if partition != "merged":
5.360 - rename_files(self.pathname, partition, "merged")
5.361 - return
5.362 + "Merge the partitions in the index."
5.363
5.364 - # Merge the partitions.
5.365 -
5.366 - old_merged_counter = 0
5.367 -
5.368 - while len(partitions) > 1:
5.369 -
5.370 - if "merged" in partitions:
5.371 - rename_files(self.pathname, "merged", "old-merged-%d" % old_merged_counter)
5.372 - partitions.remove("merged")
5.373 - partitions.add("old-merged-%d" % old_merged_counter)
5.374 - old_merged_counter += 1
5.375 -
5.376 - # Process only a certain number at once, avoiding resource limits.
5.377 -
5.378 - active_partitions = list(islice(partitions, self.open_partitions))
5.379 -
5.380 - readers = []
5.381 - for partition in active_partitions:
5.382 - readers.append(get_reader(self.pathname, partition))
5.383 -
5.384 - # Write directly to a dictionary.
5.385 + reader = IndexReader(self.pathname, get_term_data_reader, self.merge_data)
5.386 + writer = get_term_writer(self.pathname, "merged")
5.387 + try:
5.388 + writer.begin(*reader.get_sizes())
5.389 + for term, data in reader:
5.390 + writer.write_term_plus_remaining(term, data)
5.391 + writer.end_record()
5.392 + finally:
5.393 + writer.close()
5.394 + reader.close()
5.395
5.396 - writer = get_writer(self.pathname, "merged", *intervals)
5.397 - merger = get_merger(writer, readers)
5.398 - merger.merge()
5.399 - merger.close()
5.400 -
5.401 - # Remove old files.
5.402 -
5.403 - for partition in active_partitions:
5.404 - remove_files(self.pathname, partition)
5.405 + for partition in get_term_partitions(self.pathname):
5.406 + remove_term_files(self.pathname, partition)
5.407
5.408 - # Acquire the partitions to check their number again.
5.409 -
5.410 - partitions = get_partitions()
5.411 -
5.412 - def _merge_terms(self):
5.413 + rename_term_files(self.pathname, "merged", 0)
5.414
5.415 - "Merge term dictionaries."
5.416 -
5.417 - self._merge_dictionaries(self.get_term_partitions, rename_term_files,
5.418 - remove_term_files, get_term_reader, get_term_writer,
5.419 - TermDictionaryMerger, [self.interval, self.doc_interval])
5.420 + def merge_data(self, a, b):
5.421
5.422 - def _merge_fields(self):
5.423 -
5.424 - "Merge field dictionaries."
5.425 -
5.426 - self._merge_dictionaries(self.get_field_partitions, rename_field_files,
5.427 - remove_field_files, get_field_reader, get_field_writer,
5.428 - FieldDictionaryMerger, [self.field_interval])
5.429 -
5.430 - def update(self, other_indexes):
5.431 + """
5.432 + Merge 'a' and 'b', modifying the data to permit concatenation.
5.433 + """
5.434
5.435 - "Copy the content of the 'other_indexes' into this index and merge."
5.436 -
5.437 - self._ensure_directory()
5.438 + # Modify the record to indicate a continuation of the data.
5.439
5.440 - for i, index in enumerate(other_indexes):
5.441 - for partition in index.get_term_partitions():
5.442 - copy_term_files(index.pathname, partition, self.pathname, "-added-%d" % i)
5.443 - for partition in index.get_field_partitions():
5.444 - copy_field_files(index.pathname, partition, self.pathname, "-added-%d" % i)
5.445 -
5.446 - self.merge()
5.447 + c = a + b
5.448 + c[len(a) - 1] = 1
5.449 + return c
5.450
5.451 def close(self):
5.452 if self.reader is not None:
6.1 --- a/iixr/merging.py Sat Feb 12 01:23:58 2011 +0100
6.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
6.3 @@ -1,89 +0,0 @@
6.4 -#!/usr/bin/env python
6.5 -
6.6 -"""
6.7 -Dictionary merging classes.
6.8 -
6.9 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
6.10 -
6.11 -This program is free software; you can redistribute it and/or modify it under
6.12 -the terms of the GNU General Public License as published by the Free Software
6.13 -Foundation; either version 3 of the License, or (at your option) any later
6.14 -version.
6.15 -
6.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
6.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6.18 -PARTICULAR PURPOSE. See the GNU General Public License for more details.
6.19 -
6.20 -You should have received a copy of the GNU General Public License along
6.21 -with this program. If not, see <http://www.gnu.org/licenses/>.
6.22 -"""
6.23 -
6.24 -from itermerge import itermerge
6.25 -
6.26 -class Merger:
6.27 -
6.28 - "Merge files."
6.29 -
6.30 - def __init__(self, writer, readers):
6.31 - self.writer = writer
6.32 - self.readers = readers
6.33 -
6.34 - def close(self):
6.35 - for reader in self.readers:
6.36 - reader.close()
6.37 - self.readers = []
6.38 - if self.writer is not None:
6.39 - self.writer.close()
6.40 - self.writer = None
6.41 -
6.42 -class TermDictionaryMerger(Merger):
6.43 -
6.44 - "Merge term and position files."
6.45 -
6.46 - def merge(self):
6.47 -
6.48 - """
6.49 - Merge terms and positions from the readers, sending them to the writer.
6.50 - """
6.51 -
6.52 - last_term = None
6.53 - current_readers = []
6.54 -
6.55 - for term, frequency, doc_frequency, positions in itermerge(self.readers):
6.56 - if term == last_term:
6.57 - current_readers.append(positions)
6.58 - else:
6.59 - if current_readers:
6.60 - self.writer.write_term_positions(last_term, itermerge(current_readers))
6.61 - last_term = term
6.62 - current_readers = [positions]
6.63 - else:
6.64 - if current_readers:
6.65 - self.writer.write_term_positions(last_term, itermerge(current_readers))
6.66 -
6.67 -class FieldDictionaryMerger(Merger):
6.68 -
6.69 - "Merge field files."
6.70 -
6.71 - def merge(self):
6.72 -
6.73 - """
6.74 - Merge fields from the readers, sending them to the writer.
6.75 - """
6.76 -
6.77 - last_docnum = None
6.78 - current_fields = []
6.79 -
6.80 - for docnum, fields in itermerge(self.readers):
6.81 - if docnum == last_docnum:
6.82 - current_fields += fields
6.83 - else:
6.84 - if current_fields:
6.85 - self.writer.write_fields(last_docnum, current_fields)
6.86 - last_docnum = docnum
6.87 - current_fields = fields
6.88 - else:
6.89 - if current_fields:
6.90 - self.writer.write_fields(last_docnum, current_fields)
6.91 -
6.92 -# vim: tabstop=4 expandtab shiftwidth=4
7.1 --- a/iixr/positions.py Sat Feb 12 01:23:58 2011 +0100
7.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
7.3 @@ -1,566 +0,0 @@
7.4 -#!/usr/bin/env python
7.5 -
7.6 -"""
7.7 -Specific classes for storing position information.
7.8 -
7.9 -Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
7.10 -
7.11 -This program is free software; you can redistribute it and/or modify it under
7.12 -the terms of the GNU General Public License as published by the Free Software
7.13 -Foundation; either version 3 of the License, or (at your option) any later
7.14 -version.
7.15 -
7.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
7.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
7.18 -PARTICULAR PURPOSE. See the GNU General Public License for more details.
7.19 -
7.20 -You should have received a copy of the GNU General Public License along
7.21 -with this program. If not, see <http://www.gnu.org/licenses/>.
7.22 -"""
7.23 -
7.24 -from iixr.data import *
7.25 -from iixr.files import *
7.26 -
7.27 -class PositionWriter(FileWriter):
7.28 -
7.29 - "Writing position information to files."
7.30 -
7.31 - def begin(self, docnum_size, position_size):
7.32 - self.write_numbers((docnum_size, position_size))
7.33 - self.end_record()
7.34 - self.data_start = self.tell()
7.35 - self.docnum_size = docnum_size
7.36 - self.position_size = position_size
7.37 -
7.38 - def reset(self):
7.39 - self.end_record()
7.40 - self.last_docnum = None
7.41 - self.subtractor = None
7.42 -
7.43 - def write_positions(self, docnum, positions):
7.44 -
7.45 - """
7.46 - Write for the document 'docnum' the given 'positions'.
7.47 - """
7.48 -
7.49 - if not positions:
7.50 - return
7.51 -
7.52 - # Make sure that the positions are sorted.
7.53 -
7.54 - positions.sort()
7.55 -
7.56 - # Calculate an ongoing delta.
7.57 -
7.58 - if self.last_docnum is not None:
7.59 - if docnum < self.last_docnum:
7.60 - raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
7.61 -
7.62 - docnum_seq = self.subtractor(docnum, self.last_docnum)
7.63 -
7.64 - # Or preserve the document number and prepare for future deltas.
7.65 -
7.66 - else:
7.67 - self.subtractor = get_subtractor(docnum)
7.68 - docnum_seq = docnum
7.69 -
7.70 - self.write_sequence_value(docnum_seq, self.docnum_size)
7.71 - self.write_monotonic_sequence(positions, self.position_size)
7.72 -
7.73 - self.last_docnum = docnum
7.74 -
7.75 -class PositionReader(FileReader):
7.76 -
7.77 - "Reading position information within term-specific regions of a file."
7.78 -
7.79 - def begin(self):
7.80 - self.begin_record()
7.81 - try:
7.82 - self.docnum_size, self.position_size = self.read_numbers(2)
7.83 - except EOFError:
7.84 - self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
7.85 - self.data_start = self.tell()
7.86 -
7.87 - def reset(self):
7.88 - self.last_docnum = None
7.89 - self.adder = None
7.90 - self.begin_record()
7.91 -
7.92 - def read_positions(self):
7.93 -
7.94 - """
7.95 - Read positions, returning a document number and a list of positions.
7.96 - """
7.97 -
7.98 - # Read the document number.
7.99 -
7.100 - docnum = self.read_sequence_value(self.docnum_size)
7.101 -
7.102 - # Calculate an ongoing delta.
7.103 -
7.104 - if self.last_docnum is not None:
7.105 - self.last_docnum = self.adder(docnum, self.last_docnum)
7.106 -
7.107 - # Or preserve the document number and prepare for future deltas.
7.108 -
7.109 - else:
7.110 - self.adder = get_adder(docnum)
7.111 - self.last_docnum = docnum
7.112 -
7.113 - positions = self.read_monotonic_sequence(self.position_size)
7.114 -
7.115 - return self.last_docnum, positions
7.116 -
7.117 -class PositionIndexWriter(PositionWriter):
7.118 -
7.119 - "Writing position index information to files."
7.120 -
7.121 - def begin(self, docnum_size):
7.122 - PositionWriter.begin(self, docnum_size, 0)
7.123 -
7.124 - def reset(self):
7.125 - PositionWriter.reset(self)
7.126 - self.last_pos_offset = 0
7.127 -
7.128 - def write_positions(self, docnum, pos_offset, count):
7.129 -
7.130 - """
7.131 - Write the given 'docnum, 'pos_offset' and document 'count' to the
7.132 - position index file.
7.133 - """
7.134 -
7.135 - # Find the size of document number values.
7.136 -
7.137 - if self.last_docnum is not None:
7.138 - docnum_seq = self.subtractor(docnum, self.last_docnum)
7.139 - else:
7.140 - self.subtractor = get_subtractor(docnum)
7.141 - docnum_seq = docnum
7.142 -
7.143 - self.write_sequence_value(docnum_seq, self.docnum_size)
7.144 - self.write_number(pos_offset - self.last_pos_offset)
7.145 - self.write_number(count)
7.146 -
7.147 - self.last_docnum = docnum
7.148 - self.last_pos_offset = pos_offset
7.149 -
7.150 -class PositionIndexReader(PositionReader):
7.151 -
7.152 - "Reading position index information within term-specific regions of a file."
7.153 -
7.154 - def reset(self):
7.155 - PositionReader.reset(self)
7.156 - self.last_pos_offset = 0
7.157 -
7.158 - def read_positions(self):
7.159 -
7.160 - """
7.161 - Read a document number, a position file offset for the position index
7.162 - file, and the number of documents in a section of that file.
7.163 - """
7.164 -
7.165 - # Read the document number.
7.166 -
7.167 - docnum = self.read_sequence_value(self.docnum_size)
7.168 -
7.169 - if self.last_docnum is not None:
7.170 - self.last_docnum = self.adder(docnum, self.last_docnum)
7.171 - else:
7.172 - self.adder = get_adder(docnum)
7.173 - self.last_docnum = docnum
7.174 -
7.175 - # Read the offset delta.
7.176 -
7.177 - self.last_pos_offset += self.read_number()
7.178 -
7.179 - # Read the document count.
7.180 -
7.181 - count = self.read_number()
7.182 -
7.183 - return self.last_docnum, self.last_pos_offset, count
7.184 -
7.185 -# Iterators for position-related files.
7.186 -
7.187 -class IteratorBase:
7.188 -
7.189 - "Support for iterating over results."
7.190 -
7.191 - def __init__(self, reader):
7.192 -
7.193 - "Initialise the iterator using the given 'reader'."
7.194 -
7.195 - self.reader = reader
7.196 - self.replenish(0) # no iteration initially permitted
7.197 -
7.198 - def replenish(self, count):
7.199 -
7.200 - "Replenish the iterator with 'count' results."
7.201 -
7.202 - self.count = count
7.203 - self.read_documents = 0
7.204 -
7.205 - def __len__(self):
7.206 -
7.207 - "Return the total number of results."
7.208 -
7.209 - return self.count
7.210 -
7.211 - def sort(self):
7.212 - pass # Stored document positions are already sorted.
7.213 -
7.214 - def __iter__(self):
7.215 - return self
7.216 -
7.217 -class PositionIterator(IteratorBase):
7.218 -
7.219 - "Iterating over document positions."
7.220 -
7.221 - def replenish(self, count):
7.222 - IteratorBase.replenish(self, count)
7.223 -
7.224 - # Fill a cache of positions.
7.225 -
7.226 - self.cache = []
7.227 - n = 0
7.228 -
7.229 - while n < self.count:
7.230 - self.cache.append(self.reader.read_positions())
7.231 - n += 1
7.232 -
7.233 - def seek(self, offset, count):
7.234 -
7.235 - """
7.236 - Seek to 'offset' in the file, limiting the number of documents available
7.237 - for reading to 'count'.
7.238 - """
7.239 -
7.240 - self.reader.seek(offset)
7.241 - self.replenish(count)
7.242 -
7.243 - def next(self):
7.244 -
7.245 - "Read positions for a single document."
7.246 -
7.247 - if self.read_documents < self.count:
7.248 - positions = self.cache[self.read_documents]
7.249 - self.read_documents += 1
7.250 - return positions
7.251 - else:
7.252 - raise StopIteration
7.253 -
7.254 -class PositionIndexIterator(IteratorBase):
7.255 -
7.256 - "Iterating over document positions."
7.257 -
7.258 - def replenish(self, count):
7.259 - IteratorBase.replenish(self, count)
7.260 -
7.261 - # Fill a cache of offsets.
7.262 -
7.263 - self.cache = []
7.264 - self.current = 0
7.265 - n = 0
7.266 -
7.267 - while n < self.count:
7.268 - docnum, pos_offset, section_count = t = self.reader.read_positions()
7.269 - self.cache.append(t)
7.270 - n += section_count
7.271 -
7.272 - def seek(self, offset, doc_frequency):
7.273 -
7.274 - """
7.275 - Seek to 'offset' in the file, limiting the number of documents available
7.276 - for reading to 'doc_frequency'.
7.277 - """
7.278 -
7.279 - self.reader.seek(offset)
7.280 - self.replenish(doc_frequency)
7.281 -
7.282 - def next(self):
7.283 -
7.284 - "Read positions for a single document."
7.285 -
7.286 - if self.current < len(self.cache):
7.287 - docnum, pos_offset, self.section_count = t = self.cache[self.current]
7.288 - self.current += 1
7.289 - return t
7.290 - else:
7.291 - raise StopIteration
7.292 -
7.293 -class PositionDictionaryWriter:
7.294 -
7.295 - "Writing position dictionaries."
7.296 -
7.297 - def __init__(self, position_writer, position_index_writer, interval):
7.298 - self.position_writer = position_writer
7.299 - self.position_index_writer = position_index_writer
7.300 - self.interval = interval
7.301 -
7.302 - def write_term_positions(self, doc_positions):
7.303 -
7.304 - """
7.305 - Write all 'doc_positions' - a collection of tuples of the form (document
7.306 - number, position list) - to the file.
7.307 -
7.308 - Add some records to the index, making dictionary entries.
7.309 -
7.310 - Return a tuple containing the offset of the written data, the frequency
7.311 - (number of positions), and document frequency (number of documents) for
7.312 - the term involved.
7.313 - """
7.314 -
7.315 - # Write the positions.
7.316 -
7.317 - frequency = 0
7.318 - count = 0
7.319 -
7.320 - if doc_positions:
7.321 - doc_positions.sort()
7.322 -
7.323 - # Look ahead at the first document record.
7.324 - # NOTE: Any iterator would need to support this.
7.325 -
7.326 - first_docnum, first_positions = doc_positions[0]
7.327 - first_position = first_positions[0]
7.328 -
7.329 - # Write out size details.
7.330 -
7.331 - docnum_size, position_size = sizeof(first_docnum), sizeof(first_position)
7.332 - self.position_writer.begin(docnum_size, position_size)
7.333 - self.position_index_writer.begin(docnum_size)
7.334 -
7.335 - # Reset the writers.
7.336 -
7.337 - self.position_writer.reset()
7.338 - self.position_index_writer.reset()
7.339 -
7.340 - # Remember the first index entry offset.
7.341 -
7.342 - index_offset = self.position_index_writer.tell()
7.343 -
7.344 - # Retain the first record offset for a subsequent index entry.
7.345 -
7.346 - first_offset = self.position_writer.tell()
7.347 -
7.348 - for docnum, positions in doc_positions:
7.349 - if first_docnum is None:
7.350 - first_docnum = docnum
7.351 -
7.352 - self.position_writer.write_positions(docnum, positions)
7.353 -
7.354 - frequency += len(positions)
7.355 - count += 1
7.356 -
7.357 - # Every {interval} entries, write an index entry.
7.358 -
7.359 - if count % self.interval == 0:
7.360 -
7.361 - self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
7.362 -
7.363 - # Reset the position writer so that position readers accessing
7.364 - # a section start with the correct document number.
7.365 -
7.366 - self.position_writer.reset()
7.367 -
7.368 - first_offset = self.position_writer.tell()
7.369 - first_docnum = None
7.370 -
7.371 - # Finish writing an index entry for the remaining documents.
7.372 -
7.373 - else:
7.374 - if first_docnum is not None:
7.375 - self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
7.376 -
7.377 - return index_offset, frequency, count
7.378 -
7.379 - def close(self):
7.380 - self.position_writer.close()
7.381 - self.position_index_writer.close()
7.382 -
7.383 -class PositionDictionaryReader:
7.384 -
7.385 - "Access to position dictionary entries through iterators."
7.386 -
7.387 - def __init__(self, position_reader, position_index_reader):
7.388 - self.position_reader = position_reader
7.389 - self.position_index_reader = position_index_reader
7.390 -
7.391 - def read_term_positions(self, offset, doc_frequency):
7.392 - iterator = PositionDictionaryIterator(
7.393 - PositionIterator(self.position_reader),
7.394 - PositionIndexIterator(self.position_index_reader)
7.395 - )
7.396 - iterator.seek(offset, doc_frequency)
7.397 - return iterator
7.398 -
7.399 - def close(self):
7.400 - self.position_reader.close()
7.401 - self.position_index_reader.close()
7.402 -
7.403 -class PositionDictionaryIterator:
7.404 -
7.405 - "Iteration over position dictionary entries."
7.406 -
7.407 - def __init__(self, position_iterator, position_index_iterator):
7.408 - self.position_iterator = position_iterator
7.409 - self.position_index_iterator = position_index_iterator
7.410 - self.reset()
7.411 -
7.412 - def reset(self):
7.413 -
7.414 - # Remember the last values.
7.415 -
7.416 - self.found_docnum, self.found_positions = None, None
7.417 -
7.418 - # Maintain state for the next index entry, if read.
7.419 -
7.420 - self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
7.421 -
7.422 - def seek(self, offset, doc_frequency):
7.423 -
7.424 - """
7.425 - Seek to 'offset' in the index file, limiting the number of documents
7.426 - available for reading to 'doc_frequency'.
7.427 - """
7.428 -
7.429 - self.reset()
7.430 -
7.431 - # Seek to the appropriate index entry.
7.432 -
7.433 - self.position_index_iterator.seek(offset, doc_frequency)
7.434 -
7.435 - # Initialise the current index entry and current position file iterator.
7.436 -
7.437 - self._next_section()
7.438 - self._init_section()
7.439 -
7.440 - # Sequence methods.
7.441 -
7.442 - def __len__(self):
7.443 - return len(self.position_index_iterator)
7.444 -
7.445 - def sort(self):
7.446 - pass
7.447 -
7.448 - # Iterator methods.
7.449 -
7.450 - def __iter__(self):
7.451 - return self
7.452 -
7.453 - def next(self):
7.454 -
7.455 - """
7.456 - Attempt to get the next document record from the section in the
7.457 - positions file.
7.458 - """
7.459 -
7.460 - # Return any visited but unrequested record.
7.461 -
7.462 - if self.found_docnum is not None:
7.463 - t = self.found_docnum, self.found_positions
7.464 - self.found_docnum, self.found_positions = None, None
7.465 - return t
7.466 -
7.467 - # Or search for the next record.
7.468 -
7.469 - while 1:
7.470 -
7.471 - # Either return the next record.
7.472 -
7.473 - try:
7.474 - return self.position_iterator.next()
7.475 -
7.476 - # Or, where a section is finished, get the next section and try again.
7.477 -
7.478 - except StopIteration:
7.479 -
7.480 - # Although, where a single iterator is in use, the file reader
7.481 - # would be positioned appropriately, this is not guaranteed in a
7.482 - # multiple iterator situation.
7.483 -
7.484 - self._next_section()
7.485 - self._init_section()
7.486 -
7.487 - def from_document(self, docnum):
7.488 -
7.489 - """
7.490 - Attempt to navigate to a positions entry for the given 'docnum',
7.491 - returning the positions for 'docnum', or None otherwise.
7.492 - """
7.493 -
7.494 - # Return any unrequested document positions.
7.495 -
7.496 - if docnum == self.found_docnum:
7.497 - return self.found_positions
7.498 -
7.499 - # Read ahead in the index until the next entry refers to a document
7.500 - # later than the desired document.
7.501 -
7.502 - try:
7.503 - if self.next_docnum is None:
7.504 - self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
7.505 -
7.506 - # Read until the next entry is after the desired document number,
7.507 - # or until the end of the results.
7.508 -
7.509 - while self.next_docnum <= docnum:
7.510 - self._next_read_section()
7.511 - if self.docnum < docnum:
7.512 - self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
7.513 - else:
7.514 - break
7.515 -
7.516 - except StopIteration:
7.517 - pass
7.518 -
7.519 - # Navigate in the position file to the document.
7.520 -
7.521 - self._init_section()
7.522 -
7.523 - try:
7.524 - while 1:
7.525 - found_docnum, found_positions = self.position_iterator.next()
7.526 -
7.527 - # Return the desired document positions or None (retaining the
7.528 - # positions for the document immediately after).
7.529 -
7.530 - if docnum <= found_docnum:
7.531 - self.found_docnum, self.found_positions = found_docnum, found_positions
7.532 - if docnum == found_docnum:
7.533 - return found_positions
7.534 - elif docnum < found_docnum:
7.535 - return None
7.536 -
7.537 - except StopIteration:
7.538 - return None
7.539 -
7.540 - # Internal methods.
7.541 -
7.542 - def _next_section(self):
7.543 -
7.544 - "Attempt to get the next section in the index."
7.545 -
7.546 - if self.next_docnum is None:
7.547 - self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()
7.548 - else:
7.549 - self._next_read_section()
7.550 -
7.551 - def _next_read_section(self):
7.552 -
7.553 - """
7.554 - Make the next index entry the current one without reading from the
7.555 - index.
7.556 - """
7.557 -
7.558 - self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
7.559 - self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
7.560 -
7.561 - def _init_section(self):
7.562 -
7.563 - "Initialise the iterator for the section in the position file."
7.564 -
7.565 - # Seek to the position entry.
7.566 -
7.567 - self.position_iterator.seek(self.pos_offset, self.section_count)
7.568 -
7.569 -# vim: tabstop=4 expandtab shiftwidth=4
8.1 --- a/iixr/terms.py Sat Feb 12 01:23:58 2011 +0100
8.2 +++ b/iixr/terms.py Sun Feb 13 02:49:55 2011 +0100
8.3 @@ -18,29 +18,87 @@
8.4 with this program. If not, see <http://www.gnu.org/licenses/>.
8.5 """
8.6
8.7 +from iixr.data import *
8.8 from iixr.files import *
8.9 -from iixr.positions import *
8.10 from iixr.phrases import PhraseIterator
8.11 from os.path import commonprefix # to find common string prefixes
8.12 -from bisect import bisect_right # to find terms in the dictionary index
8.13
8.14 class TermWriter(FileWriter):
8.15
8.16 "Writing term information to files."
8.17
8.18 - def reset(self):
8.19 + def begin(self, docnum_size, position_size):
8.20 +
8.21 + "Begin writing to the file."
8.22 +
8.23 + self.write_numbers((docnum_size, position_size))
8.24 self.end_record()
8.25 +
8.26 + self.data_start = self.tell()
8.27 + self.docnum_size = docnum_size
8.28 + self.position_size = position_size
8.29 + self.subtractor = get_subtractor(docnum_size)
8.30 self.last_term = ""
8.31 - self.last_offset = 0
8.32
8.33 - def write_term(self, term, offset, frequency, doc_frequency):
8.34 + def write_terms(self, terms):
8.35
8.36 """
8.37 - Write the given 'term', its position file 'offset', its 'frequency' and
8.38 - its 'doc_frequency' (number of documents in which it appears) to the
8.39 - term information file.
8.40 + Write the 'terms' to the term information file, with each term's details
8.41 + stored in a separate record.
8.42 """
8.43
8.44 + if hasattr(terms, "items"):
8.45 + terms = terms.items()
8.46 + terms.sort()
8.47 +
8.48 + for term, doc_positions in terms:
8.49 + if not doc_positions:
8.50 + continue
8.51 +
8.52 + if hasattr(doc_positions, "items"):
8.53 + doc_positions = doc_positions.items()
8.54 +
8.55 + docnum, positions = doc_positions[0]
8.56 +
8.57 + if not positions:
8.58 + continue
8.59 +
8.60 + # Start the writing, if appropriate.
8.61 +
8.62 + if self.data_start is None:
8.63 + self.begin(sizeof(docnum), sizeof(positions[0]))
8.64 +
8.65 + # Write each term and document positions.
8.66 +
8.67 + self.write_term(term, doc_positions)
8.68 + self.end_record()
8.69 +
8.70 + # Methods requiring an open record.
8.71 +
8.72 + def write_term(self, term, doc_positions):
8.73 +
8.74 + """
8.75 + Write the given 'term', its document frequency (number of documents in
8.76 + which it appears), and 'doc_positions' to the term information file.
8.77 + """
8.78 +
8.79 + self.write_term_only(term)
8.80 +
8.81 + # Write the document frequency and the term positions.
8.82 +
8.83 + self.write_positions(doc_positions)
8.84 +
8.85 + def write_term_plus_remaining(self, term, data):
8.86 +
8.87 + "Write the given 'term' and the document position 'data'."
8.88 +
8.89 + self.write_term_only(term)
8.90 + self.write_remaining(data)
8.91 +
8.92 + def write_term_only(self, term):
8.93 +
8.94 + "Write only the given 'term'."
8.95 +
8.96 if term <= self.last_term:
8.97 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
8.98
8.99 @@ -52,430 +110,173 @@
8.100 self.write_number(common)
8.101 self.write_string(suffix)
8.102
8.103 - # Write the offset delta.
8.104 - # Write the frequency.
8.105 + self.last_term = term
8.106 +
8.107 + def write_positions(self, doc_positions):
8.108 +
8.109 + "Write the given 'doc_positions' to the file."
8.110 +
8.111 + # Make sure that the positions are sorted.
8.112 +
8.113 + doc_positions.sort()
8.114 +
8.115 # Write the document frequency.
8.116
8.117 - self.write_numbers((
8.118 - offset - self.last_offset,
8.119 - frequency,
8.120 - doc_frequency
8.121 - ))
8.122 + self.write_number(len(doc_positions))
8.123 +
8.124 + last_docnum = None
8.125 +
8.126 + for docnum, positions in doc_positions:
8.127 +
8.128 + # Store the first document number as it is.
8.129 +
8.130 + if last_docnum is None:
8.131 + docnum_seq = docnum
8.132 +
8.133 + # Reject out-of-order documents.
8.134 +
8.135 + elif docnum < last_docnum:
8.136 + raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum)
8.137
8.138 - self.last_term = term
8.139 - self.last_offset = offset
8.140 + # Calculate an ongoing delta.
8.141 +
8.142 + else:
8.143 + docnum_seq = self.subtractor(docnum, last_docnum)
8.144 +
8.145 + # Write the document number and positions.
8.146 +
8.147 + self.write_sequence_value(docnum_seq, self.docnum_size)
8.148 + self.write_monotonic_sequence(positions, self.position_size)
8.149 +
8.150 + last_docnum = docnum
8.151 +
8.152 + # Write a terminating byte to indicate that no more document pages
8.153 + # exist.
8.154 +
8.155 + self.write_byte(0)
8.156
8.157 class TermReader(FileReader):
8.158
8.159 "Reading term information from files."
8.160
8.161 - def reset(self):
8.162 + def begin(self):
8.163 +
8.164 + "Begin reading from the file."
8.165 +
8.166 + self.begin_record()
8.167 + try:
8.168 + self.docnum_size, self.position_size = self.read_numbers(2)
8.169 + except EOFError:
8.170 + self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
8.171 +
8.172 + self.data_start = self.tell()
8.173 + self.adder = get_adder(self.docnum_size)
8.174 self.last_term = ""
8.175 - self.last_offset = 0
8.176 - self.begin_record()
8.177 +
8.178 + def get_sizes(self):
8.179 + return self.docnum_size, self.position_size
8.180 +
8.181 + # Methods requiring an open record.
8.182
8.183 def read_term(self):
8.184
8.185 + "Read a term and its document positions from the term information file."
8.186 +
8.187 + # Read the term.
8.188 +
8.189 + self.read_term_only()
8.190 +
8.191 + # Read the document frequency and the term positions.
8.192 +
8.193 + positions = self.read_positions()
8.194 +
8.195 + return self.last_term, positions
8.196 +
8.197 + def read_term_plus_remaining(self):
8.198 +
8.199 """
8.200 - Read a term, its position file offset, its frequency and its document
8.201 - frequency from the term information file.
8.202 + Read a term and the unprocessed document position data.
8.203 """
8.204
8.205 + self.read_term_only()
8.206 + return self.last_term, self.read_remaining()
8.207 +
8.208 + def read_term_only(self):
8.209 +
8.210 + "Read a term only."
8.211 +
8.212 # Read the prefix length and term suffix.
8.213
8.214 common = self.read_number()
8.215 suffix = self.read_string()
8.216
8.217 self.last_term = self.last_term[:common] + suffix
8.218 -
8.219 - # Read the offset delta.
8.220 -
8.221 - self.last_offset += self.read_number()
8.222 -
8.223 - # Read the frequency.
8.224 -
8.225 - frequency = self.read_number()
8.226 -
8.227 - # Read the document frequency.
8.228 -
8.229 - doc_frequency = self.read_number()
8.230 + return self.last_term
8.231
8.232 - return self.last_term, self.last_offset, frequency, doc_frequency
8.233 -
8.234 - def go_to_term(self, term, offset, info_offset):
8.235 -
8.236 - """
8.237 - Seek past the entry for 'term' having 'offset' to 'info_offset'. This
8.238 - permits the scanning for later terms from the specified term.
8.239 - """
8.240 -
8.241 - self.seek(info_offset)
8.242 - self.last_term = term
8.243 - self.last_offset = offset
8.244 -
8.245 -class TermIndexWriter(TermWriter):
8.246 + def read_positions(self):
8.247
8.248 - "Writing term dictionary index details to files."
8.249 -
8.250 - def reset(self):
8.251 - TermWriter.reset(self)
8.252 - self.last_info_offset = 0
8.253 -
8.254 - def write_term(self, term, offset, frequency, doc_frequency, info_offset):
8.255 -
8.256 - """
8.257 - Write the given 'term', its position file 'offset', its 'frequency' and
8.258 - its 'doc_frequency' to the term dictionary index file, along with the
8.259 - 'info_offset' in the term information file.
8.260 - """
8.261 + "Read document positions from the term information file."
8.262
8.263 - TermWriter.write_term(self, term, offset, frequency, doc_frequency)
8.264 -
8.265 - # Write the information file offset delta.
8.266 -
8.267 - self.write_number(info_offset - self.last_info_offset)
8.268 -
8.269 - self.last_info_offset = info_offset
8.270 + doc_positions = []
8.271
8.272 -class TermIndexReader(TermReader):
8.273 -
8.274 - "Reading term dictionary index details from files."
8.275 -
8.276 - def reset(self):
8.277 - TermReader.reset(self)
8.278 - self.last_info_offset = 0
8.279 + while 1:
8.280
8.281 - def read_term(self):
8.282 -
8.283 - """
8.284 - Read a term, its position file offset, its frequency, its document
8.285 - frequency and a term information file offset from the term dictionary
8.286 - index file.
8.287 - """
8.288 -
8.289 - term, offset, frequency, doc_frequency = TermReader.read_term(self)
8.290 -
8.291 - # Read the offset delta.
8.292 -
8.293 - self.last_info_offset += self.read_number()
8.294 + # Read the document frequency.
8.295
8.296 - return term, offset, frequency, doc_frequency, self.last_info_offset
8.297 -
8.298 -class TermDictionaryWriter:
8.299 -
8.300 - "Writing term dictionaries."
8.301 -
8.302 - def __init__(self, info_writer, index_writer, position_dict_writer, interval):
8.303 - self.info_writer = info_writer
8.304 - self.index_writer = index_writer
8.305 - self.position_dict_writer = position_dict_writer
8.306 - self.interval = interval
8.307 - self.entry = 0
8.308 -
8.309 - self.index_writer.reset()
8.310 + npositions = self.read_number()
8.311
8.312 - def _write_term(self, term, offset, frequency, doc_frequency):
8.313 -
8.314 - """
8.315 - Write the given 'term', its position file 'offset', its 'frequency' and
8.316 - its 'doc_frequency' (number of documents in which it appears) to the
8.317 - term information file. Return the offset before the term information was
8.318 - written to the file.
8.319 - """
8.320 -
8.321 - if self.entry % self.interval == 0:
8.322 - self.info_writer.reset()
8.323 - info_offset = self.info_writer.tell()
8.324 - self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
8.325 + last_docnum = None
8.326 + i = 0
8.327 + while i < npositions:
8.328
8.329 - self.info_writer.write_term(term, offset, frequency, doc_frequency)
8.330 - self.entry += 1
8.331 -
8.332 - def write_term_positions(self, term, doc_positions):
8.333 -
8.334 - """
8.335 - Write the given 'term' and the 'doc_positions' recording the documents
8.336 - and positions at which the term is found.
8.337 - """
8.338 -
8.339 - offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
8.340 -
8.341 - if not frequency or not doc_frequency:
8.342 - raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions)
8.343 -
8.344 - self._write_term(term, offset, frequency, doc_frequency)
8.345 + # Read the document number.
8.346
8.347 - def close(self):
8.348 - self.info_writer.close()
8.349 - self.index_writer.close()
8.350 - self.position_dict_writer.close()
8.351 -
8.352 -class TermDictionaryReader:
8.353 -
8.354 - "Reading term dictionaries."
8.355 + docnum = self.read_sequence_value(self.docnum_size)
8.356 + if last_docnum is not None:
8.357 + docnum = self.adder(docnum, last_docnum)
8.358
8.359 - def __init__(self, info_reader, index_reader, position_dict_reader):
8.360 - self.info_reader = info_reader
8.361 - self.index_reader = index_reader
8.362 - self.position_dict_reader = position_dict_reader
8.363 -
8.364 - self.info_reader.reset()
8.365 - self.index_reader.reset()
8.366 -
8.367 - self.entry = 0
8.368 - self.terms = []
8.369 - try:
8.370 - while 1:
8.371 - self.terms.append(self.index_reader.read_term())
8.372 - except EOFError:
8.373 - pass
8.374 -
8.375 - # Large numbers for ordering purposes.
8.376 + # Read the positions.
8.377
8.378 - if self.terms:
8.379 - self.max_offset = self.terms[-1][1] + 1
8.380 - else:
8.381 - self.max_offset = None
8.382 -
8.383 - def _find_closest_entry(self, term):
8.384 -
8.385 - """
8.386 - Find the offsets and frequencies of 'term' from the term dictionary or
8.387 - the closest term starting with the value of 'term'.
8.388 -
8.389 - Return the closest index entry consisting of a term, the position file
8.390 - offset, the term frequency, the document frequency, and the term details
8.391 - file offset.
8.392 - """
8.393 + positions = self.read_monotonic_sequence(self.position_size)
8.394 + doc_positions.append((docnum, positions))
8.395
8.396 - i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
8.397 -
8.398 - # Get the entry position providing the term or one preceding it.
8.399 - # If no entry precedes the requested term, return the very first entry
8.400 - # as the closest.
8.401 -
8.402 - if i == -1:
8.403 - self.entry = 0
8.404 - return self.terms[0]
8.405 - else:
8.406 - self.entry = i
8.407 - return self.terms[i]
8.408 -
8.409 - def _find_closest_term(self, term):
8.410 -
8.411 - """
8.412 - Find the offsets and frequencies of 'term' from the term dictionary or
8.413 - the closest term starting with the value of 'term'.
8.414 + last_docnum = docnum
8.415 + i += 1
8.416
8.417 - Return the closest term (or the term itself), the position file offset,
8.418 - the term frequency, the document frequency, and the term details file
8.419 - offset (or None if the reader is already positioned).
8.420 - """
8.421 -
8.422 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
8.423 -
8.424 - # Where the term is found immediately, return the offset and
8.425 - # frequencies. If the term does not appear, return the details of the
8.426 - # closest entry.
8.427 -
8.428 - if term <= found_term:
8.429 - return found_term, offset, frequency, doc_frequency, info_offset
8.430 + # Read a terminating byte to discover whether more document pages
8.431 + # exist.
8.432
8.433 - # Otherwise, seek past the index term's entry in the information file
8.434 - # and scan for the desired term.
8.435 -
8.436 - else:
8.437 - # Reset the term and offset for the new page.
8.438 - self.info_reader.go_to_term("", 0, info_offset)
8.439 - try:
8.440 - while term > found_term:
8.441 - found_term, offset, frequency, doc_frequency = self._read_term()
8.442 - except EOFError:
8.443 - pass
8.444 -
8.445 - return found_term, offset, frequency, doc_frequency, None
8.446 -
8.447 - def _find_term(self, term):
8.448 + if not self.read_byte():
8.449 + break
8.450
8.451 - """
8.452 - Find the position file offset and frequency of 'term' from the term
8.453 - dictionary.
8.454 - """
8.455 -
8.456 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
8.457 -
8.458 - # If the term is found, return the offset and frequencies.
8.459 -
8.460 - if term == found_term:
8.461 - return offset, frequency, doc_frequency
8.462 - else:
8.463 - return None
8.464 -
8.465 - def _get_term_and_positions(self, term, offset, frequency, doc_frequency):
8.466 + return doc_positions
8.467
8.468 - """
8.469 - Return the term plus positions details using the given 'term', 'offset',
8.470 - 'frequency' and 'doc_frequency'.
8.471 - """
8.472 -
8.473 - return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency)
8.474 -
8.475 - def _get_positions(self, offset, doc_frequency):
8.476 +class TermIterator(TermReader):
8.477
8.478 - """
8.479 - Obtain positions from the position index 'offset' expecting a number of
8.480 - documents equal to the given 'doc_frequency'.
8.481 - """
8.482 -
8.483 - return self.position_dict_reader.read_term_positions(offset, doc_frequency)
8.484 -
8.485 - # Iterator convenience methods.
8.486 + "An iterator over terms and positions read from a file."
8.487
8.488 def __iter__(self):
8.489 - self.rewind()
8.490 return self
8.491
8.492 def next(self):
8.493 try:
8.494 + self.begin_record()
8.495 return self.read_term()
8.496 except EOFError:
8.497 raise StopIteration
8.498
8.499 - # Sequential access methods.
8.500 -
8.501 - def rewind(self):
8.502 - self.entry = 0
8.503 - self.info_reader.rewind()
8.504 -
8.505 - def read_term(self):
8.506 -
8.507 - """
8.508 - Return the next term, its frequency, its document frequency, and the
8.509 - documents and positions at which the term is found.
8.510 - """
8.511 -
8.512 - return self._get_term_and_positions(*self._read_term())
8.513 -
8.514 - def _read_term(self):
8.515 -
8.516 - try:
8.517 - term, offset, frequency, doc_frequency = self.info_reader.read_term()
8.518 - except EOFError:
8.519 - self.entry += 1
8.520 - try:
8.521 - term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry]
8.522 - except IndexError:
8.523 - raise EOFError
8.524 - else:
8.525 - # Reset the term and offset for the new page.
8.526 -
8.527 - self.info_reader.go_to_term("", 0, info_offset)
8.528 -
8.529 - # Skip the term in the information file.
8.530 -
8.531 - self.info_reader.read_term()
8.532 +class TermDataIterator(TermReader):
8.533
8.534 - return term, offset, frequency, doc_frequency
8.535 -
8.536 - def go_to_term(self, term):
8.537 -
8.538 - """
8.539 - Navigate to 'term' in the dictionary, returning the details from its
8.540 - entry. The returned details can be augmented with position information
8.541 - when presented to the _get_term_and_positions method.
8.542 - """
8.543 -
8.544 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
8.545 -
8.546 - # Position the reader, if necessary.
8.547 -
8.548 - if info_offset is not None:
8.549 + "An iterator over terms and unprocessed document positions data."
8.550
8.551 - # Reset the term and offset for the new page.
8.552 -
8.553 - self.info_reader.go_to_term("", 0, info_offset)
8.554 -
8.555 - # Skip the term in the information file.
8.556 -
8.557 - self.info_reader.read_term()
8.558 -
8.559 - return found_term, offset, frequency, doc_frequency
8.560 -
8.561 - # Query methods.
8.562 -
8.563 - def get_terms(self):
8.564 -
8.565 - "Return a list of all terms."
8.566 -
8.567 - return iter(self)
8.568 + def __iter__(self):
8.569 + return self
8.570
8.571 - def find_terms(self, term):
8.572 -
8.573 - "Return all terms whose values start with the value of 'term'."
8.574 -
8.575 - terms = []
8.576 -
8.577 - found_term, offset, frequency, doc_frequency = self.go_to_term(term)
8.578 -
8.579 - # Read and record terms.
8.580 -
8.581 + def next(self):
8.582 try:
8.583 - # Add the found term if it starts with the specified term.
8.584 -
8.585 - while found_term.startswith(term):
8.586 - terms.append(found_term)
8.587 - found_term, offset, frequency, doc_frequency = self._read_term()
8.588 -
8.589 + self.begin_record()
8.590 + return self.read_term_plus_remaining()
8.591 except EOFError:
8.592 - pass
8.593 -
8.594 - return terms
8.595 -
8.596 - def find_positions(self, term):
8.597 -
8.598 - "Return the documents and positions at which the given 'term' is found."
8.599 -
8.600 - t = self._find_term(term)
8.601 - if t is None:
8.602 - return []
8.603 - else:
8.604 - offset, frequency, doc_frequency = t
8.605 - return self._get_positions(offset, doc_frequency)
8.606 -
8.607 - def find_common_positions(self, terms):
8.608 -
8.609 - """
8.610 - Return the documents and positions at which all the given 'terms' are
8.611 - found, where only common documents are returned.
8.612 - """
8.613 -
8.614 - return PhraseIterator([self.find_positions(term) for term in terms])
8.615 -
8.616 - def get_frequency(self, term):
8.617 -
8.618 - "Return the frequency of the given 'term'."
8.619 -
8.620 - t = self._find_term(term)
8.621 - if t is None:
8.622 - return None
8.623 - else:
8.624 - offset, frequency, doc_frequency = t
8.625 - return frequency
8.626 -
8.627 - def get_document_frequency(self, term):
8.628 -
8.629 - "Return the document frequency of the given 'term'."
8.630 -
8.631 - t = self._find_term(term)
8.632 - if t is None:
8.633 - return None
8.634 - else:
8.635 - offset, frequency, doc_frequency = t
8.636 - return doc_frequency
8.637 -
8.638 - def close(self):
8.639 - self.info_reader.close()
8.640 - self.index_reader.close()
8.641 - self.position_dict_reader.close()
8.642 + raise StopIteration
8.643
8.644 # vim: tabstop=4 expandtab shiftwidth=4
9.1 --- a/itermerge.py Sat Feb 12 01:23:58 2011 +0100
9.2 +++ b/itermerge.py Sun Feb 13 02:49:55 2011 +0100
9.3 @@ -3,7 +3,7 @@
9.4 """
9.5 An iterator merging class similar to heapq.merge in Python 2.6.
9.6
9.7 -Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
9.8 +Copyright (C) 2009, 2011 Paul Boddie <paul@boddie.org.uk>
9.9
9.10 This program is free software; you can redistribute it and/or modify it under
9.11 the terms of the GNU General Public License as published by the Free Software
10.1 --- a/test.py Sat Feb 12 01:23:58 2011 +0100
10.2 +++ b/test.py Sun Feb 13 02:49:55 2011 +0100
10.3 @@ -1,22 +1,21 @@
10.4 #!/usr/bin/env python
10.5 +# encoding: iso-8859-1
10.6
10.7 from iixr.files import *
10.8 -from iixr.fields import *
10.9 from iixr.terms import *
10.10 -from iixr.positions import *
10.11 from iixr.index import *
10.12 import os, sys
10.13
10.14 # Remove old test files.
10.15
10.16 -for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"):
10.17 +for filename in ("test", "testMS", "testNMS", "testP", "testP2"):
10.18 try:
10.19 os.remove(filename)
10.20 except OSError:
10.21 pass
10.22
10.23 try:
10.24 - for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"):
10.25 + for dirname in ("test_index",):
10.26 for filename in os.listdir(dirname):
10.27 os.remove(os.path.join(dirname, filename))
10.28 os.rmdir(dirname)
10.29 @@ -98,22 +97,20 @@
10.30 ]
10.31
10.32 f = open("testP", "wb")
10.33 -w = PositionWriter(f)
10.34 +w = TermWriter(f)
10.35 w.begin(0, 0)
10.36 for doc_positions in all_doc_positions:
10.37 - w.reset()
10.38 - for docnum, positions in doc_positions:
10.39 - w.write_positions(docnum, positions)
10.40 + w.write_positions(doc_positions)
10.41 + w.end_record()
10.42 w.close()
10.43
10.44 f = open("testP", "rb")
10.45 -r = PositionReader(f)
10.46 +r = TermReader(f)
10.47 for doc_positions in all_doc_positions:
10.48 - r.reset()
10.49 - for docnum, positions in doc_positions:
10.50 - d, p = r.read_positions()
10.51 - print docnum == d, docnum, d
10.52 - print positions == p, positions, p
10.53 + r.begin_record()
10.54 + dp = r.read_positions()
10.55 + print doc_positions == dp, doc_positions
10.56 + print " ", dp
10.57 r.close()
10.58
10.59 all_doc_positions_seq = [
10.60 @@ -131,350 +128,56 @@
10.61 ]
10.62
10.63 f = open("testP2", "wb")
10.64 -w = PositionWriter(f)
10.65 +w = TermWriter(f)
10.66 w.begin(2, 2)
10.67 for doc_positions in all_doc_positions_seq:
10.68 - w.reset()
10.69 - for docnum, positions in doc_positions:
10.70 - w.write_positions(docnum, positions)
10.71 + w.write_positions(doc_positions)
10.72 + w.end_record()
10.73 w.close()
10.74
10.75 f = open("testP2", "rb")
10.76 -r = PositionReader(f)
10.77 +r = TermReader(f)
10.78 for doc_positions in all_doc_positions_seq:
10.79 - r.reset()
10.80 - for docnum, positions in doc_positions:
10.81 - d, p = r.read_positions()
10.82 - print docnum == d, docnum, d
10.83 - print positions == p, positions, p
10.84 -r.close()
10.85 -
10.86 -print "- Test position index files."
10.87 -
10.88 -indexed_positions = [
10.89 - [
10.90 - (1234, 0, 100),
10.91 - (2345, 700, 100),
10.92 - (3456, 1900, 50)
10.93 - ],
10.94 - [
10.95 - (4567, 2800, 20)
10.96 - ]
10.97 - ]
10.98 -
10.99 -offsets = []
10.100 -f = open("testPI", "wb")
10.101 -w = PositionIndexWriter(f)
10.102 -w.begin(0)
10.103 -for term_positions in indexed_positions:
10.104 - offset = None
10.105 - doc_frequency = 0
10.106 - w.reset()
10.107 - for docnum, pos_offset, count in term_positions:
10.108 - if offset is None:
10.109 - offset = w.tell()
10.110 - w.write_positions(docnum, pos_offset, count)
10.111 - doc_frequency += count
10.112 - offsets.append((offset, doc_frequency))
10.113 -w.close()
10.114 -
10.115 -r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb")))
10.116 -offsets.reverse()
10.117 -indexed_positions.reverse()
10.118 -for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
10.119 - r.seek(offset, doc_frequency)
10.120 - for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r):
10.121 - print docnum == dn, docnum, dn
10.122 - print pos_offset == po, pos_offset, po
10.123 - print count == c, count, c
10.124 -r.reader.close()
10.125 -
10.126 -print "- Test position dictionaries."
10.127 -
10.128 -f = open("testP", "wb")
10.129 -w = PositionWriter(f)
10.130 -f2 = open("testPI", "wb")
10.131 -w2 = PositionIndexWriter(f2)
10.132 -wd = PositionDictionaryWriter(w, w2, 2)
10.133 -offsets = []
10.134 -for doc_positions in all_doc_positions:
10.135 - offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
10.136 - offsets.append((offset, doc_frequency))
10.137 -wd.close()
10.138 -
10.139 -r = PositionReader(open("testP", "rb"))
10.140 -r2 = PositionIndexReader(open("testPI", "rb"))
10.141 -rd = PositionDictionaryReader(r, r2)
10.142 -offsets.reverse()
10.143 -all_doc_positions.reverse()
10.144 -for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
10.145 - it = rd.read_term_positions(offset, doc_frequency)
10.146 - dp = list(it)
10.147 - print doc_positions == dp, doc_positions, dp
10.148 -rd.close()
10.149 -
10.150 -print "- Test fields."
10.151 -
10.152 -doc_fields = [
10.153 - (123, ["testing", "fields", "stored", "compressed"]),
10.154 - (456, ["fields", "for a second", "document"]),
10.155 - (789, ["field value"]),
10.156 - (1234, []),
10.157 - (2345, ["abc", "def"]),
10.158 - (3456, ["apple", "banana", "cherry"]),
10.159 - (4567, ["drue", "eple"])
10.160 - ]
10.161 -
10.162 -f = open("testF", "wb")
10.163 -w = FieldWriter(f)
10.164 -w.begin(0)
10.165 -w.reset()
10.166 -for docnum, fields in doc_fields:
10.167 - w.write_fields(docnum, list(enumerate(fields)))
10.168 -w.close()
10.169 -
10.170 -f = open("testF", "rb")
10.171 -r = FieldReader(f)
10.172 -r.reset()
10.173 -for docnum, fields in doc_fields:
10.174 - dn, df = r.read_fields()
10.175 - print docnum == dn, docnum, dn
10.176 - print list(enumerate(fields)) == df, list(enumerate(fields)), df
10.177 -r.close()
10.178 -
10.179 -print "- Test field index files."
10.180 -
10.181 -indexed_docs = [
10.182 - (123, 100000987),
10.183 - (456, 100004321),
10.184 - (789, 100008765)
10.185 - ]
10.186 -
10.187 -f = open("testFI", "wb")
10.188 -w = FieldIndexWriter(f)
10.189 -w.begin(0)
10.190 -w.reset()
10.191 -for docnum, offset in indexed_docs:
10.192 - w.write_document(docnum, offset)
10.193 -w.close()
10.194 -
10.195 -f = open("testFI", "rb")
10.196 -r = FieldIndexReader(f)
10.197 -r.reset()
10.198 -for docnum, offset in indexed_docs:
10.199 - dn, o = r.read_document()
10.200 - print docnum == dn, docnum, dn
10.201 - print offset == o, offset, o
10.202 + r.begin_record()
10.203 + dp = r.read_positions()
10.204 + print doc_positions == dp, doc_positions
10.205 + print " ", dp
10.206 r.close()
10.207
10.208 -print "- Test field dictionaries."
10.209 -
10.210 -f = open("testF", "wb")
10.211 -w = FieldWriter(f)
10.212 -f2 = open("testFI", "wb")
10.213 -w2 = FieldIndexWriter(f2)
10.214 -wd = FieldDictionaryWriter(w, w2, 3)
10.215 -for docnum, fields in doc_fields:
10.216 - wd.write_fields(docnum, list(enumerate(fields)))
10.217 -wd.close()
10.218 -
10.219 -f = open("testF", "rb")
10.220 -r = FieldReader(f)
10.221 -f2 = open("testFI", "rb")
10.222 -r2 = FieldIndexReader(f2)
10.223 -rd = FieldDictionaryReader(r, r2)
10.224 -doc_fields_reversed = doc_fields[:]
10.225 -doc_fields_reversed.reverse()
10.226 -for docnum, fields in doc_fields_reversed:
10.227 - df = dict(rd.get_fields(docnum))
10.228 - print dict(enumerate(fields)) == df, dict(enumerate(fields)), df
10.229 -for docnum in (13579, 246810):
10.230 - df = rd.get_fields(docnum)
10.231 - print df is None, df
10.232 -
10.233 -print "- (Test sequential access.)"
10.234 -
10.235 -rd.rewind()
10.236 -for docnum, fields in doc_fields:
10.237 - dn, df = rd.read_fields()
10.238 - print docnum == dn, docnum, dn
10.239 - print list(enumerate(fields)) == df, list(enumerate(fields)), df
10.240 -rd.close()
10.241 -
10.242 -print "- Test terms."
10.243 -
10.244 -terms = [
10.245 - # term offset frequency doc_frequency
10.246 - ("aardvark", 100000123, 1, 1),
10.247 - ("anteater", 100000456, 2, 1),
10.248 - ("badger", 100000789, 13, 7),
10.249 - ("bull", 1000001234, 59, 17),
10.250 - ("bulldog", 1000002345, 99, 80),
10.251 - ("cat", 1000003456, 89, 28)
10.252 - ]
10.253 -
10.254 -f = open("test", "wb")
10.255 -w = TermWriter(f)
10.256 -w.reset()
10.257 -for term, offset, frequency, doc_frequency in terms:
10.258 - w.write_term(term, offset, frequency, doc_frequency)
10.259 -w.close()
10.260 -
10.261 -f = open("test", "rb")
10.262 -r = TermReader(f)
10.263 -r.reset()
10.264 -for term, offset, frequency, doc_frequency in terms:
10.265 - t, o, fr, df = r.read_term()
10.266 - print term == t, term, t
10.267 - print offset == o, offset, o
10.268 - print frequency == fr, frequency, fr
10.269 - print doc_frequency == df, doc_frequency, df
10.270 -r.close()
10.271 -
10.272 -print "- Test terms in index files."
10.273 -
10.274 -indexed_terms = [
10.275 - # term offset frequency doc_frequency info_offset
10.276 - ("aardvark", 100000123, 1, 1, 200000321),
10.277 - ("anteater", 100000456, 2, 1, 200000654),
10.278 - ("badger", 100000789, 13, 7, 200000987),
10.279 - ("bull", 1000001234, 59, 17, 200004321),
10.280 - ("bulldog", 1000002345, 99, 80, 200005432),
10.281 - ("cat", 1000003456, 89, 28, 200006543)
10.282 - ]
10.283 -
10.284 -f = open("test", "wb")
10.285 -w = TermIndexWriter(f)
10.286 -w.reset()
10.287 -for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
10.288 - w.write_term(term, offset, frequency, doc_frequency, info_offset)
10.289 -w.close()
10.290 -
10.291 -f = open("test", "rb")
10.292 -r = TermIndexReader(f)
10.293 -r.reset()
10.294 -for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
10.295 - t, o, fr, df, i = r.read_term()
10.296 - print term == t, term, t
10.297 - print offset == o, offset, o
10.298 - print frequency == fr, frequency, fr
10.299 - print doc_frequency == df, doc_frequency, df
10.300 - print info_offset == i, info_offset, i
10.301 -r.close()
10.302 -
10.303 -print "- Test dictionaries with only term data."
10.304 -
10.305 -f = open("test", "wb")
10.306 -w = TermWriter(f)
10.307 -f2 = open("testI", "wb")
10.308 -w2 = TermIndexWriter(f2)
10.309 -f3 = open("testP", "wb")
10.310 -w3 = PositionWriter(f3)
10.311 -f4 = open("testPI", "wb")
10.312 -w4 = PositionIndexWriter(f4)
10.313 -wp = PositionDictionaryWriter(w3, w4, 2)
10.314 -wd = TermDictionaryWriter(w, w2, wp, 3)
10.315 -for term, offset, frequency, doc_frequency in terms:
10.316 - wd._write_term(term, offset, frequency, doc_frequency)
10.317 -wd.close()
10.318 -
10.319 -f = open("test", "rb")
10.320 -r = TermReader(f)
10.321 -f2 = open("testI", "rb")
10.322 -r2 = TermIndexReader(f2)
10.323 -r3 = PositionReader(open("testP", "rb"))
10.324 -r4 = PositionIndexReader(open("testPI", "rb"))
10.325 -rp = PositionDictionaryReader(r3, r4)
10.326 -rd = TermDictionaryReader(r, r2, rp)
10.327 -terms_reversed = terms[:]
10.328 -terms_reversed.reverse()
10.329 -for term, offset, frequency, doc_frequency in terms_reversed:
10.330 - o, fr, df = rd._find_term(term)
10.331 - print offset == o, offset, o
10.332 - print frequency == fr, frequency, fr
10.333 - print doc_frequency == df, doc_frequency, df
10.334 -for term in ("dog", "dingo"):
10.335 - t = rd._find_term(term)
10.336 - print t is None, t
10.337 -
10.338 -print "- (Test term prefix searching.)"
10.339 -
10.340 -print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]
10.341 -print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]
10.342 -print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]
10.343 -print rd.find_terms("d") == [], rd.find_terms("d"), []
10.344 -rd.close()
10.345 -
10.346 print "- Test dictionaries with term and position data."
10.347
10.348 terms_with_positions = [
10.349 ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
10.350 ("anteater", [(1, [43, 44])]),
10.351 ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
10.352 + (u"bjørn", [(11, [19, 54])]),
10.353 ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
10.354 ("bulldog", [(43, [17, 19, 256, 512])]),
10.355 - ("cat", [(123, [12, 145, 196]), (1200, [113])])
10.356 - ]
10.357 -
10.358 -position_dict_tests = [
10.359 - ("badger", 19, [55, 1333]),
10.360 - ("badger", 20, None),
10.361 - ("bull", 6, [128]),
10.362 - ("bull", 26, [1, 3, 5, 7, 9]),
10.363 - ("cat", 111, None),
10.364 - ("cat", 123, [12, 145, 196]),
10.365 - ("cat", 1234, None)
10.366 + ("cat", [(123, [12, 145, 196]), (1200, [113])]),
10.367 + (u"å", [(15, [384])]),
10.368 ]
10.369
10.370 f = open("test", "wb")
10.371 w = TermWriter(f)
10.372 -f2 = open("testI", "wb")
10.373 -w2 = TermIndexWriter(f2)
10.374 -f3 = open("testP", "wb")
10.375 -w3 = PositionWriter(f3)
10.376 -f4 = open("testPI", "wb")
10.377 -w4 = PositionIndexWriter(f4)
10.378 -wp = PositionDictionaryWriter(w3, w4, 2)
10.379 -wd = TermDictionaryWriter(w, w2, wp, 3)
10.380 -for term, doc_positions in terms_with_positions:
10.381 - wd.write_term_positions(term, doc_positions)
10.382 -wd.close()
10.383 +w.begin(0, 0)
10.384 +w.write_terms(terms_with_positions)
10.385 +w.close()
10.386
10.387 f = open("test", "rb")
10.388 -r = TermReader(f)
10.389 -f2 = open("testI", "rb")
10.390 -r2 = TermIndexReader(f2)
10.391 -r3 = PositionReader(open("testP", "rb"))
10.392 -r4 = PositionIndexReader(open("testPI", "rb"))
10.393 -rp = PositionDictionaryReader(r3, r4)
10.394 -rd = TermDictionaryReader(r, r2, rp)
10.395 -terms_reversed = terms_with_positions[:]
10.396 -terms_reversed.reverse()
10.397 -for term, doc_positions in terms_reversed:
10.398 - dp = list(rd.find_positions(term))
10.399 - print doc_positions == dp, doc_positions, dp
10.400 -for term in ("aaa", "dog", "dingo"):
10.401 - dp = rd.find_positions(term)
10.402 - print dp == [], dp
10.403 +r = TermIterator(f)
10.404 +for (term, doc_positions), (t, dp) in zip(terms_with_positions, r):
10.405 + print term == t, term, t
10.406 + print doc_positions == dp, doc_positions
10.407 + print " ", dp
10.408 +r.close()
10.409
10.410 -print "- (Test iterators.)"
10.411 -
10.412 -for term, docnum, positions in position_dict_tests:
10.413 - dp = rd.find_positions(term)
10.414 - pos = dp.from_document(docnum)
10.415 - print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
10.416 -
10.417 -print "- (Test sequential access.)"
10.418 +f = open("test", "rb")
10.419 +r = TermDataIterator(f)
10.420 +for (term, doc_positions), (t, data) in zip(terms_with_positions, r):
10.421 + print term == t, term, t, data
10.422 +r.close()
10.423
10.424 -rd.rewind()
10.425 -for term, doc_positions in terms_with_positions:
10.426 - t, fr, df, dp = rd.read_term()
10.427 - dp = list(dp)
10.428 - print term == t, term, t
10.429 - print doc_positions == dp, doc_positions, dp
10.430 -rd.close()
10.431 -
10.432 -print "- Test high-level index operations (including merging)."
10.433 +print "- Test high-level index operations."
10.434
10.435 docs = [
10.436 (1, "The cat sat on the mat"),
10.437 @@ -485,189 +188,26 @@
10.438 (36, "She sells sea shells on the sea shore")
10.439 ]
10.440
10.441 -doc_tests = [
10.442 - ("Every", 2, [(2, [0]), (14, [0])]),
10.443 - ("good", 2, [(2, [1]), (13, [1])]),
10.444 - ("deserves", 2, [(2, [3]), (13, [3])]),
10.445 - ("sea", 2, [(36, [2, 6])])
10.446 - ]
10.447 -
10.448 -position_tests = [
10.449 - ("Every", 14, [0]),
10.450 - ("sea", 36, [2, 6]),
10.451 - ("shells", 1, None),
10.452 - ("shells", 37, None)
10.453 - ]
10.454 -
10.455 -phrase_tests = [
10.456 - (["good", "boy"], [(2, [1, 2])]),
10.457 - (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
10.458 - (["sea", "shore"], [(36, [6, 7])])
10.459 - ]
10.460 -
10.461 -index = Index("test_index", 3, 2, 3, 6)
10.462 +index = Index("test_index", 3)
10.463 wi = index.get_writer()
10.464 for docnum, text in docs:
10.465 doc = Document(docnum)
10.466 for position, term in enumerate(text.split()):
10.467 doc.add_position(term, position)
10.468 - doc.add_field(123, text)
10.469 - wi.add_document(doc)
10.470 -wi.close()
10.471 -
10.472 -rd = index.get_reader()
10.473 -
10.474 -print "- (Test searching.)"
10.475 -
10.476 -for term, frequency, doc_positions in doc_tests:
10.477 - dp = list(rd.find_positions(term))
10.478 - print doc_positions == dp, doc_positions, dp
10.479 - fr = rd.get_frequency(term)
10.480 - print frequency == fr, frequency, fr
10.481 -
10.482 -print "- (Test fields.)"
10.483 -
10.484 -for docnum, text in docs:
10.485 - df = dict(rd.get_fields(docnum))
10.486 - print df[123] == text, text, df[123]
10.487 -
10.488 -print "- (Test navigation.)"
10.489 -
10.490 -for term, docnum, positions in position_tests:
10.491 - dp = rd.find_positions(term)
10.492 - pos = dp.from_document(docnum)
10.493 - print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
10.494 -
10.495 -print "- (Test phrases.)"
10.496 -
10.497 -for terms, results in phrase_tests:
10.498 - res = list(rd.find_common_positions(terms))
10.499 - print results == res, results, res
10.500 -
10.501 -index.close()
10.502 -
10.503 -docs2 = [
10.504 - ((1, 0), "The cat sat on the mat"),
10.505 - ((1, 2), "Every good boy deserves football"),
10.506 - ((13, 1), "One good turn deserves another"),
10.507 - ((14, 0), "Every man for himself"),
10.508 - ((14, 25), "Red sky at night shepherd's delight"),
10.509 - ((36, 12), "She sells sea shells on the sea shore")
10.510 - ]
10.511 -
10.512 -doc_tests2 = [
10.513 - ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),
10.514 - ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),
10.515 - ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),
10.516 - ("sea", 2, [((36, 12), [(2, 10), (6, 28)])])
10.517 - ]
10.518 -
10.519 -position_tests2 = [
10.520 - ("Every", (14, 0), [(0, 0)]),
10.521 - ("sea", (36, 12), [(2, 10), (6, 28)]),
10.522 - ("shells", (1, 0), None),
10.523 - ("shells", (37, 0), None)
10.524 - ]
10.525 -
10.526 -phrase_tests2 = [
10.527 - (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),
10.528 - (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),
10.529 - (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])
10.530 - ]
10.531 -
10.532 -index = Index("test_indexT", 3, 2, 3, 6)
10.533 -wi = index.get_writer()
10.534 -for docnum, text in docs2:
10.535 - doc = Document(docnum)
10.536 - offset = 0
10.537 - for position, term in enumerate(text.split()):
10.538 - doc.add_position(term, (position, offset))
10.539 - offset += len(term) + 1 # assume one space after the term
10.540 - doc.add_field(123, text)
10.541 wi.add_document(doc)
10.542 wi.close()
10.543
10.544 -rd = index.get_reader()
10.545 -
10.546 -print "- (Test searching.)"
10.547 -
10.548 -for term, frequency, doc_positions in doc_tests2:
10.549 - dp = list(rd.find_positions(term))
10.550 - print doc_positions == dp, doc_positions, dp
10.551 - fr = rd.get_frequency(term)
10.552 - print frequency == fr, frequency, fr
10.553 -
10.554 -print "- (Test fields.)"
10.555 +print "- Test merge."
10.556
10.557 -for docnum, text in docs2:
10.558 - df = dict(rd.get_fields(docnum))
10.559 - print df[123] == text, text, df[123]
10.560 -
10.561 -print "- (Test navigation.)"
10.562 +l1 = list(index.get_reader())
10.563 +index.merge()
10.564 +l2 = list(index.get_reader(1))
10.565
10.566 -for term, docnum, positions in position_tests2:
10.567 - dp = rd.find_positions(term)
10.568 - pos = dp.from_document(docnum)
10.569 - print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
10.570 -
10.571 -print "- (Test phrases.)"
10.572 -
10.573 -for terms, results in phrase_tests2:
10.574 - res = list(rd.find_common_positions(terms))
10.575 - print results == res, results, res
10.576 +for (t1, dp1), (t2, dp2) in zip(l1, l2):
10.577 + print t1 == t2, t1, t2
10.578 + print dp1 == dp2, dp1
10.579 + print " ", dp2
10.580
10.581 index.close()
10.582
10.583 -print "- Test index updates."
10.584 -
10.585 -index = Index("test_index")
10.586 -index2 = Index("test_index2", 3, 2, 3, 6)
10.587 -wi = index2.get_writer()
10.588 -for docnum, text in docs:
10.589 -
10.590 - # Add the same documents but with different numbers.
10.591 -
10.592 - doc = Document(docnum + 100)
10.593 - for position, term in enumerate(text.split()):
10.594 - doc.add_position(term, position)
10.595 - doc.add_field(123, text)
10.596 - wi.add_document(doc)
10.597 -wi.close()
10.598 -
10.599 -index2.update([index])
10.600 -index.close()
10.601 -
10.602 -rd = index2.get_reader()
10.603 -for term, frequency, doc_positions in doc_tests:
10.604 -
10.605 - # Add the extra documents to the expected result.
10.606 -
10.607 - orig_doc_positions = doc_positions
10.608 - doc_positions = doc_positions[:]
10.609 -
10.610 - for docnum, positions in orig_doc_positions:
10.611 - doc_positions.append((docnum + 100, positions))
10.612 - frequency *= 2
10.613 -
10.614 - dp = list(rd.find_positions(term))
10.615 - print doc_positions == dp, doc_positions, dp
10.616 - fr = rd.get_frequency(term)
10.617 - print frequency == fr, frequency, fr
10.618 -index2.close()
10.619 -
10.620 -print "- (Test update of an empty index.)"
10.621 -
10.622 -index = Index("test_index")
10.623 -index3 = Index("test_index3")
10.624 -index3.update([index])
10.625 -index.close()
10.626 -
10.627 -rd = index3.get_reader()
10.628 -for term, frequency, doc_positions in doc_tests:
10.629 - dp = list(rd.find_positions(term))
10.630 - print doc_positions == dp, doc_positions, dp
10.631 - fr = rd.get_frequency(term)
10.632 - print frequency == fr, frequency, fr
10.633 -index3.close()
10.634 -
10.635 # vim: tabstop=4 expandtab shiftwidth=4