1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/iixr/positions.py Tue Sep 15 00:15:11 2009 +0200
1.3 @@ -0,0 +1,525 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Specific classes for storing position information.
1.8 +
1.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 +
1.20 +You should have received a copy of the GNU General Public License along
1.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 +"""
1.23 +
1.24 +from iixr.files import *
1.25 +from iixr.data import vint
1.26 +
1.27 +class PositionWriter(FileWriter):
1.28 +
1.29 + "Writing position information to files."
1.30 +
1.31 + def reset(self):
1.32 + self.last_docnum = 0
1.33 +
1.34 + def write_positions(self, docnum, positions):
1.35 +
1.36 + """
1.37 + Write for the document 'docnum' the given 'positions'.
1.38 + Return the offset of the written record.
1.39 + """
1.40 +
1.41 + if docnum < self.last_docnum:
1.42 + raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
1.43 +
1.44 + # Record the offset of this record.
1.45 +
1.46 + offset = self.tell()
1.47 +
1.48 + # Make sure that the positions are sorted.
1.49 +
1.50 + positions.sort()
1.51 +
1.52 + # Write the position deltas.
1.53 +
1.54 + output = []
1.55 + last = 0
1.56 +
1.57 + for position in positions:
1.58 + output.append(vint(position - last))
1.59 + last = position
1.60 +
1.61 + # Write the document number delta.
1.62 + # Write the number of positions.
1.63 + # Then write the positions.
1.64 +
1.65 + self.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output))
1.66 +
1.67 + self.last_docnum = docnum
1.68 + return offset
1.69 +
1.70 +class PositionOpener(FileOpener):
1.71 +
1.72 + "Reading position information from files."
1.73 +
1.74 + def read_term_positions(self, offset, count):
1.75 +
1.76 + """
1.77 + Read all positions from 'offset', seeking to that position in the file
1.78 + before reading. The number of documents available for reading is limited
1.79 + to 'count'.
1.80 + """
1.81 +
1.82 + # Duplicate the file handle.
1.83 +
1.84 + f = self.open("rb")
1.85 + return PositionIterator(f, offset, count)
1.86 +
1.87 +class PositionIndexWriter(FileWriter):
1.88 +
1.89 + "Writing position index information to files."
1.90 +
1.91 + def reset(self):
1.92 + self.last_docnum = 0
1.93 + self.last_pos_offset = 0
1.94 +
1.95 + def write_positions(self, docnum, pos_offset, count):
1.96 +
1.97 + """
1.98 + Write the given 'docnum, 'pos_offset' and document 'count' to the
1.99 + position index file.
1.100 + """
1.101 +
1.102 + # Record the offset of this record.
1.103 +
1.104 + offset = self.tell()
1.105 + output = []
1.106 +
1.107 + # Write the document number delta.
1.108 +
1.109 + output.append(vint(docnum - self.last_docnum))
1.110 + self.last_docnum = docnum
1.111 +
1.112 + # Write the position file offset delta.
1.113 +
1.114 + output.append(vint(pos_offset - self.last_pos_offset))
1.115 + self.last_pos_offset = pos_offset
1.116 +
1.117 + # Write the document count.
1.118 +
1.119 + output.append(vint(count))
1.120 +
1.121 + # Actually write the data.
1.122 +
1.123 + self.write("".join(output))
1.124 +
1.125 + return offset
1.126 +
1.127 +class PositionIndexOpener(FileOpener):
1.128 +
1.129 + "Reading position index information from files."
1.130 +
1.131 + def read_term_positions(self, offset, doc_frequency):
1.132 +
1.133 + """
1.134 + Read all positions from 'offset', seeking to that position in the file
1.135 + before reading. The number of documents available for reading is limited
1.136 + to 'doc_frequency'.
1.137 + """
1.138 +
1.139 + # Duplicate the file handle.
1.140 +
1.141 + f = self.open("rb")
1.142 + return PositionIndexIterator(f, offset, doc_frequency)
1.143 +
1.144 +# Iterators for position-related files.
1.145 +
1.146 +class IteratorBase:
1.147 +
1.148 + def __init__(self, count):
1.149 + self.replenish(count)
1.150 +
1.151 + def replenish(self, count):
1.152 + self.count = count
1.153 + self.read_documents = 0
1.154 +
1.155 + def __len__(self):
1.156 + return self.count
1.157 +
1.158 + def sort(self):
1.159 + pass # Stored document positions are already sorted.
1.160 +
1.161 + def __iter__(self):
1.162 + return self
1.163 +
1.164 +class PositionIterator(FileReader, IteratorBase):
1.165 +
1.166 + "Iterating over document positions."
1.167 +
1.168 + def __init__(self, f, offset, count):
1.169 + FileReader.__init__(self, f)
1.170 + IteratorBase.__init__(self, count)
1.171 + self.seek(offset)
1.172 +
1.173 + def reset(self):
1.174 + self.last_docnum = 0
1.175 +
1.176 + def read_positions(self):
1.177 +
1.178 + "Read positions, returning a document number and a list of positions."
1.179 +
1.180 + # Read the document number delta and add it to the last number.
1.181 +
1.182 + self.last_docnum += self.read_number()
1.183 +
1.184 + # Read the number of positions.
1.185 +
1.186 + npositions = self.read_number()
1.187 +
1.188 + # Read the position deltas, adding each previous position to get the
1.189 + # appropriate collection of absolute positions.
1.190 +
1.191 + i = 0
1.192 + last = 0
1.193 + positions = []
1.194 +
1.195 + while i < npositions:
1.196 + last += self.read_number()
1.197 + positions.append(last)
1.198 + i += 1
1.199 +
1.200 + return self.last_docnum, positions
1.201 +
1.202 + def next(self):
1.203 +
1.204 + "Read positions for a single document."
1.205 +
1.206 + if self.read_documents < self.count:
1.207 + self.read_documents += 1
1.208 + return self.read_positions()
1.209 + else:
1.210 + raise StopIteration
1.211 +
1.212 +class PositionIndexIterator(FileReader, IteratorBase):
1.213 +
1.214 + "Iterating over document positions."
1.215 +
1.216 + def __init__(self, f, offset, count):
1.217 + FileReader.__init__(self, f)
1.218 + IteratorBase.__init__(self, count)
1.219 + self.seek(offset)
1.220 + self.section_count = 0
1.221 +
1.222 + def reset(self):
1.223 + self.last_docnum = 0
1.224 + self.last_pos_offset = 0
1.225 +
1.226 + def read_positions(self):
1.227 +
1.228 + """
1.229 + Read a document number, a position file offset for the position index
1.230 + file, and the number of documents in a section of that file.
1.231 + """
1.232 +
1.233 + # Read the document number delta.
1.234 +
1.235 + self.last_docnum += self.read_number()
1.236 +
1.237 + # Read the offset delta.
1.238 +
1.239 + self.last_pos_offset += self.read_number()
1.240 +
1.241 + # Read the document count.
1.242 +
1.243 + count = self.read_number()
1.244 +
1.245 + return self.last_docnum, self.last_pos_offset, count
1.246 +
1.247 + def next(self):
1.248 +
1.249 + "Read positions for a single document."
1.250 +
1.251 + self.read_documents += self.section_count
1.252 + if self.read_documents < self.count:
1.253 + docnum, pos_offset, self.section_count = t = self.read_positions()
1.254 + return t
1.255 + else:
1.256 + raise StopIteration
1.257 +
1.258 +class PositionDictionaryWriter:
1.259 +
1.260 + "Writing position dictionaries."
1.261 +
1.262 + def __init__(self, position_writer, position_index_writer, interval):
1.263 + self.position_writer = position_writer
1.264 + self.position_index_writer = position_index_writer
1.265 + self.interval = interval
1.266 +
1.267 + def write_term_positions(self, doc_positions):
1.268 +
1.269 + """
1.270 + Write all 'doc_positions' - a collection of tuples of the form (document
1.271 + number, position list) - to the file.
1.272 +
1.273 + Add some records to the index, making dictionary entries.
1.274 +
1.275 + Return a tuple containing the offset of the written data, the frequency
1.276 + (number of positions), and document frequency (number of documents) for
1.277 + the term involved.
1.278 + """
1.279 +
1.280 + # Reset the writers.
1.281 +
1.282 + self.position_writer.reset()
1.283 + self.position_index_writer.reset()
1.284 +
1.285 + index_offset = None
1.286 +
1.287 + # Write the positions.
1.288 +
1.289 + frequency = 0
1.290 + first_docnum = None
1.291 + first_offset = None
1.292 + count = 0
1.293 +
1.294 + doc_positions.sort()
1.295 +
1.296 + for docnum, positions in doc_positions:
1.297 + pos_offset = self.position_writer.write_positions(docnum, positions)
1.298 +
1.299 + # Retain the first record offset for a subsequent index entry.
1.300 +
1.301 + if first_offset is None:
1.302 + first_offset = pos_offset
1.303 + first_docnum = docnum
1.304 +
1.305 + frequency += len(positions)
1.306 + count += 1
1.307 +
1.308 + # Every {interval} entries, write an index entry.
1.309 +
1.310 + if count % self.interval == 0:
1.311 + io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
1.312 +
1.313 + # Remember the first index entry offset.
1.314 +
1.315 + if index_offset is None:
1.316 + index_offset = io
1.317 +
1.318 + first_offset = None
1.319 + first_docnum = None
1.320 +
1.321 + # Reset the position writer so that position readers accessing
1.322 + # a section start with the correct document number.
1.323 +
1.324 + self.position_writer.reset()
1.325 +
1.326 + # Finish writing an index entry for the remaining documents.
1.327 +
1.328 + else:
1.329 + if first_offset is not None:
1.330 + io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
1.331 +
1.332 + # Remember the first index entry offset.
1.333 +
1.334 + if index_offset is None:
1.335 + index_offset = io
1.336 +
1.337 + return index_offset, frequency, count
1.338 +
1.339 + def close(self):
1.340 + self.position_writer.close()
1.341 + self.position_index_writer.close()
1.342 +
1.343 +class PositionDictionaryReader:
1.344 +
1.345 + "Reading position dictionaries."
1.346 +
1.347 + def __init__(self, position_opener, position_index_opener):
1.348 + self.position_opener = position_opener
1.349 + self.position_index_opener = position_index_opener
1.350 +
1.351 + def read_term_positions(self, offset, doc_frequency):
1.352 +
1.353 + """
1.354 + Return an iterator for dictionary entries starting at 'offset' with the
1.355 + given 'doc_frequency'.
1.356 + """
1.357 +
1.358 + return PositionDictionaryIterator(self.position_opener,
1.359 + self.position_index_opener, offset, doc_frequency)
1.360 +
1.361 + def close(self):
1.362 + pass
1.363 +
1.364 +class PositionDictionaryIterator:
1.365 +
1.366 + "Iteration over position dictionary entries."
1.367 +
1.368 + def __init__(self, position_opener, position_index_opener, offset, doc_frequency):
1.369 + self.position_opener = position_opener
1.370 + self.doc_frequency = doc_frequency
1.371 + self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
1.372 + self.iterator = None
1.373 +
1.374 + # Remember the last values.
1.375 +
1.376 + self.found_docnum, self.found_positions = None, None
1.377 +
1.378 + # Maintain state for the next index entry, if read.
1.379 +
1.380 + self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.381 +
1.382 + # Initialise the current index entry and current position file iterator.
1.383 +
1.384 + self._next_section()
1.385 + self._init_section()
1.386 +
1.387 + # Sequence methods.
1.388 +
1.389 + def __len__(self):
1.390 + return self.doc_frequency
1.391 +
1.392 + def sort(self):
1.393 + pass
1.394 +
1.395 + # Iterator methods.
1.396 +
1.397 + def __iter__(self):
1.398 + return self
1.399 +
1.400 + def next(self):
1.401 +
1.402 + """
1.403 + Attempt to get the next document record from the section in the
1.404 + positions file.
1.405 + """
1.406 +
1.407 + # Return any visited but unrequested record.
1.408 +
1.409 + if self.found_docnum is not None:
1.410 + t = self.found_docnum, self.found_positions
1.411 + self.found_docnum, self.found_positions = None, None
1.412 + return t
1.413 +
1.414 + # Or search for the next record.
1.415 +
1.416 + while 1:
1.417 +
1.418 + # Either return the next record.
1.419 +
1.420 + try:
1.421 + return self.iterator.next()
1.422 +
1.423 + # Or, where a section is finished, get the next section and try again.
1.424 +
1.425 + except StopIteration:
1.426 +
1.427 + # Where a section follows, update the index iterator, but keep
1.428 + # reading using the same file iterator (since the data should
1.429 + # just follow on from the last section).
1.430 +
1.431 + self._next_section()
1.432 + self.iterator.replenish(self.section_count)
1.433 +
1.434 + # Reset the state of the iterator to make sure that document
1.435 + # numbers are correct.
1.436 +
1.437 + self.iterator.reset()
1.438 +
1.439 + def from_document(self, docnum):
1.440 +
1.441 + """
1.442 + Attempt to navigate to a positions entry for the given 'docnum',
1.443 + returning the positions for 'docnum', or None otherwise.
1.444 + """
1.445 +
1.446 + # Return any unrequested document positions.
1.447 +
1.448 + if docnum == self.found_docnum:
1.449 + return self.found_positions
1.450 +
1.451 + # Read ahead in the index until the next entry refers to a document
1.452 + # later than the desired document.
1.453 +
1.454 + try:
1.455 + if self.next_docnum is None:
1.456 + self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.457 +
1.458 + # Read until the next entry is after the desired document number,
1.459 + # or until the end of the results.
1.460 +
1.461 + while self.next_docnum <= docnum:
1.462 + self._next_read_section()
1.463 + if self.docnum < docnum:
1.464 + self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.465 + else:
1.466 + break
1.467 +
1.468 + except StopIteration:
1.469 + pass
1.470 +
1.471 + # Navigate in the position file to the document.
1.472 +
1.473 + self._init_section()
1.474 +
1.475 + try:
1.476 + while 1:
1.477 + found_docnum, found_positions = self.iterator.next()
1.478 +
1.479 + # Return the desired document positions or None (retaining the
1.480 + # positions for the document immediately after).
1.481 +
1.482 + if docnum == found_docnum:
1.483 + return found_positions
1.484 + elif docnum < found_docnum:
1.485 + self.found_docnum, self.found_positions = found_docnum, found_positions
1.486 + return None
1.487 +
1.488 + except StopIteration:
1.489 + return None
1.490 +
1.491 + # Internal methods.
1.492 +
1.493 + def _next_section(self):
1.494 +
1.495 + "Attempt to get the next section in the index."
1.496 +
1.497 + if self.next_docnum is None:
1.498 + self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
1.499 + else:
1.500 + self._next_read_section()
1.501 +
1.502 + def _next_read_section(self):
1.503 +
1.504 + """
1.505 + Make the next index entry the current one without reading from the
1.506 + index.
1.507 + """
1.508 +
1.509 + self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
1.510 + self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.511 +
1.512 + def _init_section(self):
1.513 +
1.514 + "Initialise the iterator for the section in the position file."
1.515 +
1.516 + if self.iterator is not None:
1.517 + self.iterator.close()
1.518 + self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
1.519 +
1.520 + def close(self):
1.521 + if self.iterator is not None:
1.522 + self.iterator.close()
1.523 + self.iterator = None
1.524 + if self.index_iterator is not None:
1.525 + self.index_iterator.close()
1.526 + self.index_iterator = None
1.527 +
1.528 +# vim: tabstop=4 expandtab shiftwidth=4