# HG changeset patch
# User Paul Boddie
# Date 1297561795 -3600
# Node ID 6542c54d115b72f837b31d24a08ba25c3bdd21f2
# Parent  b75bd39cf61f4fdec7d3b85b66f50f443e194adb
Removed numerous classes, simplifying the package and focusing on combined
term and position files which can be merged using fewer processing operations.

diff -r b75bd39cf61f -r 6542c54d115b iixr/data.py
--- a/iixr/data.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/iixr/data.py	Sun Feb 13 02:49:55 2011 +0100
@@ -23,7 +23,7 @@
 
 # High-level representations.
 
-def convert_sequence(values, op):
+def convert_sequence(values, op, last_from_old):
     if values:
         new_values = list(values)
         last = new_values[0]
@@ -31,10 +31,22 @@
         length = len(new_values)
         while i < length:
             current = new_values[i]
-            new_values[i] = op(new_values[i], last)
-            last = current
+            new_values[i] = op(current, last)
+
+            # Subtracting entries requires the old value to be used.
+            # Adding entries requires the new value.
+
+            if last_from_old:
+                last = current
+            else:
+                last = new_values[i]
+
             i += 1
 
+        return new_values
+    else:
+        return values
+
 def op_seq_monotonic(x, y, op):
     return tuple([op(a, b) for a, b in zip(x, y)])
 
@@ -44,15 +56,6 @@
 def sub_seq_monotonic(x, y):
     return op_seq_monotonic(x, y, operator.sub)
 
-def op_first_monotonic(x, y, op):
-    return (op(x[0], y[0]),) + tuple(zip(x[1:], y[1:]))
-
-def add_first_monotonic(x, y):
-    return op_first_monotonic(x, y, operator.add)
-
-def sub_first_monotonic(x, y):
-    return op_first_monotonic(x, y, operator.sub)
-
 def add_seq(x, y):
     length = min(len(x), len(y))
     seq = list(x)[:length]
@@ -84,17 +87,17 @@
 def sizeof(value):
     return is_sequence(value) and len(value) or 0
 
-def get_monotonic_adder(value):
-    return is_sequence(value) and add_seq_monotonic or operator.add
+def get_monotonic_adder(size):
+    return size and add_seq_monotonic or operator.add
 
-def get_monotonic_subtractor(value):
-    return is_sequence(value) and sub_seq_monotonic or operator.sub
+def get_monotonic_subtractor(size):
+    return size and sub_seq_monotonic or operator.sub
 
-def get_adder(value):
-    return is_sequence(value) and add_seq or operator.add
+def get_adder(size):
+    return size and add_seq or operator.add
 
-def get_subtractor(value):
-    return is_sequence(value) and sub_seq or operator.sub
+def get_subtractor(size):
+    return size and sub_seq or operator.sub
 
 # Low-level representations.
 # Variable-length integer functions.
@@ -177,15 +180,6 @@
             break
     return number, start
 
-# String serialisation.
-
-def string_to_array(s, bytes):
-
-    "Write the given string 's' to 'bytes'."
-
-    vint_to_array(len(s), bytes)
-    bytes.fromstring(s.encode("utf-8"))
-
# Sequence serialisation.
 
 def sequence_to_array(value, size, bytes):
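A note on the reworked convert_sequence above: it now returns the converted
list and distinguishes where the running value comes from. When subtracting
(delta encoding), each delta must be computed against the old input value;
when adding (decoding), against the freshly reconstructed one. A minimal
restatement for illustration, with made-up example values:

    import operator

    def convert_sequence(values, op, last_from_old):
        # op combines each entry with its predecessor; the predecessor is
        # taken from the input (encoding) or from the output (decoding).
        if not values:
            return values
        new_values = list(values)
        last = new_values[0]
        for i in range(1, len(new_values)):
            current = new_values[i]
            new_values[i] = op(current, last)
            last = current if last_from_old else new_values[i]
        return new_values

    docnums = [10, 12, 19, 19, 40]
    deltas = convert_sequence(docnums, operator.sub, 1)   # [10, 2, 7, 0, 21]
    assert convert_sequence(deltas, operator.add, 0) == docnums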
diff -r b75bd39cf61f -r 6542c54d115b iixr/fields.py
--- a/iixr/fields.py	Sat Feb 12 01:23:58 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,345 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Specific classes for storing document information.
-
-Copyright (C) 2009, 2010, 2011 Paul Boddie
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; either version 3 of the License, or (at your option) any later
-version.
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program. If not, see <http://www.gnu.org/licenses/>.
-"""
-
-from iixr.data import *
-from iixr.files import *
-from bisect import bisect_right # to find terms in the dictionary index
-
-DOCUMENT_CACHE_LIMIT = 10000
-
-class FieldWriter(FileWriter):
-
-    "Writing field data to files."
-
-    def begin(self, docnum_size):
-        self.write_number(docnum_size)
-        self.end_record()
-        self.docnum_size = docnum_size
-        self.data_start = self.tell()
-
-    def reset(self):
-        self.end_record()
-        self.last_docnum = None
-        self.subtractor = None
-
-    def write_fields(self, docnum, fields):
-
-        """
-        Write for the given 'docnum', a list of 'fields' (integer, string pairs
-        representing field identifiers and values respectively).
-        """
-
-        # Find the size of document number values.
-
-        if self.last_docnum is not None:
-            docnum_seq = self.subtractor(docnum, self.last_docnum)
-        else:
-            self.subtractor = get_subtractor(docnum)
-            docnum_seq = docnum
-
-        # Write the document number.
-
-        self.write_sequence_value(docnum_seq, self.docnum_size)
-
-        # Write the number of fields.
-
-        self.write_number(len(fields))
-
-        # Write the fields themselves.
-
-        for i, field in fields:
-            self.write_number(i)
-            self.write_string(field, 1) # compress
-
-        self.last_docnum = docnum
-
-class FieldReader(FileReader):
-
-    "Reading field data from files."
-
-    def begin(self):
-        self.begin_record()
-        try:
-            self.docnum_size = self.read_number()
-        except EOFError:
-            self.docnum_size = 0 # NOTE: No fields!
-        self.data_start = self.tell()
-
-    def reset(self):
-        self.last_docnum = None
-        self.adder = None
-        self.begin_record()
-
-    def read_fields(self):
-
-        """
-        Read fields from the file, returning a tuple containing the document
-        number and a list of field (identifier, value) pairs.
-        """
-
-        # Read the document number.
-
-        docnum = self.read_sequence_value(self.docnum_size)
-
-        if self.last_docnum is not None:
-            self.last_docnum = self.adder(docnum, self.last_docnum)
-        else:
-            self.adder = get_adder(docnum)
-            self.last_docnum = docnum
-
-        # Read the number of fields.
-
-        nfields = self.read_number()
-
-        # Collect the fields.
-
-        fields = []
-        i = 0
-
-        while i < nfields:
-            identifier = self.read_number()
-            value = self.read_string(1) # decompress
-            fields.append((identifier, value))
-            i += 1
-
-        return self.last_docnum, fields
-
-    def read_document_fields(self, docnum, offset):
-
-        """
-        Read fields for 'docnum' at the given 'offset'. This permits the
-        retrieval of details for the specified document, as well as scanning for
-        later documents.
-        """
-
-        self.seek(offset)
-        bad_docnum, fields = self.read_fields()
-        self.last_docnum = docnum
-        return docnum, fields
-
-class FieldIndexWriter(FieldWriter):
-
-    "Writing field index details to files."
-
-    def reset(self):
-        FieldWriter.reset(self)
-        self.last_offset = 0
-
-    def write_document(self, docnum, offset):
-
-        """
-        Write for the given 'docnum', the 'offset' at which the fields for the
-        document are stored in the fields file.
-        """
-
-        # Find the size of document number values.
-
-        if self.last_docnum is not None:
-            docnum_seq = self.subtractor(docnum, self.last_docnum)
-        else:
-            self.subtractor = get_subtractor(docnum)
-            docnum_seq = docnum
-
-        # Write the document number.
-
-        self.write_sequence_value(docnum_seq, self.docnum_size)
-
-        # Write the offset delta.
-
-        self.write_number(offset - self.last_offset)
-
-        self.last_docnum = docnum
-        self.last_offset = offset
-
-class FieldIndexReader(FieldReader):
-
-    "Reading field index details from files."
-
-    def reset(self):
-        FieldReader.reset(self)
-        self.last_offset = 0
-
-    def read_document(self):
-
-        "Read a document number and field file offset."
-
-        # Read the document number.
-
-        docnum = self.read_sequence_value(self.docnum_size)
-
-        if self.last_docnum is not None:
-            self.last_docnum = self.adder(docnum, self.last_docnum)
-        else:
-            self.adder = get_adder(docnum)
-            self.last_docnum = docnum
-
-        # Read the offset.
-
-        self.last_offset += self.read_number()
-
-        return self.last_docnum, self.last_offset
-
-class FieldDictionaryWriter:
-
-    "Writing field dictionary details."
-
-    def __init__(self, field_writer, field_index_writer, interval):
-        self.field_writer = field_writer
-        self.field_index_writer = field_index_writer
-        self.interval = interval
-        self.entry = 0
-
-    def write_fields(self, docnum, fields):
-
-        "Write details of the given 'docnum' and 'fields'."
-
-        if self.entry == 0:
-            docnum_size = sizeof(docnum)
-            self.field_writer.begin(docnum_size)
-            self.field_index_writer.begin(docnum_size)
-            self.field_index_writer.reset()
-
-        if self.entry % self.interval == 0:
-            self.field_writer.reset()
-            offset = self.field_writer.tell()
-            self.field_writer.write_fields(docnum, fields)
-            self.field_index_writer.write_document(docnum, offset)
-        else:
-            self.field_writer.write_fields(docnum, fields)
-
-        self.entry += 1
-
-    def close(self):
-        self.field_writer.close()
-        self.field_index_writer.close()
-
-class FieldDictionaryReader:
-
-    "Reading field dictionary details."
-
-    def __init__(self, field_reader, field_index_reader):
-        self.field_reader = field_reader
-        self.field_index_reader = field_index_reader
-
-        self.field_reader.reset()
-        self.field_index_reader.reset()
-
-        self.cache = {}
-
-        self.entry = 0
-        self.docs = []
-        try:
-            while 1:
-                self.docs.append(self.field_index_reader.read_document())
-        except EOFError:
-            pass
-
-        # Large numbers for ordering purposes.
-
-        if self.docs:
-            self.max_offset = self.docs[-1][1]
-        else:
-            self.max_offset = None
-
-    # Iterator convenience methods.
-
-    def __iter__(self):
-        self.rewind()
-        return self
-
-    def next(self):
-        try:
-            return self.read_fields()
-        except EOFError:
-            raise StopIteration
-
-    # Sequential access methods.
-
-    def rewind(self):
-        self.field_reader.rewind()
-
-    def read_fields(self):
-
-        "Return the next document number and fields."
-
-        try:
-            return self.field_reader.read_fields()
-        except EOFError:
-            self.entry += 1
-            try:
-                found_docnum, offset = self.docs[self.entry]
-            except IndexError:
-                raise EOFError
-            else:
-                self.field_reader.reset()
-                return self.field_reader.read_fields()
-
-    # Random access methods.
-
-    def get_fields(self, docnum):
-
-        "Read the fields of the document with the given 'docnum'."
-
-        if self.cache.has_key(docnum):
-            return self.cache[docnum]
-
-        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
-
-        # Get the entry position providing the term or one preceding it.
-
-        if i == -1:
-            return None
-
-        found_docnum, offset = self.docs[i]
-
-        # Read from the fields file.
-
-        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
-
-        # Scan for the document, if necessary.
-
-        try:
-            while docnum > found_docnum:
-                found_docnum, fields = self.field_reader.read_fields()
-        except EOFError:
-            pass
-
-        # If the document is found, return the fields.
-
-        if docnum == found_docnum:
-
-            # Store the fields in the cache, removing entries if the limit has
-            # been reached.
-
-            keys = self.cache.keys()
-
-            if len(keys) == DOCUMENT_CACHE_LIMIT:
-                del self.cache[keys[0]]
-
-            self.cache[docnum] = fields
-            return fields
-        else:
-            return None
-
-    def close(self):
-        self.field_reader.close()
-        self.field_index_reader.close()
-
-# vim: tabstop=4 expandtab shiftwidth=4
diff -r b75bd39cf61f -r 6542c54d115b iixr/files.py
--- a/iixr/files.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/iixr/files.py	Sun Feb 13 02:49:55 2011 +0100
@@ -22,10 +22,6 @@
 from array import array
 import zlib
 
-# Constants.
-
-CACHE_SIZE = 100000
-
 # Classes.
 
 class File:
@@ -35,14 +31,21 @@
     def __init__(self, f):
         self.f = f
         self.record = array('B') # record buffer
-        self.cache = array('B')
+        self.data_start = None
+
+    def begin(self):
+
+        """
+        Initialise file-wide parameters. In writers, this method may require
+        parameters to be specified. In readers, the parameters may be read from
+        the file.
+        """
 
-    def reset(self):
-
-        "To be used to reset the state of the reader or writer between records."
-
-        pass
+        self.data_start = 0
 
+    def tell(self):
+        # NOTE: Will not be accurate within the current record.
+        return self.f.tell()
+
     def seek(self, offset):
         self.f.seek(offset)
@@ -60,27 +63,26 @@
 
     "Writing basic data types to files."
 
-    def __init__(self, f):
-        File.__init__(self, f)
-        self.written = 0
-
-    def tell(self):
-        # NOTE: Will not be accurate within the current record.
-        return self.written
-
     def begin_record(self):
         pass
 
     def end_record(self):
         if self.record:
-            length = len(self.record)
-            before = len(self.cache)
-            vint_to_array(length, self.cache)
-            length_size = len(self.cache) - before
-            self.cache += self.record
-            self.written += length_size + length
+            self.f.write(vint(len(self.record)))
+            self.record.tofile(self.f)
             self.record = array('B')
-        self.flush_cache()
+
+    def write_remaining(self, a):
+
+        "Write remaining data from the raw array 'a'."
+
+        self.record += a
+
+    def write_byte(self, b):
+
+        "Write the given byte 'b'."
+
+        self.record.append(b)
 
     def write_number(self, number):
@@ -137,25 +139,17 @@
         self.write_sequence_value(value, size)
 
     def write_delta_sequence(self, values, size):
-        convert_sequence(values, get_subtractor(values[0]))
-        self.write_sequence_values(values, size)
+        self.write_sequence_values(
+            convert_sequence(values, get_subtractor(size), 1),
+            size)
 
     def write_monotonic_sequence(self, values, size):
-        convert_sequence(values, get_monotonic_subtractor(values[0]))
-        self.write_sequence_values(values, size)
-
-    def flush(self, force=0):
-        self.end_record()
-        self.flush_cache(force)
-
-    def flush_cache(self, force=0):
-        if self.f is not None:
-            if force or len(self.cache) > CACHE_SIZE:
-                self.cache.tofile(self.f)
-                self.cache = array('B')
+        self.write_sequence_values(
+            convert_sequence(values, get_monotonic_subtractor(size), 1),
+            size)
 
     def close(self):
-        self.flush(1)
+        self.end_record()
         File.close(self)
 
 class FileReader(File):
 
     "Reading basic data types from files."
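With the write-through cache removed, each record is now framed as a
variable-length integer giving the payload size, followed by the payload
bytes, written directly to the file. The sketch below restates that framing;
it assumes a little-endian, 7-bits-per-byte scheme consistent with the
reader's continuation-bit test (csd & 128). The real vint and
vint_from_array helpers live in iixr.data and are not shown in this patch.

    from io import BytesIO

    def vint(n):
        # Emit 7 bits per byte, setting the high bit on all but the last.
        out = bytearray()
        while n >= 128:
            out.append((n & 127) | 128)
            n >>= 7
        out.append(n)
        return bytes(out)

    def read_record(f):
        # Decode the vint length prefix, then read that many payload bytes.
        n = shift = 0
        while 1:
            byte = f.read(1)
            if not byte:
                raise EOFError
            c = ord(byte)
            n |= (c & 127) << shift
            if c < 128:
                break
            shift += 7
        return f.read(n)

    f = BytesIO()
    payload = b"example record"
    f.write(vint(len(payload)) + payload)
    f.seek(0)
    assert read_record(f) == payload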
@@ -164,58 +158,33 @@
 
     def __init__(self, f):
         File.__init__(self, f)
-        self.record_start = 0
-        self.record_end = 0
-        self.cache_start = 0
         self.begin()
 
-    def begin(self):
-
-        "Initialise file-wide parameters."
-
-        pass
-
     def begin_record(self):
         self.start = 0
+        self.record = array('B')
         try:
             size = self.read_number_from_file()
-            self.record = self.from_cache(size)
+            self.record.fromfile(self.f, size)
         except EOFError:
             pass
 
     def end_record(self):
         pass
 
-    def seek(self, offset):
-        from_cache_start = offset - self.cache_start
-        if 0 <= from_cache_start < len(self.cache):
-            self.record_start = self.record_end = from_cache_start
-        else:
-            self.f.seek(offset)
-            self.cache = array('B')
-            self.cache_start = offset
-            self.record_start = self.record_end = 0
-        self.reset()
+    def read_remaining(self):
 
-    def tell(self):
-        return self.cache_start + self.record_start + self.start
+        "Read remaining data as a raw array."
+
+        return self.record[self.start:]
 
-    def ensure_cache(self, size):
-        if size > len(self.cache) - self.record_end:
-            self.cache = self.cache[self.record_end:]
-            self.cache_start += self.record_end
-            s = self.f.read(CACHE_SIZE)
-            self.cache.fromstring(s)
-            self.record_start = 0
-            if not s:
-                raise EOFError
-        else:
-            self.record_start = self.record_end
-        self.record_end = self.record_start + size
+    def read_byte(self):
+
+        "Read a byte from the record."
 
-    def from_cache(self, size):
-        self.ensure_cache(size)
-        return self.cache[self.record_start:self.record_end]
+        b = self.record[self.start]
+        self.start += 1
+        return b
 
     def read_number_from_file(self):
 
@@ -224,13 +193,13 @@
         # Read each byte, adding it to the number.
 
         a = array('B')
-        a += self.from_cache(1)
+        a.fromfile(self.f, 1)
         csd = a[-1]
         if csd < 128:
             return csd
         else:
             while csd & 128:
-                a += self.from_cache(1)
+                a.fromfile(self.f, 1)
                 csd = a[-1]
             return vint_from_array(a)
 
@@ -292,13 +261,9 @@
         return values
 
     def read_delta_sequence(self, size):
-        values = self.read_sequences(size)
-        convert_sequence(values, get_adder(values[0]))
-        return values
+        return convert_sequence(self.read_sequences(size), get_adder(size), 0)
 
     def read_monotonic_sequence(self, size):
-        values = self.read_sequences(size)
-        convert_sequence(values, get_monotonic_adder(values[0]))
-        return values
+        return convert_sequence(self.read_sequences(size), get_monotonic_adder(size), 0)
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r b75bd39cf61f -r 6542c54d115b iixr/filesystem.py
--- a/iixr/filesystem.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/iixr/filesystem.py	Sun Feb 13 02:49:55 2011 +0100
@@ -3,7 +3,7 @@
 """
 File access.
 
-Copyright (C) 2009, 2010 Paul Boddie
+Copyright (C) 2009, 2010, 2011 Paul Boddie
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -18,9 +18,7 @@
 with this program. If not, see <http://www.gnu.org/licenses/>.
 """
 
-from iixr.fields import *
 from iixr.terms import *
-from iixr.positions import *
 from os import listdir, remove, rename # partition manipulation
 from shutil import copy # index updating
 from os.path import join
@@ -32,8 +30,7 @@
 
 # Constants.
 
-TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
-FIELD_FILENAMES = "fields", "fields_index"
+TERM_FILENAMES = "terms",
 
 # Utility functions.
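The partition helpers changed below now treat partition labels as integers
and ignore non-numeric suffixes (such as the transient "merged" label), which
makes allocating the next partition a simple maximum. Restated here for
illustration:

    from os import listdir

    def get_partitions(pathname, prefix):
        # Collect integer labels from files named "<prefix><number>".
        partitions = set()
        for filename in listdir(pathname):
            if filename.startswith(prefix):
                label = filename[len(prefix):]
                if label.isdigit():
                    partitions.add(int(label))
        return partitions

    def get_next_partition(partitions):
        # With terms-0 and terms-1 present, the next partition is 2;
        # [-1] handles the empty case, giving 0.
        return max(partitions or [-1]) + 1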
@@ -49,7 +46,9 @@
     partitions = set()
     for filename in listdir(pathname):
         if filename.startswith(prefix):
-            partitions.add(filename[prefix_length:])
+            partition = filename[prefix_length:]
+            if partition.isdigit():
+                partitions.add(int(partition))
     return partitions
 
 def get_term_partitions(pathname):
 
@@ -61,95 +60,40 @@
 
     return get_partitions(pathname, "terms-")
 
-def get_field_partitions(pathname):
+def get_next_partition(partitions):
+    return max(partitions or [-1]) + 1
+
+def get_term_writer(pathname, partition):
 
     """
-    Return a set of field partition identifiers for partitions residing at the
-    given 'pathname'.
-    """
-
-    return get_partitions(pathname, "fields-")
-
-def get_next_partition(partitions):
-    return max([int(n) for n in partitions if n.isdigit()] or [-1]) + 1
-
-def get_term_writer(pathname, partition, interval, doc_interval):
-
-    """
-    Return a term dictionary writer using files under the given 'pathname'
-    labelled according to the given 'partition', using the given indexing
-    'interval' for terms and 'doc_interval' for document position records.
+    Return a term writer using files under the given 'pathname' labelled
+    according to the given 'partition'.
     """
 
-    tdf = open(join(pathname, "terms-%s" % partition), "wb")
-    info_writer = TermWriter(tdf)
-
-    tdif = open(join(pathname, "terms_index-%s" % partition), "wb")
-    index_writer = TermIndexWriter(tdif)
-
-    tpf = open(join(pathname, "positions-%s" % partition), "wb")
-    positions_writer = PositionWriter(tpf)
-
-    tpif = open(join(pathname, "positions_index-%s" % partition), "wb")
-    positions_index_writer = PositionIndexWriter(tpif)
-
-    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)
-
-    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
+    f = open(join(pathname, "terms-%s" % partition), "wb")
+    return TermWriter(f)
 
-def get_field_writer(pathname, partition, interval):
-
-    """
-    Return a field dictionary writer using files under the given 'pathname'
-    labelled according to the given 'partition', using the given indexing
-    'interval'.
-    """
-
-    ff = open(join(pathname, "fields-%s" % partition), "wb")
-    field_writer = FieldWriter(ff)
-
-    fif = open(join(pathname, "fields_index-%s" % partition), "wb")
-    field_index_writer = FieldIndexWriter(fif)
-
-    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
+def get_reader(pathname, name, partition, cls):
+    f = open(join(pathname, "%s-%s" % (name, partition)), "rb")
+    return cls(f)
 
 def get_term_reader(pathname, partition):
 
     """
-    Return a term dictionary reader using files under the given 'pathname'
+    Return a term reader using files under the given 'pathname' labelled
+    according to the given 'partition'.
+    """
+
+    return get_reader(pathname, "terms", partition, TermIterator)
+
+def get_term_data_reader(pathname, partition):
+
+    """
+    Return a term plus data reader using files under the given 'pathname'
     labelled according to the given 'partition'.
     """
 
-    tdf = open(join(pathname, "terms-%s" % partition), "rb")
-    info_reader = TermReader(tdf)
-
-    tdif = open(join(pathname, "terms_index-%s" % partition), "rb")
-    index_reader = TermIndexReader(tdif)
-
-    pf = open(join(pathname, "positions-%s" % partition), "rb")
-    position_reader = PositionReader(pf)
-
-    pif = open(join(pathname, "positions_index-%s" % partition), "rb")
-    position_index_reader = PositionIndexReader(pif)
-
-    position_dict_reader = PositionDictionaryReader(position_reader, position_index_reader)
-
-    return TermDictionaryReader(info_reader, index_reader, position_dict_reader)
-
-def get_field_reader(pathname, partition):
-
-    """
-    Return a field dictionary reader using files under the given 'pathname'
-    labelled according to the given 'partition'.
-    """
-
-    ff = open(join(pathname, "fields-%s" % partition), "rb")
-    field_reader = FieldReader(ff)
-
-    fif = open(join(pathname, "fields_index-%s" % partition), "rb")
-    field_index_reader = FieldIndexReader(fif)
-
-    return FieldDictionaryReader(field_reader, field_index_reader)
+    return get_reader(pathname, "terms", partition, TermDataIterator)
 
 # Renaming.
 
@@ -160,9 +104,6 @@
 def rename_term_files(pathname, from_partition, to_partition):
     rename_files(pathname, TERM_FILENAMES, from_partition, to_partition)
 
-def rename_field_files(pathname, from_partition, to_partition):
-    rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
-
 # Removal/deletion.
 
 def remove_files(pathname, names, partition):
@@ -172,9 +113,6 @@
 def remove_term_files(pathname, partition):
     remove_files(pathname, TERM_FILENAMES, partition)
 
-def remove_field_files(pathname, partition):
-    remove_files(pathname, FIELD_FILENAMES, partition)
-
 # Copying.
 
 def copy_files(source, names, partition, destination, suffix):
@@ -185,7 +123,4 @@
 def copy_term_files(source, partition, destination, suffix):
     copy_files(source, TERM_FILENAMES, partition, destination, suffix)
 
-def copy_field_files(source, partition, destination, suffix):
-    copy_files(source, FIELD_FILENAMES, partition, destination, suffix)
-
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r b75bd39cf61f -r 6542c54d115b iixr/index.py
--- a/iixr/index.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/iixr/index.py	Sun Feb 13 02:49:55 2011 +0100
@@ -19,18 +19,14 @@
 """
 
 from iixr.filesystem import *
-from iixr.merging import *
-from itertools import islice
+from itermerge import itermerge
 from os import mkdir # index discovery
 from os.path import exists
+import operator
 
 # Constants.
 
-TERM_INTERVAL = 100
-DOCUMENT_INTERVAL = 100
-FIELD_INTERVAL = 100
 FLUSH_INTERVAL = 10000
-POSITIONS_FLUSH_INTERVAL = 1000000
 OPEN_PARTITIONS = 20
 
 # High-level classes.
@@ -39,11 +35,9 @@
 
     "A container of document information."
 
-    def __init__(self, docnum, fields=None):
+    def __init__(self, docnum):
         self.docnum = docnum
-        self.fields = fields or []
         self.terms = {}
-        self.field_dict = None
 
     def add_position(self, term, position):
 
@@ -54,55 +48,18 @@
 
         self.terms.setdefault(term, []).append(position)
 
-    def add_field(self, identifier, value):
-
-        "Add a field having the given 'identifier' and 'value'."
-
-        self.fields.append((identifier, unicode(value))) # convert to string
-
-    def set_fields(self, fields):
-
-        """
-        Set the document's 'fields': a list of tuples each containing an integer
-        identifier and a string value.
-        """
-
-        self.fields = fields
-
-    def _ensure_dict(self):
-        if self.field_dict is None:
-            self.field_dict = dict(self.fields)
-
-    def keys(self):
-        self._ensure_dict()
-        return self.field_dict.keys()
-
-    def __getitem__(self, key):
-        self._ensure_dict()
-        return self.field_dict[key]
-
 class IndexWriter:
 
-    """
-    Building term information and writing it to the term and field dictionaries.
-    """
+    "Building term information and writing it to the term dictionary."
 
-    def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval, positions_flush_interval):
+    def __init__(self, pathname, flush_interval):
         self.pathname = pathname
-        self.interval = interval
-        self.doc_interval = doc_interval
-        self.field_interval = field_interval
         self.flush_interval = flush_interval
-        self.positions_flush_interval = positions_flush_interval
 
-        self.dict_partition = get_next_partition(get_term_partitions(self.pathname))
-        self.field_dict_partition = get_next_partition(get_field_partitions(self.pathname))
+        self.term_partition = get_next_partition(get_term_partitions(self.pathname))
 
         self.terms = {}
-        self.docs = []
-
         self.doc_counter = 0
-        self.position_counter = 0
 
     def add_document(self, doc):
 
@@ -115,134 +72,105 @@
 
         for term, positions in doc.terms.items():
             self.terms.setdefault(term, {})[docnum] = positions
-            self.position_counter += len(positions)
-
-        self.docs.append((docnum, doc.fields))
 
         self.doc_counter += 1
 
-        if self.flush_interval and self.doc_counter >= self.flush_interval or \
-            self.positions_flush_interval and self.position_counter >= self.positions_flush_interval:
-
+        if self.flush_interval and self.doc_counter >= self.flush_interval:
             self.flush_terms()
-            self.flush_fields()
             self.doc_counter = 0
-            self.position_counter = 0
 
     def get_term_writer(self):
 
-        "Return a term dictionary writer for the current partition."
-
-        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)
+        "Return a term writer for the current partition."
 
-    def get_field_writer(self):
-
-        "Return a field dictionary writer for the current partition."
-
-        return get_field_writer(self.pathname, self.field_dict_partition, self.field_interval)
+        return get_term_writer(self.pathname, self.term_partition)
 
     def flush_terms(self):
 
-        "Flush terms into the current term dictionary partition."
+        "Flush terms into the current term partition."
 
         # Get the terms in order.
 
-        all_terms = self.terms
-        terms = all_terms.keys()
-        terms.sort()
-
-        dict_writer = self.get_term_writer()
-
-        for term in terms:
-            doc_positions = all_terms[term].items()
-            dict_writer.write_term_positions(term, doc_positions)
-
-        dict_writer.close()
+        term_writer = self.get_term_writer()
+        try:
+            term_writer.write_terms(self.terms)
+        finally:
+            term_writer.close()
 
         self.terms = {}
-        self.dict_partition += 1
-
-    def flush_fields(self):
-
-        "Flush fields into the current term dictionary partition."
-
-        # Get the documents in order.
-
-        self.docs.sort()
-
-        field_dict_writer = self.get_field_writer()
-        for docnum, fields in self.docs:
-            field_dict_writer.write_fields(docnum, fields)
-        field_dict_writer.close()
-
-        self.docs = []
-        self.field_dict_partition += 1
+        self.term_partition += 1
 
     def close(self):
         if self.terms or not get_term_partitions(self.pathname):
            self.flush_terms()
-        if self.docs or not get_field_partitions(self.pathname):
-            self.flush_fields()
+
+class IndexReader(itermerge):
+
+    "Accessing the term dictionaries."
-class IndexReader:
+    def __init__(self, pathname, get_reader=None, combine=None):
 
-    "Accessing the term and field dictionaries."
+        # Get the partitions in order.
+
+        partitions = list(get_term_partitions(pathname))
+        partitions.sort()
 
-    def __init__(self, pathname):
-        self.dict_reader = get_term_reader(pathname, "merged")
-        self.field_dict_reader = get_field_reader(pathname, "merged")
+        # Initialise the underlying term partition readers.
 
-    # Sequential access.
+        self.readers = [(get_reader or get_term_reader)(pathname, partition) for partition in partitions]
+        self.combine = combine or operator.add
+
+        # Initialise this object as an iterator over the readers.
 
-    def read_term(self):
-        return self.dict_reader.read_term()
+        itermerge.__init__(self, self.readers)
+        self.next_value = None
 
-    def go_to_term(self, term):
-        return self.dict_reader._get_term_and_positions(*self.dict_reader.go_to_term(term))
+    def get_sizes(self):
 
-    # Query access.
+        # Readers must have compatible sizes.
 
-    def get_terms(self):
-        return self.dict_reader.get_terms()
-
-    def find_terms(self, term):
-        return self.dict_reader.find_terms(term)
+        if self.readers:
+            return self.readers[0].get_sizes()
+        else:
+            return 0, 0
 
-    def find_positions(self, term):
-        return self.dict_reader.find_positions(term)
+    def next(self):
+        if self.next_value is not None:
+            term, positions = self.next_value
+        else:
+            term, positions = itermerge.next(self)
 
-    def find_common_positions(self, terms):
-        return self.dict_reader.find_common_positions(terms)
+        # Look at the next item to see if it has positions for the current
+        # term.
 
-    def get_frequency(self, term):
-        return self.dict_reader.get_frequency(term)
-
-    def get_document_frequency(self, term):
-        return self.dict_reader.get_document_frequency(term)
+        try:
+            t, p = itermerge.next(self)
+            while t == term:
+                positions = self.combine(positions, p)
+                t, p = itermerge.next(self)
+            self.next_value = t, p
 
-    def get_fields(self, docnum):
-        return self.field_dict_reader.get_fields(docnum)
+        # Where an item could not be fetched, cause future requests to fail.
 
-    def get_document(self, docnum):
-        return Document(docnum, self.get_fields(docnum))
+        except StopIteration:
+            self.next_value = None
+
+        return term, positions
 
     def close(self):
-        self.dict_reader.close()
-        self.field_dict_reader.close()
+        for reader in self.readers:
+            reader.close()
+        self.readers = []
 
 class Index:
 
     "An inverted index solution encapsulating the various components."
 
-    def __init__(self, pathname, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, field_interval=FIELD_INTERVAL,
-        flush_interval=FLUSH_INTERVAL, positions_flush_interval=POSITIONS_FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS):
+    def __init__(self, pathname, flush_interval=FLUSH_INTERVAL,
+        open_partitions=OPEN_PARTITIONS):
 
         self.pathname = pathname
-        self.interval = interval
-        self.doc_interval = doc_interval
-        self.field_interval = field_interval
         self.flush_interval = flush_interval
-        self.positions_flush_interval = positions_flush_interval
         self.open_partitions = open_partitions
         self.reader = None
         self.writer = None
@@ -251,132 +179,60 @@
 
         "Return a writer."
 
-        self._ensure_directory()
-        self.writer = IndexWriter(self.pathname, self.interval, self.doc_interval,
-            self.field_interval, self.flush_interval, self.positions_flush_interval)
+        if self.writer is None:
+            self._ensure_directory()
+            self.writer = IndexWriter(self.pathname, self.flush_interval)
         return self.writer
 
     def _ensure_directory(self):
         if not exists(self.pathname):
             mkdir(self.pathname)
 
-    def get_reader(self, partition=0):
-
-        "Return a reader for the index."
-
-        # Ensure that only one partition exists.
-
-        self.merge()
-        return self._get_reader(partition)
-
-    def _get_reader(self, partition):
+    def get_reader(self, refresh=0):
 
         "Return a reader for the index."
 
-        if not exists(self.pathname):
-            raise OSError, "Index path %r does not exist." % self.pathname
-
-        self.reader = IndexReader(self.pathname)
-        return self.reader
-
-    def get_term_partitions(self):
+        if refresh and self.reader is not None:
+            self.reader.close()
+            self.reader = None
 
-        "Return a set of term partition identifiers."
-
-        return get_term_partitions(self.pathname)
-
-    def get_field_partitions(self):
-
-        "Return a set of field partition identifiers."
-
-        return get_field_partitions(self.pathname)
+        if self.reader is None:
+            if not exists(self.pathname):
+                raise OSError, "Index path %r does not exist." % self.pathname
+            self.reader = IndexReader(self.pathname)
+        return self.reader
 
     def merge(self):
 
-        "Merge/optimise index partitions."
-
-        self._merge_terms()
-        self._merge_fields()
-
-    def _merge_dictionaries(self, get_partitions, rename_files, remove_files, get_reader, get_writer, get_merger, intervals):
-
-        "Merge term or field dictionaries."
-
-        partitions = get_partitions()
-
-        # Ensure the correct labelling of a single partition.
-
-        if len(partitions) == 1:
-            partition = list(partitions)[0]
-            if partition != "merged":
-                rename_files(self.pathname, partition, "merged")
-            return
+        "Merge the partitions in the index."
 
-        # Merge the partitions.
-
-        old_merged_counter = 0
-
-        while len(partitions) > 1:
-
-            if "merged" in partitions:
-                rename_files(self.pathname, "merged", "old-merged-%d" % old_merged_counter)
-                partitions.remove("merged")
-                partitions.add("old-merged-%d" % old_merged_counter)
-                old_merged_counter += 1
-
-            # Process only a certain number at once, avoiding resource limits.
-
-            active_partitions = list(islice(partitions, self.open_partitions))
-
-            readers = []
-            for partition in active_partitions:
-                readers.append(get_reader(self.pathname, partition))
-
-            # Write directly to a dictionary.
+        reader = IndexReader(self.pathname, get_term_data_reader, self.merge_data)
+        writer = get_term_writer(self.pathname, "merged")
+        try:
+            writer.begin(*reader.get_sizes())
+            for term, data in reader:
+                writer.write_term_plus_remaining(term, data)
+                writer.end_record()
+        finally:
+            writer.close()
+            reader.close()
 
-            writer = get_writer(self.pathname, "merged", *intervals)
-            merger = get_merger(writer, readers)
-            merger.merge()
-            merger.close()
-
-            # Remove old files.
-
-            for partition in active_partitions:
-                remove_files(self.pathname, partition)
+        for partition in get_term_partitions(self.pathname):
+            remove_term_files(self.pathname, partition)
 
-            # Acquire the partitions to check their number again.
-
-            partitions = get_partitions()
-
-    def _merge_terms(self):
+        rename_term_files(self.pathname, "merged", 0)
 
-        "Merge term dictionaries."
-
-        self._merge_dictionaries(self.get_term_partitions, rename_term_files,
-            remove_term_files, get_term_reader, get_term_writer,
-            TermDictionaryMerger, [self.interval, self.doc_interval])
+    def merge_data(self, a, b):
 
-    def _merge_fields(self):
-
-        "Merge field dictionaries."
-
-        self._merge_dictionaries(self.get_field_partitions, rename_field_files,
-            remove_field_files, get_field_reader, get_field_writer,
-            FieldDictionaryMerger, [self.field_interval])
-
-    def update(self, other_indexes):
+        """
+        Merge 'a' and 'b', modifying the data to permit concatenation.
+        """
 
-        "Copy the content of the 'other_indexes' into this index and merge."
-
-        self._ensure_directory()
+        # Modify the record to indicate a continuation of the data.
 
-        for i, index in enumerate(other_indexes):
-            for partition in index.get_term_partitions():
-                copy_term_files(index.pathname, partition, self.pathname, "-added-%d" % i)
-            for partition in index.get_field_partitions():
-                copy_field_files(index.pathname, partition, self.pathname, "-added-%d" % i)
-
-        self.merge()
+        c = a + b
+        c[len(a) - 1] = 1
+        return c
 
     def close(self):
         if self.reader is not None:
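Both the new IndexReader and merge lean on itermerge, an external module not
shown in this patch, which yields (term, data) pairs in sorted order across
all partition readers; runs of equal terms are then combined pairwise. A
standalone sketch of the same idea, with heapq.merge standing in for
itermerge:

    from heapq import merge
    from itertools import groupby

    def merge_partitions(iterators, combine):
        # Each iterator yields (term, data) pairs in ascending term order.
        # Equal terms from different partitions are combined pairwise.
        for term, group in groupby(merge(*iterators), lambda pair: pair[0]):
            data = None
            for _, d in group:
                data = d if data is None else combine(data, d)
            yield term, data

    a = iter([("cat", [1]), ("dog", [2])])
    b = iter([("dog", [3]), ("emu", [4])])
    assert list(merge_partitions([a, b], lambda x, y: x + y)) == \
        [("cat", [1]), ("dog", [2, 3]), ("emu", [4])]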
diff -r b75bd39cf61f -r 6542c54d115b iixr/merging.py
--- a/iixr/merging.py	Sat Feb 12 01:23:58 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,89 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Dictionary merging classes.
-
-Copyright (C) 2009, 2010 Paul Boddie
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; either version 3 of the License, or (at your option) any later
-version.
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program. If not, see <http://www.gnu.org/licenses/>.
-"""
-
-from itermerge import itermerge
-
-class Merger:
-
-    "Merge files."
-
-    def __init__(self, writer, readers):
-        self.writer = writer
-        self.readers = readers
-
-    def close(self):
-        for reader in self.readers:
-            reader.close()
-        self.readers = []
-        if self.writer is not None:
-            self.writer.close()
-            self.writer = None
-
-class TermDictionaryMerger(Merger):
-
-    "Merge term and position files."
-
-    def merge(self):
-
-        """
-        Merge terms and positions from the readers, sending them to the writer.
-        """
-
-        last_term = None
-        current_readers = []
-
-        for term, frequency, doc_frequency, positions in itermerge(self.readers):
-            if term == last_term:
-                current_readers.append(positions)
-            else:
-                if current_readers:
-                    self.writer.write_term_positions(last_term, itermerge(current_readers))
-                last_term = term
-                current_readers = [positions]
-        else:
-            if current_readers:
-                self.writer.write_term_positions(last_term, itermerge(current_readers))
-
-class FieldDictionaryMerger(Merger):
-
-    "Merge field files."
-
-    def merge(self):
-
-        """
-        Merge fields from the readers, sending them to the writer.
-        """
-
-        last_docnum = None
-        current_fields = []
-
-        for docnum, fields in itermerge(self.readers):
-            if docnum == last_docnum:
-                current_fields += fields
-            else:
-                if current_fields:
-                    self.writer.write_fields(last_docnum, current_fields)
-                last_docnum = docnum
-                current_fields = fields
-        else:
-            if current_fields:
-                self.writer.write_fields(last_docnum, current_fields)
-
-# vim: tabstop=4 expandtab shiftwidth=4
diff -r b75bd39cf61f -r 6542c54d115b iixr/positions.py
--- a/iixr/positions.py	Sat Feb 12 01:23:58 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,566 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Specific classes for storing position information.
-
-Copyright (C) 2009, 2010, 2011 Paul Boddie
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; either version 3 of the License, or (at your option) any later
-version.
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program. If not, see <http://www.gnu.org/licenses/>.
-"""
-
-from iixr.data import *
-from iixr.files import *
-
-class PositionWriter(FileWriter):
-
-    "Writing position information to files."
-
-    def begin(self, docnum_size, position_size):
-        self.write_numbers((docnum_size, position_size))
-        self.end_record()
-        self.data_start = self.tell()
-        self.docnum_size = docnum_size
-        self.position_size = position_size
-
-    def reset(self):
-        self.end_record()
-        self.last_docnum = None
-        self.subtractor = None
-
-    def write_positions(self, docnum, positions):
-
-        """
-        Write for the document 'docnum' the given 'positions'.
-        """
-
-        if not positions:
-            return
-
-        # Make sure that the positions are sorted.
-
-        positions.sort()
-
-        # Calculate an ongoing delta.
-
-        if self.last_docnum is not None:
-            if docnum < self.last_docnum:
-                raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
-
-            docnum_seq = self.subtractor(docnum, self.last_docnum)
-
-        # Or preserve the document number and prepare for future deltas.
-
-        else:
-            self.subtractor = get_subtractor(docnum)
-            docnum_seq = docnum
-
-        self.write_sequence_value(docnum_seq, self.docnum_size)
-        self.write_monotonic_sequence(positions, self.position_size)
-
-        self.last_docnum = docnum
-
-class PositionReader(FileReader):
-
-    "Reading position information within term-specific regions of a file."
-
-    def begin(self):
-        self.begin_record()
-        try:
-            self.docnum_size, self.position_size = self.read_numbers(2)
-        except EOFError:
-            self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
-        self.data_start = self.tell()
-
-    def reset(self):
-        self.last_docnum = None
-        self.adder = None
-        self.begin_record()
-
-    def read_positions(self):
-
-        """
-        Read positions, returning a document number and a list of positions.
-        """
-
-        # Read the document number.
-
-        docnum = self.read_sequence_value(self.docnum_size)
-
-        # Calculate an ongoing delta.
-
-        if self.last_docnum is not None:
-            self.last_docnum = self.adder(docnum, self.last_docnum)
-
-        # Or preserve the document number and prepare for future deltas.
-
-        else:
-            self.adder = get_adder(docnum)
-            self.last_docnum = docnum
-
-        positions = self.read_monotonic_sequence(self.position_size)
-
-        return self.last_docnum, positions
-
-class PositionIndexWriter(PositionWriter):
-
-    "Writing position index information to files."
-
-    def begin(self, docnum_size):
-        PositionWriter.begin(self, docnum_size, 0)
-
-    def reset(self):
-        PositionWriter.reset(self)
-        self.last_pos_offset = 0
-
-    def write_positions(self, docnum, pos_offset, count):
-
-        """
-        Write the given 'docnum, 'pos_offset' and document 'count' to the
-        position index file.
-        """
-
-        # Find the size of document number values.
-
-        if self.last_docnum is not None:
-            docnum_seq = self.subtractor(docnum, self.last_docnum)
-        else:
-            self.subtractor = get_subtractor(docnum)
-            docnum_seq = docnum
-
-        self.write_sequence_value(docnum_seq, self.docnum_size)
-        self.write_number(pos_offset - self.last_pos_offset)
-        self.write_number(count)
-
-        self.last_docnum = docnum
-        self.last_pos_offset = pos_offset
-
-class PositionIndexReader(PositionReader):
-
-    "Reading position index information within term-specific regions of a file."
-
-    def reset(self):
-        PositionReader.reset(self)
-        self.last_pos_offset = 0
-
-    def read_positions(self):
-
-        """
-        Read a document number, a position file offset for the position index
-        file, and the number of documents in a section of that file.
-        """
-
-        # Read the document number.
-
-        docnum = self.read_sequence_value(self.docnum_size)
-
-        if self.last_docnum is not None:
-            self.last_docnum = self.adder(docnum, self.last_docnum)
-        else:
-            self.adder = get_adder(docnum)
-            self.last_docnum = docnum
-
-        # Read the offset delta.
-
-        self.last_pos_offset += self.read_number()
-
-        # Read the document count.
-
-        count = self.read_number()
-
-        return self.last_docnum, self.last_pos_offset, count
-
-# Iterators for position-related files.
-
-class IteratorBase:
-
-    "Support for iterating over results."
-
-    def __init__(self, reader):
-
-        "Initialise the iterator using the given 'reader'."
-
-        self.reader = reader
-        self.replenish(0) # no iteration initially permitted
-
-    def replenish(self, count):
-
-        "Replenish the iterator with 'count' results."
-
-        self.count = count
-        self.read_documents = 0
-
-    def __len__(self):
-
-        "Return the total number of results."
-
-        return self.count
-
-    def sort(self):
-        pass # Stored document positions are already sorted.
-
-    def __iter__(self):
-        return self
-
-class PositionIterator(IteratorBase):
-
-    "Iterating over document positions."
-
-    def replenish(self, count):
-        IteratorBase.replenish(self, count)
-
-        # Fill a cache of positions.
-
-        self.cache = []
-        n = 0
-
-        while n < self.count:
-            self.cache.append(self.reader.read_positions())
-            n += 1
-
-    def seek(self, offset, count):
-
-        """
-        Seek to 'offset' in the file, limiting the number of documents available
-        for reading to 'count'.
-        """
-
-        self.reader.seek(offset)
-        self.replenish(count)
-
-    def next(self):
-
-        "Read positions for a single document."
-
-        if self.read_documents < self.count:
-            positions = self.cache[self.read_documents]
-            self.read_documents += 1
-            return positions
-        else:
-            raise StopIteration
-
-class PositionIndexIterator(IteratorBase):
-
-    "Iterating over document positions."
-
-    def replenish(self, count):
-        IteratorBase.replenish(self, count)
-
-        # Fill a cache of offsets.
-
-        self.cache = []
-        self.current = 0
-        n = 0
-
-        while n < self.count:
-            docnum, pos_offset, section_count = t = self.reader.read_positions()
-            self.cache.append(t)
-            n += section_count
-
-    def seek(self, offset, doc_frequency):
-
-        """
-        Seek to 'offset' in the file, limiting the number of documents available
-        for reading to 'doc_frequency'.
-        """
-
-        self.reader.seek(offset)
-        self.replenish(doc_frequency)
-
-    def next(self):
-
-        "Read positions for a single document."
-
-        if self.current < len(self.cache):
-            docnum, pos_offset, self.section_count = t = self.cache[self.current]
-            self.current += 1
-            return t
-        else:
-            raise StopIteration
-
-class PositionDictionaryWriter:
-
-    "Writing position dictionaries."
-
-    def __init__(self, position_writer, position_index_writer, interval):
-        self.position_writer = position_writer
-        self.position_index_writer = position_index_writer
-        self.interval = interval
-
-    def write_term_positions(self, doc_positions):
-
-        """
-        Write all 'doc_positions' - a collection of tuples of the form (document
-        number, position list) - to the file.
-
-        Add some records to the index, making dictionary entries.
-
-        Return a tuple containing the offset of the written data, the frequency
-        (number of positions), and document frequency (number of documents) for
-        the term involved.
-        """
-
-        # Write the positions.
-
-        frequency = 0
-        count = 0
-
-        if doc_positions:
-            doc_positions.sort()
-
-            # Look ahead at the first document record.
-            # NOTE: Any iterator would need to support this.
-
-            first_docnum, first_positions = doc_positions[0]
-            first_position = first_positions[0]
-
-            # Write out size details.
-
-            docnum_size, position_size = sizeof(first_docnum), sizeof(first_position)
-            self.position_writer.begin(docnum_size, position_size)
-            self.position_index_writer.begin(docnum_size)
-
-            # Reset the writers.
-
-            self.position_writer.reset()
-            self.position_index_writer.reset()
-
-            # Remember the first index entry offset.
-
-            index_offset = self.position_index_writer.tell()
-
-            # Retain the first record offset for a subsequent index entry.
-
-            first_offset = self.position_writer.tell()
-
-            for docnum, positions in doc_positions:
-                if first_docnum is None:
-                    first_docnum = docnum
-
-                self.position_writer.write_positions(docnum, positions)
-
-                frequency += len(positions)
-                count += 1
-
-                # Every {interval} entries, write an index entry.
-
-                if count % self.interval == 0:
-
-                    self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
-
-                    # Reset the position writer so that position readers accessing
-                    # a section start with the correct document number.
-
-                    self.position_writer.reset()
-
-                    first_offset = self.position_writer.tell()
-                    first_docnum = None
-
-            # Finish writing an index entry for the remaining documents.
-
-            else:
-                if first_docnum is not None:
-                    self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
-
-        return index_offset, frequency, count
-
-    def close(self):
-        self.position_writer.close()
-        self.position_index_writer.close()
-
-class PositionDictionaryReader:
-
-    "Access to position dictionary entries through iterators."
-
-    def __init__(self, position_reader, position_index_reader):
-        self.position_reader = position_reader
-        self.position_index_reader = position_index_reader
-
-    def read_term_positions(self, offset, doc_frequency):
-        iterator = PositionDictionaryIterator(
-            PositionIterator(self.position_reader),
-            PositionIndexIterator(self.position_index_reader)
-            )
-        iterator.seek(offset, doc_frequency)
-        return iterator
-
-    def close(self):
-        self.position_reader.close()
-        self.position_index_reader.close()
-
-class PositionDictionaryIterator:
-
-    "Iteration over position dictionary entries."
-
-    def __init__(self, position_iterator, position_index_iterator):
-        self.position_iterator = position_iterator
-        self.position_index_iterator = position_index_iterator
-        self.reset()
-
-    def reset(self):
-
-        # Remember the last values.
-
-        self.found_docnum, self.found_positions = None, None
-
-        # Maintain state for the next index entry, if read.
-
-        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
-
-    def seek(self, offset, doc_frequency):
-
-        """
-        Seek to 'offset' in the index file, limiting the number of documents
-        available for reading to 'doc_frequency'.
-        """
-
-        self.reset()
-
-        # Seek to the appropriate index entry.
-
-        self.position_index_iterator.seek(offset, doc_frequency)
-
-        # Initialise the current index entry and current position file iterator.
-
-        self._next_section()
-        self._init_section()
-
-    # Sequence methods.
-
-    def __len__(self):
-        return len(self.position_index_iterator)
-
-    def sort(self):
-        pass
-
-    # Iterator methods.
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-
-        """
-        Attempt to get the next document record from the section in the
-        positions file.
-        """
-
-        # Return any visited but unrequested record.
-
-        if self.found_docnum is not None:
-            t = self.found_docnum, self.found_positions
-            self.found_docnum, self.found_positions = None, None
-            return t
-
-        # Or search for the next record.
-
-        while 1:
-
-            # Either return the next record.
-
-            try:
-                return self.position_iterator.next()
-
-            # Or, where a section is finished, get the next section and try again.
-
-            except StopIteration:
-
-                # Although, where a single iterator is in use, the file reader
-                # would be positioned appropriately, this is not guaranteed in a
-                # multiple iterator situation.
-
-                self._next_section()
-                self._init_section()
-
-    def from_document(self, docnum):
-
-        """
-        Attempt to navigate to a positions entry for the given 'docnum',
-        returning the positions for 'docnum', or None otherwise.
-        """
-
-        # Return any unrequested document positions.
-
-        if docnum == self.found_docnum:
-            return self.found_positions
-
-        # Read ahead in the index until the next entry refers to a document
-        # later than the desired document.
-
-        try:
-            if self.next_docnum is None:
-                self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
-
-            # Read until the next entry is after the desired document number,
-            # or until the end of the results.
-
-            while self.next_docnum <= docnum:
-                self._next_read_section()
-                if self.docnum < docnum:
-                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
-                else:
-                    break
-
-        except StopIteration:
-            pass
-
-        # Navigate in the position file to the document.
-
-        self._init_section()
-
-        try:
-            while 1:
-                found_docnum, found_positions = self.position_iterator.next()
-
-                # Return the desired document positions or None (retaining the
-                # positions for the document immediately after).
-
-                if docnum <= found_docnum:
-                    self.found_docnum, self.found_positions = found_docnum, found_positions
-                    if docnum == found_docnum:
-                        return found_positions
-                    elif docnum < found_docnum:
-                        return None
-
-        except StopIteration:
-            return None
-
-    # Internal methods.
-
-    def _next_section(self):
-
-        "Attempt to get the next section in the index."
-
-        if self.next_docnum is None:
-            self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()
-        else:
-            self._next_read_section()
-
-    def _next_read_section(self):
-
-        """
-        Make the next index entry the current one without reading from the
-        index.
-        """
-
-        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
-        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
-
-    def _init_section(self):
-
-        "Initialise the iterator for the section in the position file."
-
-        # Seek to the position entry.
-
-        self.position_iterator.seek(self.pos_offset, self.section_count)
-
-# vim: tabstop=4 expandtab shiftwidth=4
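The positions machinery removed above is replaced by the combined record
format introduced in terms.py below: each term's record carries a document
count, then per document a delta-coded document number and a delta-coded
position list, then a continuation byte (0 on the last page, 1 if another
page of documents for the same term follows). That terminating byte is what
lets merge_data join two terms' data by flipping the first page's 0 to 1. A
hypothetical simplification, using plain integers rather than the vint-coded
sequence values actually written:

    def delta_encode(values):
        # A monotonic sequence becomes a first value plus differences.
        return values[:1] + [b - a for a, b in zip(values, values[1:])]

    def encode_page(doc_positions, more=0):
        out = [len(doc_positions)]                 # document count
        last_docnum = 0
        for docnum, positions in sorted(doc_positions):
            out.append(docnum - last_docnum)       # document number delta
            out.append(len(positions))
            out.extend(delta_encode(sorted(positions)))
            last_docnum = docnum
        out.append(more)                           # continuation byte
        return out

    # Two pages for one term, as after a merge: the first page ends with 1
    # (more data follows), the second with the terminating 0.
    combined = encode_page([(3, [5, 9]), (7, [1])], more=1) \
             + encode_page([(12, [2, 4])], more=0)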
diff -r b75bd39cf61f -r 6542c54d115b iixr/terms.py
--- a/iixr/terms.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/iixr/terms.py	Sun Feb 13 02:49:55 2011 +0100
@@ -18,29 +18,87 @@
 with this program. If not, see <http://www.gnu.org/licenses/>.
 """
 
+from iixr.data import *
 from iixr.files import *
-from iixr.positions import *
 from iixr.phrases import PhraseIterator
 from os.path import commonprefix # to find common string prefixes
-from bisect import bisect_right # to find terms in the dictionary index
 
 class TermWriter(FileWriter):
 
     "Writing term information to files."
 
-    def reset(self):
+    def begin(self, docnum_size, position_size):
+
+        "Begin writing to the file."
+
+        self.write_numbers((docnum_size, position_size))
         self.end_record()
+
+        self.data_start = self.tell()
+        self.docnum_size = docnum_size
+        self.position_size = position_size
+        self.subtractor = get_subtractor(docnum_size)
         self.last_term = ""
-        self.last_offset = 0
 
-    def write_term(self, term, offset, frequency, doc_frequency):
+    def write_terms(self, terms):
 
         """
-        Write the given 'term', its position file 'offset', its 'frequency' and
-        its 'doc_frequency' (number of documents in which it appears) to the
-        term information file.
+        Write the 'terms' to the term information file, with each term's details
+        stored in a separate record.
         """
 
+        if hasattr(terms, "items"):
+            terms = terms.items()
+        terms.sort()
+
+        for term, doc_positions in terms:
+            if not doc_positions:
+                continue
+
+            if hasattr(doc_positions, "items"):
+                doc_positions = doc_positions.items()
+
+            docnum, positions = doc_positions[0]
+
+            if not positions:
+                continue
+
+            # Start the writing, if appropriate.
+
+            if self.data_start is None:
+                self.begin(sizeof(docnum), sizeof(positions[0]))
+
+            # Write each term and document positions.
+
+            self.write_term(term, doc_positions)
+            self.end_record()
+
+    # Methods requiring an open record.
+
+    def write_term(self, term, doc_positions):
+
+        """
+        Write the given 'term', its document frequency (number of documents in
+        which it appears), and 'doc_positions' to the term information file.
+        """
+
+        self.write_term_only(term)
+
+        # Write the document frequency and the term positions.
+
+        self.write_positions(doc_positions)
+
+    def write_term_plus_remaining(self, term, data):
+
+        "Write the given 'term' and the document position 'data'."
+
+        self.write_term_only(term)
+        self.write_remaining(data)
+
+    def write_term_only(self, term):
+
+        "Write only the given 'term'."
+
         if term <= self.last_term:
             raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
 
@@ -52,430 +110,173 @@
         self.write_number(common)
         self.write_string(suffix)
 
-        # Write the offset delta.
-        # Write the frequency.
+        self.last_term = term
+
+    def write_positions(self, doc_positions):
+
+        "Write the given 'doc_positions' to the file."
+
+        # Make sure that the positions are sorted.
+
+        doc_positions.sort()
+
         # Write the document frequency.
 
-        self.write_numbers((
-            offset - self.last_offset,
-            frequency,
-            doc_frequency
-            ))
+        self.write_number(len(doc_positions))
+
+        last_docnum = None
+
+        for docnum, positions in doc_positions:
+
+            # Store the first document number as it is.
+
+            if last_docnum is None:
+                docnum_seq = docnum
+
+            # Reject out-of-order documents.
+
+            elif docnum < last_docnum:
+                raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum)
 
-        self.last_term = term
-        self.last_offset = offset
+            # Calculate an ongoing delta.
+
+            else:
+                docnum_seq = self.subtractor(docnum, last_docnum)
+
+            # Write the document number and positions.
+
+            self.write_sequence_value(docnum_seq, self.docnum_size)
+            self.write_monotonic_sequence(positions, self.position_size)
+
+            last_docnum = docnum
+
+        # Write a terminating byte to indicate that no more document pages
+        # exist.
+
+        self.write_byte(0)
 
 class TermReader(FileReader):
 
     "Reading term information from files."
 
-    def reset(self):
+    def begin(self):
+
+        "Begin reading from the file."
+
+        self.begin_record()
+        try:
+            self.docnum_size, self.position_size = self.read_numbers(2)
+        except EOFError:
+            self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
+
+        self.data_start = self.tell()
+        self.adder = get_adder(self.docnum_size)
         self.last_term = ""
-        self.last_offset = 0
-        self.begin_record()
+
+    def get_sizes(self):
+        return self.docnum_size, self.position_size
+
+    # Methods requiring an open record.
 
     def read_term(self):
 
+        "Read a term and its document positions from the term information file."
+
+        # Read the term.
+
+        self.read_term_only()
+
+        # Read the document frequency and the term positions.
+
+        positions = self.read_positions()
+
+        return self.last_term, positions
+
+    def read_term_plus_remaining(self):
+
         """
-        Read a term, its position file offset, its frequency and its document
-        frequency from the term information file.
+        Read a term and the unprocessed document position data.
         """
 
+        self.read_term_only()
+        return self.last_term, self.read_remaining()
+
+    def read_term_only(self):
+
+        "Read a term only."
+
         # Read the prefix length and term suffix.
 
         common = self.read_number()
         suffix = self.read_string()
 
         self.last_term = self.last_term[:common] + suffix
-
-        # Read the offset delta.
-
-        self.last_offset += self.read_number()
-
-        # Read the frequency.
-
-        frequency = self.read_number()
-
-        # Read the document frequency.
-
-        doc_frequency = self.read_number()
+        return self.last_term
 
-        return self.last_term, self.last_offset, frequency, doc_frequency
-
-    def go_to_term(self, term, offset, info_offset):
-
-        """
-        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
-        permits the scanning for later terms from the specified term.
-        """
-
-        self.seek(info_offset)
-        self.last_term = term
-        self.last_offset = offset
-
-class TermIndexWriter(TermWriter):
+    def read_positions(self):
 
-    "Writing term dictionary index details to files."
-
-    def reset(self):
-        TermWriter.reset(self)
-        self.last_info_offset = 0
-
-    def write_term(self, term, offset, frequency, doc_frequency, info_offset):
-
-        """
-        Write the given 'term', its position file 'offset', its 'frequency' and
-        its 'doc_frequency' to the term dictionary index file, along with the
-        'info_offset' in the term information file.
-        """
+        "Read document positions from the term information file."
 
-        TermWriter.write_term(self, term, offset, frequency, doc_frequency)
-
-        # Write the information file offset delta.
-
-        self.write_number(info_offset - self.last_info_offset)
-
-        self.last_info_offset = info_offset
+        doc_positions = []
 
-class TermIndexReader(TermReader):
-
-    "Reading term dictionary index details from files."
-
-    def reset(self):
-        TermReader.reset(self)
-        self.last_info_offset = 0
+        while 1:
 
-    def read_term(self):
-
-        """
-        Read a term, its position file offset, its frequency, its document
-        frequency and a term information file offset from the term dictionary
-        index file.
-        """
-
-        term, offset, frequency, doc_frequency = TermReader.read_term(self)
-
-        # Read the offset delta.
-
-        self.last_info_offset += self.read_number()
+            # Read the document frequency.
 
-        return term, offset, frequency, doc_frequency, self.last_info_offset
-
-class TermDictionaryWriter:
-
-    "Writing term dictionaries."
-
-    def __init__(self, info_writer, index_writer, position_dict_writer, interval):
-        self.info_writer = info_writer
-        self.index_writer = index_writer
-        self.position_dict_writer = position_dict_writer
-        self.interval = interval
-        self.entry = 0
-
-        self.index_writer.reset()
+            npositions = self.read_number()
 
-    def _write_term(self, term, offset, frequency, doc_frequency):
-
-        """
-        Write the given 'term', its position file 'offset', its 'frequency' and
-        its 'doc_frequency' (number of documents in which it appears) to the
-        term information file. Return the offset before the term information was
-        written to the file.
-        """
-
-        if self.entry % self.interval == 0:
-            self.info_writer.reset()
-            info_offset = self.info_writer.tell()
-            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
+            last_docnum = None
+            i = 0
+            while i < npositions:
 
-        self.info_writer.write_term(term, offset, frequency, doc_frequency)
-        self.entry += 1
-
-    def write_term_positions(self, term, doc_positions):
-
-        """
-        Write the given 'term' and the 'doc_positions' recording the documents
-        and positions at which the term is found.
-        """
-
-        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
-
-        if not frequency or not doc_frequency:
-            raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions)
-
-        self._write_term(term, offset, frequency, doc_frequency)
+                # Read the document number.
 
-    def close(self):
-        self.info_writer.close()
-        self.index_writer.close()
-        self.position_dict_writer.close()
-
-class TermDictionaryReader:
-
-    "Reading term dictionaries."
+                docnum = self.read_sequence_value(self.docnum_size)
+                if last_docnum is not None:
+                    docnum = self.adder(docnum, last_docnum)
 
-    def __init__(self, info_reader, index_reader, position_dict_reader):
-        self.info_reader = info_reader
-        self.index_reader = index_reader
-        self.position_dict_reader = position_dict_reader
-
-        self.info_reader.reset()
-        self.index_reader.reset()
-
-        self.entry = 0
-        self.terms = []
-        try:
-            while 1:
-                self.terms.append(self.index_reader.read_term())
-        except EOFError:
-            pass
-
-        # Large numbers for ordering purposes.
+                # Read the positions.
 
-        if self.terms:
-            self.max_offset = self.terms[-1][1] + 1
-        else:
-            self.max_offset = None
-
-    def _find_closest_entry(self, term):
-
-        """
-        Find the offsets and frequencies of 'term' from the term dictionary or
-        the closest term starting with the value of 'term'.
-
-        Return the closest index entry consisting of a term, the position file
-        offset, the term frequency, the document frequency, and the term details
-        file offset.
-        """
+                positions = self.read_monotonic_sequence(self.position_size)
+                doc_positions.append((docnum, positions))
 
-        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
-
-        # Get the entry position providing the term or one preceding it.
-        # If no entry precedes the requested term, return the very first entry
-        # as the closest.
-
-        if i == -1:
-            self.entry = 0
-            return self.terms[0]
-        else:
-            self.entry = i
-            return self.terms[i]
-
-    def _find_closest_term(self, term):
-
-        """
-        Find the offsets and frequencies of 'term' from the term dictionary or
-        the closest term starting with the value of 'term'.
+                last_docnum = docnum
+                i += 1
 
-        Return the closest term (or the term itself), the position file offset,
-        the term frequency, the document frequency, and the term details file
-        offset (or None if the reader is already positioned).
-        """
-
-        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
-
-        # Where the term is found immediately, return the offset and
-        # frequencies. If the term does not appear, return the details of the
-        # closest entry.
-
-        if term <= found_term:
-            return found_term, offset, frequency, doc_frequency, info_offset
+            # Read a terminating byte to discover whether more document pages
+            # exist.
 
-        # Otherwise, seek past the index term's entry in the information file
-        # and scan for the desired term.
-
-        else:
-            # Reset the term and offset for the new page.
-            self.info_reader.go_to_term("", 0, info_offset)
-            try:
-                while term > found_term:
-                    found_term, offset, frequency, doc_frequency = self._read_term()
-            except EOFError:
-                pass
-
-            return found_term, offset, frequency, doc_frequency, None
-
-    def _find_term(self, term):
+            if not self.read_byte():
+                break
 
-        """
-        Find the position file offset and frequency of 'term' from the term
-        dictionary.
-        """
-
-        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
-
-        # If the term is found, return the offset and frequencies.
-
-        if term == found_term:
-            return offset, frequency, doc_frequency
-        else:
-            return None
-
-    def _get_term_and_positions(self, term, offset, frequency, doc_frequency):
+        return doc_positions
 
-        """
-        Return the term plus positions details using the given 'term', 'offset',
-        'frequency' and 'doc_frequency'.
- """ - - return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency) - - def _get_positions(self, offset, doc_frequency): +class TermIterator(TermReader): - """ - Obtain positions from the position index 'offset' expecting a number of - documents equal to the given 'doc_frequency'. - """ - - return self.position_dict_reader.read_term_positions(offset, doc_frequency) - - # Iterator convenience methods. + "An iterator over terms and positions read from a file." def __iter__(self): - self.rewind() return self def next(self): try: + self.begin_record() return self.read_term() except EOFError: raise StopIteration - # Sequential access methods. - - def rewind(self): - self.entry = 0 - self.info_reader.rewind() - - def read_term(self): - - """ - Return the next term, its frequency, its document frequency, and the - documents and positions at which the term is found. - """ - - return self._get_term_and_positions(*self._read_term()) - - def _read_term(self): - - try: - term, offset, frequency, doc_frequency = self.info_reader.read_term() - except EOFError: - self.entry += 1 - try: - term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry] - except IndexError: - raise EOFError - else: - # Reset the term and offset for the new page. - - self.info_reader.go_to_term("", 0, info_offset) - - # Skip the term in the information file. - - self.info_reader.read_term() +class TermDataIterator(TermReader): - return term, offset, frequency, doc_frequency - - def go_to_term(self, term): - - """ - Navigate to 'term' in the dictionary, returning the details from its - entry. The returned details can be augmented with position information - when presented to the _get_term_and_positions method. - """ - - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) - - # Position the reader, if necessary. - - if info_offset is not None: + "An iterator over terms and unprocessed document positions data." - # Reset the term and offset for the new page. - - self.info_reader.go_to_term("", 0, info_offset) - - # Skip the term in the information file. - - self.info_reader.read_term() - - return found_term, offset, frequency, doc_frequency - - # Query methods. - - def get_terms(self): - - "Return a list of all terms." - - return iter(self) + def __iter__(self): + return self - def find_terms(self, term): - - "Return all terms whose values start with the value of 'term'." - - terms = [] - - found_term, offset, frequency, doc_frequency = self.go_to_term(term) - - # Read and record terms. - + def next(self): try: - # Add the found term if it starts with the specified term. - - while found_term.startswith(term): - terms.append(found_term) - found_term, offset, frequency, doc_frequency = self._read_term() - + self.begin_record() + return self.read_term_plus_remaining() except EOFError: - pass - - return terms - - def find_positions(self, term): - - "Return the documents and positions at which the given 'term' is found." - - t = self._find_term(term) - if t is None: - return [] - else: - offset, frequency, doc_frequency = t - return self._get_positions(offset, doc_frequency) - - def find_common_positions(self, terms): - - """ - Return the documents and positions at which all the given 'terms' are - found, where only common documents are returned. - """ - - return PhraseIterator([self.find_positions(term) for term in terms]) - - def get_frequency(self, term): - - "Return the frequency of the given 'term'." 
-
-        t = self._find_term(term)
-        if t is None:
-            return None
-        else:
-            offset, frequency, doc_frequency = t
-            return frequency
-
-    def get_document_frequency(self, term):
-
-        "Return the document frequency of the given 'term'."
-
-        t = self._find_term(term)
-        if t is None:
-            return None
-        else:
-            offset, frequency, doc_frequency = t
-            return doc_frequency
-
-    def close(self):
-        self.info_reader.close()
-        self.index_reader.close()
-        self.position_dict_reader.close()
+            raise StopIteration
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r b75bd39cf61f -r 6542c54d115b itermerge.py
--- a/itermerge.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/itermerge.py	Sun Feb 13 02:49:55 2011 +0100
@@ -3,7 +3,7 @@
 """
 An iterator merging class similar to heapq.merge in Python 2.6.
 
-Copyright (C) 2009 Paul Boddie
+Copyright (C) 2009, 2011 Paul Boddie
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
diff -r b75bd39cf61f -r 6542c54d115b test.py
--- a/test.py	Sat Feb 12 01:23:58 2011 +0100
+++ b/test.py	Sun Feb 13 02:49:55 2011 +0100
@@ -1,22 +1,21 @@
 #!/usr/bin/env python
+# encoding: iso-8859-1
 
 from iixr.files import *
-from iixr.fields import *
 from iixr.terms import *
-from iixr.positions import *
 from iixr.index import *
 
 import os, sys
 
 # Remove old test files.
 
-for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"):
+for filename in ("test", "testMS", "testNMS", "testP", "testP2"):
     try:
         os.remove(filename)
     except OSError:
         pass
 
 try:
-    for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"):
+    for dirname in ("test_index",):
         for filename in os.listdir(dirname):
             os.remove(os.path.join(dirname, filename))
         os.rmdir(dirname)
@@ -98,22 +97,20 @@
     ]
 
 f = open("testP", "wb")
-w = PositionWriter(f)
+w = TermWriter(f)
 w.begin(0, 0)
 for doc_positions in all_doc_positions:
-    w.reset()
-    for docnum, positions in doc_positions:
-        w.write_positions(docnum, positions)
+    w.write_positions(doc_positions)
+    w.end_record()
 w.close()
 
 f = open("testP", "rb")
-r = PositionReader(f)
+r = TermReader(f)
 for doc_positions in all_doc_positions:
-    r.reset()
-    for docnum, positions in doc_positions:
-        d, p = r.read_positions()
-        print docnum == d, docnum, d
-        print positions == p, positions, p
+    r.begin_record()
+    dp = r.read_positions()
+    print doc_positions == dp, doc_positions
+    print "    ", dp
 r.close()
 
 all_doc_positions_seq = [
@@ -131,350 +128,56 @@
     ]
 
 f = open("testP2", "wb")
-w = PositionWriter(f)
+w = TermWriter(f)
 w.begin(2, 2)
 for doc_positions in all_doc_positions_seq:
-    w.reset()
-    for docnum, positions in doc_positions:
-        w.write_positions(docnum, positions)
+    w.write_positions(doc_positions)
+    w.end_record()
 w.close()
 
 f = open("testP2", "rb")
-r = PositionReader(f)
+r = TermReader(f)
 for doc_positions in all_doc_positions_seq:
-    r.reset()
-    for docnum, positions in doc_positions:
-        d, p = r.read_positions()
-        print docnum == d, docnum, d
-        print positions == p, positions, p
-r.close()
-
-print "- Test position index files."
-
-indexed_positions = [
-    [
-        (1234, 0, 100),
-        (2345, 700, 100),
-        (3456, 1900, 50)
-        ],
-    [
-        (4567, 2800, 20)
-        ]
-    ]
-
-offsets = []
-f = open("testPI", "wb")
-w = PositionIndexWriter(f)
-w.begin(0)
-for term_positions in indexed_positions:
-    offset = None
-    doc_frequency = 0
-    w.reset()
-    for docnum, pos_offset, count in term_positions:
-        if offset is None:
-            offset = w.tell()
-        w.write_positions(docnum, pos_offset, count)
-        doc_frequency += count
-    offsets.append((offset, doc_frequency))
-w.close()
-
-r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb")))
-offsets.reverse()
-indexed_positions.reverse()
-for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
-    r.seek(offset, doc_frequency)
-    for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r):
-        print docnum == dn, docnum, dn
-        print pos_offset == po, pos_offset, po
-        print count == c, count, c
-r.reader.close()
-
-print "- Test position dictionaries."
-
-f = open("testP", "wb")
-w = PositionWriter(f)
-f2 = open("testPI", "wb")
-w2 = PositionIndexWriter(f2)
-wd = PositionDictionaryWriter(w, w2, 2)
-offsets = []
-for doc_positions in all_doc_positions:
-    offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
-    offsets.append((offset, doc_frequency))
-wd.close()
-
-r = PositionReader(open("testP", "rb"))
-r2 = PositionIndexReader(open("testPI", "rb"))
-rd = PositionDictionaryReader(r, r2)
-offsets.reverse()
-all_doc_positions.reverse()
-for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
-    it = rd.read_term_positions(offset, doc_frequency)
-    dp = list(it)
-    print doc_positions == dp, doc_positions, dp
-rd.close()
-
-print "- Test fields."
-
-doc_fields = [
-    (123, ["testing", "fields", "stored", "compressed"]),
-    (456, ["fields", "for a second", "document"]),
-    (789, ["field value"]),
-    (1234, []),
-    (2345, ["abc", "def"]),
-    (3456, ["apple", "banana", "cherry"]),
-    (4567, ["drue", "eple"])
-    ]
-
-f = open("testF", "wb")
-w = FieldWriter(f)
-w.begin(0)
-w.reset()
-for docnum, fields in doc_fields:
-    w.write_fields(docnum, list(enumerate(fields)))
-w.close()
-
-f = open("testF", "rb")
-r = FieldReader(f)
-r.reset()
-for docnum, fields in doc_fields:
-    dn, df = r.read_fields()
-    print docnum == dn, docnum, dn
-    print list(enumerate(fields)) == df, list(enumerate(fields)), df
-r.close()
-
-print "- Test field index files."
-
-indexed_docs = [
-    (123, 100000987),
-    (456, 100004321),
-    (789, 100008765)
-    ]
-
-f = open("testFI", "wb")
-w = FieldIndexWriter(f)
-w.begin(0)
-w.reset()
-for docnum, offset in indexed_docs:
-    w.write_document(docnum, offset)
-w.close()
-
-f = open("testFI", "rb")
-r = FieldIndexReader(f)
-r.reset()
-for docnum, offset in indexed_docs:
-    dn, o = r.read_document()
-    print docnum == dn, docnum, dn
-    print offset == o, offset, o
+    r.begin_record()
+    dp = r.read_positions()
+    print doc_positions == dp, doc_positions
+    print "    ", dp
 r.close()
-
-print "- Test field dictionaries."
- -f = open("testF", "wb") -w = FieldWriter(f) -f2 = open("testFI", "wb") -w2 = FieldIndexWriter(f2) -wd = FieldDictionaryWriter(w, w2, 3) -for docnum, fields in doc_fields: - wd.write_fields(docnum, list(enumerate(fields))) -wd.close() - -f = open("testF", "rb") -r = FieldReader(f) -f2 = open("testFI", "rb") -r2 = FieldIndexReader(f2) -rd = FieldDictionaryReader(r, r2) -doc_fields_reversed = doc_fields[:] -doc_fields_reversed.reverse() -for docnum, fields in doc_fields_reversed: - df = dict(rd.get_fields(docnum)) - print dict(enumerate(fields)) == df, dict(enumerate(fields)), df -for docnum in (13579, 246810): - df = rd.get_fields(docnum) - print df is None, df - -print "- (Test sequential access.)" - -rd.rewind() -for docnum, fields in doc_fields: - dn, df = rd.read_fields() - print docnum == dn, docnum, dn - print list(enumerate(fields)) == df, list(enumerate(fields)), df -rd.close() - -print "- Test terms." - -terms = [ - # term offset frequency doc_frequency - ("aardvark", 100000123, 1, 1), - ("anteater", 100000456, 2, 1), - ("badger", 100000789, 13, 7), - ("bull", 1000001234, 59, 17), - ("bulldog", 1000002345, 99, 80), - ("cat", 1000003456, 89, 28) - ] - -f = open("test", "wb") -w = TermWriter(f) -w.reset() -for term, offset, frequency, doc_frequency in terms: - w.write_term(term, offset, frequency, doc_frequency) -w.close() - -f = open("test", "rb") -r = TermReader(f) -r.reset() -for term, offset, frequency, doc_frequency in terms: - t, o, fr, df = r.read_term() - print term == t, term, t - print offset == o, offset, o - print frequency == fr, frequency, fr - print doc_frequency == df, doc_frequency, df -r.close() - -print "- Test terms in index files." - -indexed_terms = [ - # term offset frequency doc_frequency info_offset - ("aardvark", 100000123, 1, 1, 200000321), - ("anteater", 100000456, 2, 1, 200000654), - ("badger", 100000789, 13, 7, 200000987), - ("bull", 1000001234, 59, 17, 200004321), - ("bulldog", 1000002345, 99, 80, 200005432), - ("cat", 1000003456, 89, 28, 200006543) - ] - -f = open("test", "wb") -w = TermIndexWriter(f) -w.reset() -for term, offset, frequency, doc_frequency, info_offset in indexed_terms: - w.write_term(term, offset, frequency, doc_frequency, info_offset) -w.close() - -f = open("test", "rb") -r = TermIndexReader(f) -r.reset() -for term, offset, frequency, doc_frequency, info_offset in indexed_terms: - t, o, fr, df, i = r.read_term() - print term == t, term, t - print offset == o, offset, o - print frequency == fr, frequency, fr - print doc_frequency == df, doc_frequency, df - print info_offset == i, info_offset, i -r.close() - -print "- Test dictionaries with only term data." 
- -f = open("test", "wb") -w = TermWriter(f) -f2 = open("testI", "wb") -w2 = TermIndexWriter(f2) -f3 = open("testP", "wb") -w3 = PositionWriter(f3) -f4 = open("testPI", "wb") -w4 = PositionIndexWriter(f4) -wp = PositionDictionaryWriter(w3, w4, 2) -wd = TermDictionaryWriter(w, w2, wp, 3) -for term, offset, frequency, doc_frequency in terms: - wd._write_term(term, offset, frequency, doc_frequency) -wd.close() - -f = open("test", "rb") -r = TermReader(f) -f2 = open("testI", "rb") -r2 = TermIndexReader(f2) -r3 = PositionReader(open("testP", "rb")) -r4 = PositionIndexReader(open("testPI", "rb")) -rp = PositionDictionaryReader(r3, r4) -rd = TermDictionaryReader(r, r2, rp) -terms_reversed = terms[:] -terms_reversed.reverse() -for term, offset, frequency, doc_frequency in terms_reversed: - o, fr, df = rd._find_term(term) - print offset == o, offset, o - print frequency == fr, frequency, fr - print doc_frequency == df, doc_frequency, df -for term in ("dog", "dingo"): - t = rd._find_term(term) - print t is None, t - -print "- (Test term prefix searching.)" - -print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"] -print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"] -print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"] -print rd.find_terms("d") == [], rd.find_terms("d"), [] -rd.close() - print "- Test dictionaries with term and position data." terms_with_positions = [ ("aardvark", [(1, [2, 45, 96]), (20, [13])]), ("anteater", [(1, [43, 44])]), ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), + (u"bjørn", [(11, [19, 54])]), ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]), ("bulldog", [(43, [17, 19, 256, 512])]), - ("cat", [(123, [12, 145, 196]), (1200, [113])]) - ] - -position_dict_tests = [ - ("badger", 19, [55, 1333]), - ("badger", 20, None), - ("bull", 6, [128]), - ("bull", 26, [1, 3, 5, 7, 9]), - ("cat", 111, None), - ("cat", 123, [12, 145, 196]), - ("cat", 1234, None) + ("cat", [(123, [12, 145, 196]), (1200, [113])]), + (u"å", [(15, [384])]), ] f = open("test", "wb") w = TermWriter(f) -f2 = open("testI", "wb") -w2 = TermIndexWriter(f2) -f3 = open("testP", "wb") -w3 = PositionWriter(f3) -f4 = open("testPI", "wb") -w4 = PositionIndexWriter(f4) -wp = PositionDictionaryWriter(w3, w4, 2) -wd = TermDictionaryWriter(w, w2, wp, 3) -for term, doc_positions in terms_with_positions: - wd.write_term_positions(term, doc_positions) -wd.close() +w.begin(0, 0) +w.write_terms(terms_with_positions) +w.close() f = open("test", "rb") -r = TermReader(f) -f2 = open("testI", "rb") -r2 = TermIndexReader(f2) -r3 = PositionReader(open("testP", "rb")) -r4 = PositionIndexReader(open("testPI", "rb")) -rp = PositionDictionaryReader(r3, r4) -rd = TermDictionaryReader(r, r2, rp) -terms_reversed = terms_with_positions[:] -terms_reversed.reverse() -for term, doc_positions in terms_reversed: - dp = list(rd.find_positions(term)) - print doc_positions == dp, doc_positions, dp -for term in ("aaa", "dog", "dingo"): - dp = rd.find_positions(term) - print dp == [], dp +r = TermIterator(f) +for (term, doc_positions), (t, dp) in zip(terms_with_positions, r): + print term == t, term, t + print doc_positions == dp, doc_positions + print " ", dp +r.close() -print "- (Test iterators.)" - -for term, docnum, positions in position_dict_tests: - dp = rd.find_positions(term) - pos = dp.from_document(docnum) - print positions is None and pos is None or pos is not None and positions == list(pos), positions, 
-
-print "- (Test sequential access.)"
+f = open("test", "rb")
+r = TermDataIterator(f)
+for (term, doc_positions), (t, data) in zip(terms_with_positions, r):
+    print term == t, term, t, data
+r.close()
 
-rd.rewind()
-for term, doc_positions in terms_with_positions:
-    t, fr, df, dp = rd.read_term()
-    dp = list(dp)
-    print term == t, term, t
-    print doc_positions == dp, doc_positions, dp
-rd.close()
-
-print "- Test high-level index operations (including merging)."
+print "- Test high-level index operations."
 
 docs = [
     (1, "The cat sat on the mat"),
@@ -485,189 +188,26 @@
     (36, "She sells sea shells on the sea shore")
     ]
 
-doc_tests = [
-    ("Every", 2, [(2, [0]), (14, [0])]),
-    ("good", 2, [(2, [1]), (13, [1])]),
-    ("deserves", 2, [(2, [3]), (13, [3])]),
-    ("sea", 2, [(36, [2, 6])])
-    ]
-
-position_tests = [
-    ("Every", 14, [0]),
-    ("sea", 36, [2, 6]),
-    ("shells", 1, None),
-    ("shells", 37, None)
-    ]
-
-phrase_tests = [
-    (["good", "boy"], [(2, [1, 2])]),
-    (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
-    (["sea", "shore"], [(36, [6, 7])])
-    ]
-
-index = Index("test_index", 3, 2, 3, 6)
+index = Index("test_index", 3)
 wi = index.get_writer()
 for docnum, text in docs:
     doc = Document(docnum)
     for position, term in enumerate(text.split()):
         doc.add_position(term, position)
-    doc.add_field(123, text)
-    wi.add_document(doc)
-wi.close()
-
-rd = index.get_reader()
-
-print "- (Test searching.)"
-
-for term, frequency, doc_positions in doc_tests:
-    dp = list(rd.find_positions(term))
-    print doc_positions == dp, doc_positions, dp
-    fr = rd.get_frequency(term)
-    print frequency == fr, frequency, fr
-
-print "- (Test fields.)"
-
-for docnum, text in docs:
-    df = dict(rd.get_fields(docnum))
-    print df[123] == text, text, df[123]
-
-print "- (Test navigation.)"
-
-for term, docnum, positions in position_tests:
-    dp = rd.find_positions(term)
-    pos = dp.from_document(docnum)
-    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
-
-print "- (Test phrases.)"
-
-for terms, results in phrase_tests:
-    res = list(rd.find_common_positions(terms))
-    print results == res, results, res
-
-index.close()
-
-docs2 = [
-    ((1, 0), "The cat sat on the mat"),
-    ((1, 2), "Every good boy deserves football"),
-    ((13, 1), "One good turn deserves another"),
-    ((14, 0), "Every man for himself"),
-    ((14, 25), "Red sky at night shepherd's delight"),
-    ((36, 12), "She sells sea shells on the sea shore")
-    ]
-
-doc_tests2 = [
-    ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),
-    ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),
-    ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),
-    ("sea", 2, [((36, 12), [(2, 10), (6, 28)])])
-    ]
-
-position_tests2 = [
-    ("Every", (14, 0), [(0, 0)]),
-    ("sea", (36, 12), [(2, 10), (6, 28)]),
-    ("shells", (1, 0), None),
-    ("shells", (37, 0), None)
-    ]
-
-phrase_tests2 = [
-    (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),
-    (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),
-    (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])
-    ]
-
-index = Index("test_indexT", 3, 2, 3, 6)
-wi = index.get_writer()
-for docnum, text in docs2:
-    doc = Document(docnum)
-    offset = 0
-    for position, term in enumerate(text.split()):
-        doc.add_position(term, (position, offset))
-        offset += len(term) + 1 # assume one space after the term
-    doc.add_field(123, text)
     wi.add_document(doc)
 wi.close()
 
-rd = index.get_reader()
-
-print "- (Test searching.)"
-
-for term, frequency, doc_positions in doc_tests2:
-    dp = list(rd.find_positions(term))
-    print doc_positions == dp, doc_positions, dp
-    fr = rd.get_frequency(term)
-    print frequency == fr, frequency, fr
-
-print "- (Test fields.)"
+print "- Test merge."
 
-for docnum, text in docs2:
-    df = dict(rd.get_fields(docnum))
-    print df[123] == text, text, df[123]
-
-print "- (Test navigation.)"
+l1 = list(index.get_reader())
+index.merge()
+l2 = list(index.get_reader(1))
 
-for term, docnum, positions in position_tests2:
-    dp = rd.find_positions(term)
-    pos = dp.from_document(docnum)
-    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
-
-print "- (Test phrases.)"
-
-for terms, results in phrase_tests2:
-    res = list(rd.find_common_positions(terms))
-    print results == res, results, res
+for (t1, dp1), (t2, dp2) in zip(l1, l2):
+    print t1 == t2, t1, t2
+    print dp1 == dp2, dp1
+    print "    ", dp2
 
 index.close()
 
-print "- Test index updates."
-
-index = Index("test_index")
-index2 = Index("test_index2", 3, 2, 3, 6)
-wi = index2.get_writer()
-for docnum, text in docs:
-
-    # Add the same documents but with different numbers.
-
-    doc = Document(docnum + 100)
-    for position, term in enumerate(text.split()):
-        doc.add_position(term, position)
-    doc.add_field(123, text)
-    wi.add_document(doc)
-wi.close()
-
-index2.update([index])
-index.close()
-
-rd = index2.get_reader()
-for term, frequency, doc_positions in doc_tests:
-
-    # Add the extra documents to the expected result.
-
-    orig_doc_positions = doc_positions
-    doc_positions = doc_positions[:]
-
-    for docnum, positions in orig_doc_positions:
-        doc_positions.append((docnum + 100, positions))
-    frequency *= 2
-
-    dp = list(rd.find_positions(term))
-    print doc_positions == dp, doc_positions, dp
-    fr = rd.get_frequency(term)
-    print frequency == fr, frequency, fr
-index2.close()
-
-print "- (Test update of an empty index.)"
-
-index = Index("test_index")
-index3 = Index("test_index3")
-index3.update([index])
-index.close()
-
-rd = index3.get_reader()
-for term, frequency, doc_positions in doc_tests:
-    dp = list(rd.find_positions(term))
-    print doc_positions == dp, doc_positions, dp
-    fr = rd.get_frequency(term)
-    print frequency == fr, frequency, fr
-index3.close()
-
 # vim: tabstop=4 expandtab shiftwidth=4
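As an aside, the prefix compression performed by write_term_only and read_term_only can be shown with a minimal standalone sketch (the helper names below are invented for illustration and do not appear in the patched modules): each term is stored as the length of the prefix it shares with the previous term plus the remaining suffix, which is why out-of-order terms are rejected.

def common_prefix_length(a, b):
    "Return the number of leading characters shared by 'a' and 'b'."
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n

def encode_terms(terms):
    "Encode sorted 'terms' as (common, suffix) pairs."
    last = ""
    output = []
    for term in terms:
        common = common_prefix_length(term, last)
        output.append((common, term[common:]))
        last = term
    return output

def decode_terms(pairs):
    "Rebuild the original terms from (common, suffix) pairs."
    last = ""
    terms = []
    for common, suffix in pairs:
        last = last[:common] + suffix
        terms.append(last)
    return terms

terms = ["aardvark", "anteater", "badger", "bull", "bulldog", "cat"]
pairs = encode_terms(terms)
print pairs
print decode_terms(pairs) == terms, decode_terms(pairs)

Here, ("bulldog" following "bull") encodes as (4, "dog"), so long runs of related terms cost only their differing suffixes.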
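Likewise, a sketch of the document number handling in write_positions and read_positions, assuming plain integer document numbers (the sequence case goes through what get_adder and get_subtractor select instead): the first number in a record is stored as it is, and each later one as the difference from its predecessor, with out-of-order numbers rejected just as in the writer above.

def encode_docnums(docnums):
    "Encode ascending 'docnums' as deltas, keeping the first value as it is."
    last = None
    output = []
    for docnum in docnums:
        if last is None:
            output.append(docnum)
        elif docnum < last:
            raise ValueError, "Document number %r is less than previous number %r." % (docnum, last)
        else:
            output.append(docnum - last)
        last = docnum
    return output

def decode_docnums(values):
    "Rebuild the original document numbers from delta 'values'."
    last = None
    docnums = []
    for value in values:
        if last is None:
            last = value
        else:
            last = last + value
        docnums.append(last)
    return docnums

docnums = [1, 20, 43, 1200]
print encode_docnums(docnums)
print decode_docnums(encode_docnums(docnums)) == docnums

Deltas keep the stored integers small, which matters because the variable-length integer encoding uses fewer bytes for smaller values.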