# HG changeset patch # User Paul Boddie # Date 1251932946 -7200 # Node ID af39faebc7346d77b29a0b18f3861b9d20159dfb # Parent e8e80bfe5b5d8295fd2cabfca50deda93ac5c121 Added navigation to specific documents in the position dictionary iterator. Fixed merging to preserve existing merged partitions. diff -r e8e80bfe5b5d -r af39faebc734 iixr.py --- a/iixr.py Wed Sep 02 22:25:29 2009 +0200 +++ b/iixr.py Thu Sep 03 01:09:06 2009 +0200 @@ -27,6 +27,11 @@ from bisect import insort_right # to maintain a sorted list of data for merging import bz2, zlib # for field compression +try: + set +except NameError: + from sets import Set as set + # Constants. TERM_INTERVAL = 100 @@ -285,42 +290,6 @@ f.seek(offset) return PositionIterator(f, count) -class IteratorBase: - - def __init__(self, count): - self.replenish(count) - - def replenish(self, count): - self.count = count - self.read_documents = 0 - - def __len__(self): - return self.count - - def sort(self): - pass # Stored document positions are already sorted. - - def __iter__(self): - return self - -class PositionIterator(PositionReader, IteratorBase): - - "Iterating over document positions." - - def __init__(self, f, count): - PositionReader.__init__(self, f) - IteratorBase.__init__(self, count) - - def next(self): - - "Read positions for a single document." - - if self.read_documents < self.count: - self.read_documents += 1 - return self.read_positions() - else: - raise StopIteration - class PositionIndexWriter(FileWriter): "Writing position index information to files." @@ -403,6 +372,44 @@ f.seek(offset) return PositionIndexIterator(f, doc_frequency) +# Iterators for position-related files. + +class IteratorBase: + + def __init__(self, count): + self.replenish(count) + + def replenish(self, count): + self.count = count + self.read_documents = 0 + + def __len__(self): + return self.count + + def sort(self): + pass # Stored document positions are already sorted. + + def __iter__(self): + return self + +class PositionIterator(PositionReader, IteratorBase): + + "Iterating over document positions." + + def __init__(self, f, count): + PositionReader.__init__(self, f) + IteratorBase.__init__(self, count) + + def next(self): + + "Read positions for a single document." + + if self.read_documents < self.count: + self.read_documents += 1 + return self.read_positions() + else: + raise StopIteration + class PositionIndexIterator(PositionIndexReader, IteratorBase): "Iterating over document positions." @@ -533,10 +540,16 @@ def __init__(self, position_reader, position_index_reader, offset, doc_frequency): self.position_reader = position_reader self.doc_frequency = doc_frequency + self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency) - self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency) - self.next_section() - self.init_section() + # Maintain state for the next index entry, if read. + + self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None + + # Initialise the current index entry and current position file iterator. + + self._next_section() + self._init_section() def __len__(self): return self.doc_frequency @@ -549,7 +562,10 @@ def next(self): - # Attempt to get the next document record from the section in the positions file. + """ + Attempt to get the next document record from the section in the + positions file. + """ while 1: @@ -566,13 +582,69 @@ # reading using the same file iterator (since the data should # just follow on from the last section). - self.next_section() + self._next_section() self.iterator.replenish(self.section_count) - def next_section(self): - self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() + def __getitem__(self, docnum): + + """ + Attempt to navigate to a positions entry for the given 'docnum', + returning the positions, if present, or None otherwise. + """ + + # Read ahead in the index until the next entry refers to a document + # later than the desired document. + + try: + if self.next_docnum is None: + self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() + + while self.next_docnum < docnum: + self._next_read_section() + self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() + + except StopIteration: + pass + + # Navigate in the position file to the document. + + self._init_section() - def init_section(self): + try: + while 1: + found_docnum, positions = self.iterator.next() + if docnum == found_docnum: + return positions + elif docnum < found_docnum: + return None + except StopIteration: + return None + + # Internal methods. + + def _next_section(self): + + "Attempt to get the next section in the index." + + if self.next_docnum is None: + self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() + else: + self._next_read_section() + self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None + + def _next_read_section(self): + + """ + Make the next index entry the current one without reading from the + index. + """ + + self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count + + def _init_section(self): + + "Initialise the iterator for the section in the position file." + self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count) class TermWriter(FileWriter): @@ -1534,17 +1606,22 @@ """ readers = [] - partitions = [] + partitions = set() for filename in listdir(self.pathname): if filename.startswith("terms-"): # 6 character prefix partition = filename[6:] readers.append(get_term_reader(self.pathname, partition)) - partitions.append(partition) + partitions.add(partition) # Write directly to a dictionary. if len(readers) > 1: + if "merged" in partitions: + rename_term_files(self.pathname, "merged", "old-merged") + partitions.remove("merged") + partitions.add("old-merged") + writer = get_term_writer(self.pathname, "merged", interval, doc_interval) merger = TermDictionaryMerger(writer, readers) merger.merge() @@ -1555,25 +1632,32 @@ for partition in partitions: remove_term_files(self.pathname, partition) - elif len(readers) == 1 and partitions[0] != "merged": - rename_term_files(self.pathname, partitions[0], "merged") + elif len(readers) == 1: + partition = list(partitions)[0] + if partition != "merged": + rename_term_files(self.pathname, partition, "merged") def merge_fields(self, interval=FIELD_INTERVAL): "Merge field dictionaries using the given indexing 'interval'." readers = [] - partitions = [] + partitions = set() for filename in listdir(self.pathname): if filename.startswith("fields-"): # 7 character prefix partition = filename[7:] readers.append(get_field_reader(self.pathname, partition)) - partitions.append(partition) + partitions.add(partition) # Write directly to a dictionary. if len(readers) > 1: + if "merged" in partitions: + rename_field_files(self.pathname, "merged", "old-merged") + partitions.remove("merged") + partitions.add("old-merged") + writer = get_field_writer(self.pathname, "merged", interval) merger = FieldDictionaryMerger(writer, readers) merger.merge() @@ -1584,8 +1668,10 @@ for partition in partitions: remove_field_files(self.pathname, partition) - elif len(readers) == 1 and partitions[0] != "merged": - rename_field_files(self.pathname, partitions[0], "merged") + elif len(readers) == 1: + partition = list(partitions)[0] + if partition != "merged": + rename_field_files(self.pathname, partition, "merged") def close(self): if self.reader is not None: diff -r e8e80bfe5b5d -r af39faebc734 test.py --- a/test.py Wed Sep 02 22:25:29 2009 +0200 +++ b/test.py Thu Sep 03 01:09:06 2009 +0200 @@ -383,6 +383,12 @@ ("sea", 2, [(36, [2, 6])]) ] +position_tests = [ + ("Every", 14, [0]), + ("sea", 36, [2, 6]), + ("shells", 1, None) + ] + index = iixr.Index("test_index") wi = index.get_writer(3, 2, 6) for docnum, text in docs: @@ -400,6 +406,10 @@ for docnum, text in docs: df = rd.get_fields(docnum) print (123, text) == df[0], (123, text), df[0] +for term, docnum, positions in position_tests: + dp = rd.find_positions(term) + pos = dp[docnum] + print positions is None and positions is pos or positions == list(pos), positions, pos index.close() # vim: tabstop=4 expandtab shiftwidth=4