# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1251932946 -7200
# Node ID af39faebc7346d77b29a0b18f3861b9d20159dfb
# Parent  e8e80bfe5b5d8295fd2cabfca50deda93ac5c121
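Added navigation to specific documents in the position dictionary iterator
(via __getitem__, reading ahead in the position index to locate the section).
Fixed merging to preserve existing merged partitions: an existing "merged"
partition is renamed to "old-merged" before the new "merged" partition is
written, and is removed along with the other source partitions afterwards.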

diff -r e8e80bfe5b5d -r af39faebc734 iixr.py
--- a/iixr.py	Wed Sep 02 22:25:29 2009 +0200
+++ b/iixr.py	Thu Sep 03 01:09:06 2009 +0200
@@ -27,6 +27,11 @@
 from bisect import insort_right  # to maintain a sorted list of data for merging
 import bz2, zlib                 # for field compression
 
+try:
+    set
+except NameError:
+    from sets import Set as set
+
 # Constants.
 
 TERM_INTERVAL     = 100
@@ -285,42 +290,6 @@
         f.seek(offset)
         return PositionIterator(f, count)
 
-class IteratorBase:
-
-    def __init__(self, count):
-        self.replenish(count)
-
-    def replenish(self, count):
-        self.count = count
-        self.read_documents = 0
-
-    def __len__(self):
-        return self.count
-
-    def sort(self):
-        pass # Stored document positions are already sorted.
-
-    def __iter__(self):
-        return self
-
-class PositionIterator(PositionReader, IteratorBase):
-
-    "Iterating over document positions."
-
-    def __init__(self, f, count):
-        PositionReader.__init__(self, f)
-        IteratorBase.__init__(self, count)
-
-    def next(self):
-
-        "Read positions for a single document."
-
-        if self.read_documents < self.count:
-            self.read_documents += 1
-            return self.read_positions()
-        else:
-            raise StopIteration
-
 class PositionIndexWriter(FileWriter):
 
     "Writing position index information to files."
@@ -403,6 +372,44 @@
         f.seek(offset)
         return PositionIndexIterator(f, doc_frequency)
 
+# Iterators for position-related files.
+
+class IteratorBase:
+
+    def __init__(self, count):
+        self.replenish(count)
+
+    def replenish(self, count):
+        self.count = count
+        self.read_documents = 0
+
+    def __len__(self):
+        return self.count
+
+    def sort(self):
+        pass # Stored document positions are already sorted.
+
+    def __iter__(self):
+        return self
+
+class PositionIterator(PositionReader, IteratorBase):
+
+    "Iterating over document positions."
+
+    def __init__(self, f, count):
+        PositionReader.__init__(self, f)
+        IteratorBase.__init__(self, count)
+
+    def next(self):
+
+        "Read positions for a single document."
+
+        if self.read_documents < self.count:
+            self.read_documents += 1
+            return self.read_positions()
+        else:
+            raise StopIteration
+
 class PositionIndexIterator(PositionIndexReader, IteratorBase):
 
     "Iterating over document positions."
@@ -533,10 +540,16 @@
     def __init__(self, position_reader, position_index_reader, offset, doc_frequency):
         self.position_reader = position_reader
         self.doc_frequency = doc_frequency
+        self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
 
-        self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
-        self.next_section()
-        self.init_section()
+        # Maintain state for the next index entry, if read.
+
+        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
+
+        # Initialise the current index entry and current position file iterator.
+
+        self._next_section()
+        self._init_section()
 
     def __len__(self):
         return self.doc_frequency
@@ -549,7 +562,10 @@
 
     def next(self):
 
-        # Attempt to get the next document record from the section in the positions file.
+        """
+        Attempt to get the next document record from the section in the
+        positions file.
+        """
 
         while 1:
 
@@ -566,13 +582,69 @@
                 # reading using the same file iterator (since the data should
                 # just follow on from the last section).
 
-                self.next_section()
+                self._next_section()
                 self.iterator.replenish(self.section_count)
 
-    def next_section(self):
-        self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
+    def __getitem__(self, docnum):
+
+        """
+        Attempt to navigate to a positions entry for the given 'docnum',
+        returning the positions, if present, or None otherwise.
+        """
+
+        # Read ahead in the index until the next entry refers to a document
+        # later than the desired document.
+
+        try:
+            if self.next_docnum is None:
+                self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
+
+            while self.next_docnum < docnum:
+                self._next_read_section()
+                self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
+
+        except StopIteration:
+            pass
+
+        # Navigate in the position file to the document.
+
+        self._init_section()
 
-    def init_section(self):
+        try:
+            while 1:
+                found_docnum, positions = self.iterator.next()
+                if docnum == found_docnum:
+                    return positions
+                elif docnum < found_docnum:
+                    return None
+        except StopIteration:
+            return None
+
+    # Internal methods.
+
+    def _next_section(self):
+
+        "Attempt to get the next section in the index."
+
+        if self.next_docnum is None:
+            self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
+        else:
+            self._next_read_section()
+            self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
+
+    def _next_read_section(self):
+
+        """
+        Make the next index entry the current one without reading from the
+        index.
+        """
+
+        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
+
+    def _init_section(self):
+
+        "Initialise the iterator for the section in the position file."
+
         self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
 
 class TermWriter(FileWriter):
@@ -1534,17 +1606,22 @@
         """
 
         readers = []
-        partitions = []
+        partitions = set()
 
         for filename in listdir(self.pathname):
             if filename.startswith("terms-"): # 6 character prefix
                 partition = filename[6:]
                 readers.append(get_term_reader(self.pathname, partition))
-                partitions.append(partition)
+                partitions.add(partition)
 
         # Write directly to a dictionary.
 
         if len(readers) > 1:
+            if "merged" in partitions:
+                rename_term_files(self.pathname, "merged", "old-merged")
+                partitions.remove("merged")
+                partitions.add("old-merged")
+
             writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
             merger = TermDictionaryMerger(writer, readers)
             merger.merge()
@@ -1555,25 +1632,32 @@
             for partition in partitions:
                 remove_term_files(self.pathname, partition)
 
-        elif len(readers) == 1 and partitions[0] != "merged":
-            rename_term_files(self.pathname, partitions[0], "merged")
+        elif len(readers) == 1:
+            partition = list(partitions)[0]
+            if partition != "merged":
+                rename_term_files(self.pathname, partition, "merged")
 
     def merge_fields(self, interval=FIELD_INTERVAL):
 
         "Merge field dictionaries using the given indexing 'interval'."
 
         readers = []
-        partitions = []
+        partitions = set()
 
         for filename in listdir(self.pathname):
             if filename.startswith("fields-"): # 7 character prefix
                 partition = filename[7:]
                 readers.append(get_field_reader(self.pathname, partition))
-                partitions.append(partition)
+                partitions.add(partition)
 
         # Write directly to a dictionary.
 
         if len(readers) > 1:
+            if "merged" in partitions:
+                rename_field_files(self.pathname, "merged", "old-merged")
+                partitions.remove("merged")
+                partitions.add("old-merged")
+
             writer = get_field_writer(self.pathname, "merged", interval)
             merger = FieldDictionaryMerger(writer, readers)
             merger.merge()
@@ -1584,8 +1668,10 @@
             for partition in partitions:
                 remove_field_files(self.pathname, partition)
 
-        elif len(readers) == 1 and partitions[0] != "merged":
-            rename_field_files(self.pathname, partitions[0], "merged")
+        elif len(readers) == 1:
+            partition = list(partitions)[0]
+            if partition != "merged":
+                rename_field_files(self.pathname, partition, "merged")
 
     def close(self):
         if self.reader is not None:
diff -r e8e80bfe5b5d -r af39faebc734 test.py
--- a/test.py	Wed Sep 02 22:25:29 2009 +0200
+++ b/test.py	Thu Sep 03 01:09:06 2009 +0200
@@ -383,6 +383,12 @@
     ("sea", 2, [(36, [2, 6])])
     ]
 
+position_tests = [
+    ("Every", 14, [0]),
+    ("sea", 36, [2, 6]),
+    ("shells", 1, None)
+    ]
+
 index = iixr.Index("test_index")
 wi = index.get_writer(3, 2, 6)
 for docnum, text in docs:
@@ -400,6 +406,10 @@
 for docnum, text in docs:
     df = rd.get_fields(docnum)
     print (123, text) == df[0], (123, text), df[0]
+for term, docnum, positions in position_tests:
+    dp = rd.find_positions(term)
+    pos = dp[docnum]
+    print positions is None and positions is pos or positions == list(pos), positions, pos
 index.close()
 
 # vim: tabstop=4 expandtab shiftwidth=4