# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1252195260 -7200
# Node ID 37a06a948a52c71940b0b9bb2ce7b707582eca8d
# Parent  ffec42ec943a2286a935bc9a5fe95563cf978b4c
Added term retrieval by prefix.
Added a conversion of field values to Unicode in the add_field method.

diff -r ffec42ec943a -r 37a06a948a52 iixr.py
--- a/iixr.py	Sat Sep 05 18:10:50 2009 +0200
+++ b/iixr.py	Sun Sep 06 02:01:00 2009 +0200
@@ -735,7 +735,7 @@
 
         """
         Read a term, its position file offset, its frequency and its document
-        frequence from the term information file.
+        frequency from the term information file.
         """
 
         # Read the prefix length and term suffix.
@@ -879,27 +879,47 @@
 
         self.max_offset = self.terms[-1][1] + 1
 
-    def _find_term(self, term):
+    def _find_closest_entry(self, term):
 
         """
-        Find the position file offset and frequency of 'term' from the term
-        dictionary.
+        Find the index entry for 'term' in the term dictionary or, failing that,
+        the nearest entry preceding it (or the first entry if none precedes it).
+
+        Return the closest index entry consisting of a term, the position file
+        offset, the term frequency, the document frequency, and the term details
+        file offset.
         """
 
         i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
 
         # Get the entry position providing the term or one preceding it.
+        # If no entry precedes the requested term, return the very first entry
+        # as the closest.
 
         if i == -1:
-            return None
-
-        found_term, offset, frequency, doc_frequency, info_offset = self.terms[i]
+            return self.terms[0]
+        else:
+            return self.terms[i]
+
+    def _find_closest_term(self, term):
+
+        """
+        Find the offsets and frequencies of 'term' from the term dictionary or
+        the closest term starting with the value of 'term'.
+
+        Return the closest term (or the term itself), the position file offset,
+        the term frequency, the document frequency, and the term details file
+        offset (or None if the reader is already positioned).
+        """
+
+        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
 
         # Where the term is found immediately, return the offset and
-        # frequencies.
-
-        if term == found_term:
-            return offset, frequency, doc_frequency
+        # frequencies. If the term does not appear, return the details of the
+        # closest entry.
+
+        if term <= found_term:
+            return found_term, offset, frequency, doc_frequency, info_offset
 
         # Otherwise, seek past the index term's entry in the information file
         # and scan for the desired term.
@@ -912,19 +932,32 @@
             except EOFError:
                 pass
 
-            # If the term is found, return the offset and frequencies.
-
-            if term == found_term:
-                return offset, frequency, doc_frequency
-            else:
-                return None
+            return found_term, offset, frequency, doc_frequency, None
+
+    def _find_term(self, term):
+
+        """
+        Find the position file offset and frequency of 'term' from the term
+        dictionary.
+        """
+
+        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
+
+        # If the term is found, return the offset and frequencies.
+
+        if term == found_term:
+            return offset, frequency, doc_frequency
+        else:
+            return None
+
+    def _get_positions(self, offset, doc_frequency):
+        return self.position_dict_reader.read_term_positions(offset, doc_frequency)
+
+    # Sequential access methods.
 
     def rewind(self):
         self.info_reader.rewind()
 
-    def _get_positions(self, offset, doc_frequency):
-        return self.position_dict_reader.read_term_positions(offset, doc_frequency)
-
     def read_term(self):
 
         """
@@ -936,6 +969,35 @@
         positions = self._get_positions(offset, doc_frequency)
         return term, frequency, doc_frequency, positions
 
+    # Query methods.
+
+    def find_terms(self, term):
+
+        "Return all terms whose values start with the value of 'term'."
+
+        terms = []
+
+        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
+
+        # Position the reader, if necessary.
+
+        if info_offset is not None:
+            self.info_reader.go_to_term(found_term, offset, info_offset)
+
+        # Read and record terms.
+
+        try:
+            # Add the found term if it starts with the specified term.
+
+            while found_term.startswith(term):
+                terms.append(found_term)
+                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
+
+        except EOFError:
+            pass
+
+        return terms
+
     def find_positions(self, term):
 
         "Return the documents and positions at which the given 'term' is found."
@@ -1504,7 +1566,7 @@
         else:
             doc_fields = self.docs[docnum]
 
-        doc_fields.append((identifier, value))
+        doc_fields.append((identifier, unicode(value))) # convert to Unicode
 
         self.field_counter += 1
         if self.flush_interval and self.field_counter >= self.flush_interval:
diff -r ffec42ec943a -r 37a06a948a52 test.py
--- a/test.py	Sat Sep 05 18:10:50 2009 +0200
+++ b/test.py	Sun Sep 06 02:01:00 2009 +0200
@@ -201,8 +201,8 @@
 doc_fields_reversed = doc_fields[:]
 doc_fields_reversed.reverse()
 for docnum, fields in doc_fields_reversed:
-    df = rd.get_fields(docnum)
-    print list(enumerate(fields)) == df, list(enumerate(fields)), df
+    df = dict(rd.get_fields(docnum))
+    print dict(enumerate(fields)) == df, dict(enumerate(fields)), df
 for docnum in (13579, 246810):
     df = rd.get_fields(docnum)
     print df is None, df
@@ -309,6 +309,13 @@
 for term in ("dog", "dingo"):
     t = rd._find_term(term)
     print t is None, t
+
+# (Test term prefix searching.)
+
+print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]
+print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]
+print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]
+print rd.find_terms("d") == [], rd.find_terms("d"), []
 rd.close()
 
 # Test dictionaries with term and position data.
@@ -361,7 +368,7 @@
 for term, doc_positions in terms_reversed:
     dp = list(rd.find_positions(term))
     print doc_positions == dp, doc_positions, dp
-for term in ("dog", "dingo"):
+for term in ("aaa", "dog", "dingo"):
     dp = rd.find_positions(term)
     print dp is None, dp
 
@@ -422,8 +429,8 @@
     fr = rd.get_frequency(term)
     print frequency == fr, frequency, fr
 for docnum, text in docs:
-    df = rd.get_fields(docnum)
-    print (123, text) == df[0], (123, text), df[0]
+    df = dict(rd.get_fields(docnum))
+    print df[123] == text, text, df[123]
 for term, docnum, positions in position_tests:
     dp = rd.find_positions(term)
     pos = dp.from_document(docnum)