# HG changeset patch # User Paul Boddie # Date 1252195260 -7200 # Node ID 37a06a948a52c71940b0b9bb2ce7b707582eca8d # Parent ffec42ec943a2286a935bc9a5fe95563cf978b4c Added term retrieval by prefix. Added a conversion of field values to Unicode in the add_field method. diff -r ffec42ec943a -r 37a06a948a52 iixr.py --- a/iixr.py Sat Sep 05 18:10:50 2009 +0200 +++ b/iixr.py Sun Sep 06 02:01:00 2009 +0200 @@ -735,7 +735,7 @@ """ Read a term, its position file offset, its frequency and its document - frequence from the term information file. + frequency from the term information file. """ # Read the prefix length and term suffix. @@ -879,27 +879,47 @@ self.max_offset = self.terms[-1][1] + 1 - def _find_term(self, term): + def _find_closest_entry(self, term): """ - Find the position file offset and frequency of 'term' from the term - dictionary. + Find the offsets and frequencies of 'term' from the term dictionary or + the closest term starting with the value of 'term'. + + Return the closest index entry consisting of a term, the position file + offset, the term frequency, the document frequency, and the term details + file offset. """ i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1 # Get the entry position providing the term or one preceding it. + # If no entry precedes the requested term, return the very first entry + # as the closest. if i == -1: - return None - - found_term, offset, frequency, doc_frequency, info_offset = self.terms[i] + return self.terms[0] + else: + return self.terms[i] + + def _find_closest_term(self, term): + + """ + Find the offsets and frequencies of 'term' from the term dictionary or + the closest term starting with the value of 'term'. + + Return the closest term (or the term itself), the position file offset, + the term frequency, the document frequency, and the term details file + offset (or None if the reader is already positioned). + """ + + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term) # Where the term is found immediately, return the offset and - # frequencies. - - if term == found_term: - return offset, frequency, doc_frequency + # frequencies. If the term does not appear, return the details of the + # closest entry. + + if term <= found_term: + return found_term, offset, frequency, doc_frequency, info_offset # Otherwise, seek past the index term's entry in the information file # and scan for the desired term. @@ -912,19 +932,32 @@ except EOFError: pass - # If the term is found, return the offset and frequencies. - - if term == found_term: - return offset, frequency, doc_frequency - else: - return None + return found_term, offset, frequency, doc_frequency, None + + def _find_term(self, term): + + """ + Find the position file offset and frequency of 'term' from the term + dictionary. + """ + + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) + + # If the term is found, return the offset and frequencies. + + if term == found_term: + return offset, frequency, doc_frequency + else: + return None + + def _get_positions(self, offset, doc_frequency): + return self.position_dict_reader.read_term_positions(offset, doc_frequency) + + # Sequential access methods. def rewind(self): self.info_reader.rewind() - def _get_positions(self, offset, doc_frequency): - return self.position_dict_reader.read_term_positions(offset, doc_frequency) - def read_term(self): """ @@ -936,6 +969,35 @@ positions = self._get_positions(offset, doc_frequency) return term, frequency, doc_frequency, positions + # Query methods. + + def find_terms(self, term): + + "Return all terms whose values start with the value of 'term'." + + terms = [] + + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) + + # Position the reader, if necessary. + + if info_offset is not None: + self.info_reader.go_to_term(found_term, offset, info_offset) + + # Read and record terms. + + try: + # Add the found term if it starts with the specified term. + + while found_term.startswith(term): + terms.append(found_term) + found_term, offset, frequency, doc_frequency = self.info_reader.read_term() + + except EOFError: + pass + + return terms + def find_positions(self, term): "Return the documents and positions at which the given 'term' is found." @@ -1504,7 +1566,7 @@ else: doc_fields = self.docs[docnum] - doc_fields.append((identifier, value)) + doc_fields.append((identifier, unicode(value))) # convert to string self.field_counter += 1 if self.flush_interval and self.field_counter >= self.flush_interval: diff -r ffec42ec943a -r 37a06a948a52 test.py --- a/test.py Sat Sep 05 18:10:50 2009 +0200 +++ b/test.py Sun Sep 06 02:01:00 2009 +0200 @@ -201,8 +201,8 @@ doc_fields_reversed = doc_fields[:] doc_fields_reversed.reverse() for docnum, fields in doc_fields_reversed: - df = rd.get_fields(docnum) - print list(enumerate(fields)) == df, list(enumerate(fields)), df + df = dict(rd.get_fields(docnum)) + print dict(enumerate(fields)) == df, dict(enumerate(fields)), df for docnum in (13579, 246810): df = rd.get_fields(docnum) print df is None, df @@ -309,6 +309,13 @@ for term in ("dog", "dingo"): t = rd._find_term(term) print t is None, t + +# (Test term prefix searching.) + +print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"] +print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"] +print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"] +print rd.find_terms("d") == [], rd.find_terms("d"), [] rd.close() # Test dictionaries with term and position data. @@ -361,7 +368,7 @@ for term, doc_positions in terms_reversed: dp = list(rd.find_positions(term)) print doc_positions == dp, doc_positions, dp -for term in ("dog", "dingo"): +for term in ("aaa", "dog", "dingo"): dp = rd.find_positions(term) print dp is None, dp @@ -422,8 +429,8 @@ fr = rd.get_frequency(term) print frequency == fr, frequency, fr for docnum, text in docs: - df = rd.get_fields(docnum) - print (123, text) == df[0], (123, text), df[0] + df = dict(rd.get_fields(docnum)) + print df[123] == text, text, df[123] for term, docnum, positions in position_tests: dp = rd.find_positions(term) pos = dp.from_document(docnum)