1.1 --- a/iixr/terms.py Mon Feb 07 02:05:38 2011 +0100
1.2 +++ b/iixr/terms.py Tue Feb 08 00:08:27 2011 +0100
1.3 @@ -29,6 +29,7 @@
1.4 "Writing term information to files."
1.5
1.6 def reset(self):
1.7 + self.end_record()
1.8 self.last_term = ""
1.9 self.last_offset = 0
1.10
1.11 @@ -40,14 +41,6 @@
1.12 term information file.
1.13 """
1.14
1.15 - self.begin_record()
1.16 - self._write_term(term, offset, frequency, doc_frequency)
1.17 - self.end_record()
1.18 -
1.19 - def _write_term(self, term, offset, frequency, doc_frequency):
1.20 -
1.21 - "Performs the term writing for 'write_term'."
1.22 -
1.23 if term <= self.last_term:
1.24 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
1.25
1.26 @@ -79,6 +72,7 @@
1.27 def reset(self):
1.28 self.last_term = ""
1.29 self.last_offset = 0
1.30 + self.begin_record()
1.31
1.32 def read_term(self):
1.33
1.34 @@ -87,16 +81,6 @@
1.35 frequency from the term information file.
1.36 """
1.37
1.38 - self.begin_record()
1.39 - try:
1.40 - return self._read_term()
1.41 - finally:
1.42 - self.end_record()
1.43 -
1.44 - def _read_term(self):
1.45 -
1.46 - "Performs the term reading for 'read_term'."
1.47 -
1.48 # Read the prefix length and term suffix.
1.49
1.50 common = self.read_number()
1.51 @@ -145,13 +129,11 @@
1.52 'info_offset' in the term information file.
1.53 """
1.54
1.55 - self.begin_record()
1.56 - TermWriter._write_term(self, term, offset, frequency, doc_frequency)
1.57 + TermWriter.write_term(self, term, offset, frequency, doc_frequency)
1.58
1.59 # Write the information file offset delta.
1.60
1.61 self.write_number(info_offset - self.last_info_offset)
1.62 - self.end_record()
1.63
1.64 self.last_info_offset = info_offset
1.65
1.66 @@ -171,13 +153,11 @@
1.67 index file.
1.68 """
1.69
1.70 - self.begin_record()
1.71 - term, offset, frequency, doc_frequency = TermReader._read_term(self)
1.72 + term, offset, frequency, doc_frequency = TermReader.read_term(self)
1.73
1.74 # Read the offset delta.
1.75
1.76 self.last_info_offset += self.read_number()
1.77 - self.end_record()
1.78
1.79 return term, offset, frequency, doc_frequency, self.last_info_offset
1.80
1.81 @@ -197,16 +177,16 @@
1.82 """
1.83 Write the given 'term', its position file 'offset', its 'frequency' and
1.84 its 'doc_frequency' (number of documents in which it appears) to the
1.85 - term information file. Return the offset after the term information was
1.86 + term information file. Return the offset before the term information was
1.87 written to the file.
1.88 """
1.89
1.90 - self.info_writer.write_term(term, offset, frequency, doc_frequency)
1.91 -
1.92 if self.entry % self.interval == 0:
1.93 + self.info_writer.reset()
1.94 info_offset = self.info_writer.tell()
1.95 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
1.96
1.97 + self.info_writer.write_term(term, offset, frequency, doc_frequency)
1.98 self.entry += 1
1.99
1.100 def write_term_positions(self, term, doc_positions):
1.101 @@ -236,6 +216,7 @@
1.102 self.info_reader = info_reader
1.103 self.index_reader = index_reader
1.104 self.position_dict_reader = position_dict_reader
1.105 + self.entry = 0
1.106
1.107 self.terms = []
1.108 try:
1.109 @@ -269,8 +250,10 @@
1.110 # as the closest.
1.111
1.112 if i == -1:
1.113 + self.entry = 0
1.114 return self.terms[0]
1.115 else:
1.116 + self.entry = i
1.117 return self.terms[i]
1.118
1.119 def _find_closest_term(self, term):
1.120 @@ -297,10 +280,11 @@
1.121 # and scan for the desired term.
1.122
1.123 else:
1.124 - self.info_reader.go_to_term(found_term, offset, info_offset)
1.125 + # Reset the term and offset for the new page.
1.126 + self.info_reader.go_to_term("", 0, info_offset)
1.127 try:
1.128 while term > found_term:
1.129 - found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.130 + found_term, offset, frequency, doc_frequency = self._read_term()
1.131 except EOFError:
1.132 pass
1.133
1.134 @@ -355,6 +339,7 @@
1.135 # Sequential access methods.
1.136
1.137 def rewind(self):
1.138 + self.entry = 0
1.139 self.info_reader.rewind()
1.140
1.141 def read_term(self):
1.142 @@ -364,8 +349,28 @@
1.143 documents and positions at which the term is found.
1.144 """
1.145
1.146 - term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.147 - return self._get_term_and_positions(term, offset, frequency, doc_frequency)
1.148 + return self._get_term_and_positions(*self._read_term())
1.149 +
1.150 + def _read_term(self):
1.151 +
1.152 + try:
1.153 + term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.154 + except EOFError:
1.155 + self.entry += 1
1.156 + try:
1.157 + term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry]
1.158 + except IndexError:
1.159 + raise EOFError
1.160 + else:
1.161 + # Reset the term and offset for the new page.
1.162 +
1.163 + self.info_reader.go_to_term("", 0, info_offset)
1.164 +
1.165 + # Skip the term in the information file.
1.166 +
1.167 + self.info_reader.read_term()
1.168 +
1.169 + return term, offset, frequency, doc_frequency
1.170
1.171 def go_to_term(self, term):
1.172
1.173 @@ -380,7 +385,14 @@
1.174 # Position the reader, if necessary.
1.175
1.176 if info_offset is not None:
1.177 - self.info_reader.go_to_term(found_term, offset, info_offset)
1.178 +
1.179 + # Reset the term and offset for the new page.
1.180 +
1.181 + self.info_reader.go_to_term("", 0, info_offset)
1.182 +
1.183 + # Skip the term in the information file.
1.184 +
1.185 + self.info_reader.read_term()
1.186
1.187 return found_term, offset, frequency, doc_frequency
1.188
1.189 @@ -407,7 +419,7 @@
1.190
1.191 while found_term.startswith(term):
1.192 terms.append(found_term)
1.193 - found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.194 + found_term, offset, frequency, doc_frequency = self._read_term()
1.195
1.196 except EOFError:
1.197 pass