1.1 --- a/iixr.py Sat Aug 29 21:15:47 2009 +0200
1.2 +++ b/iixr.py Sat Aug 29 22:12:25 2009 +0200
1.3 @@ -215,8 +215,9 @@
1.4
1.5 """
1.6 Write all 'doc_positions' - a collection of tuples of the form (document
1.7 - number, position list) - to the file, returning the offset at which they
1.8 - were stored.
1.9 + number, position list) - to the file, returning a tuple containing the
1.10 + offset at which they were stored together with the frequency (number of
1.11 + positions) for the term involved.
1.12 """
1.13
1.14 # Reset the writer and record the current file offset.
1.15 @@ -230,10 +231,13 @@
1.16
1.17 # Write the positions.
1.18
1.19 + frequency = 0
1.20 +
1.21 for docnum, positions in doc_positions:
1.22 self.write_positions(docnum, positions)
1.23 + frequency += len(positions)
1.24
1.25 - return offset
1.26 + return offset, frequency
1.27
1.28 class PositionReader(FileReader):
1.29
1.30 @@ -301,12 +305,12 @@
1.31 self.last_term = ""
1.32 self.last_offset = 0
1.33
1.34 - def write_term(self, term, offset):
1.35 + def write_term(self, term, offset, frequency):
1.36
1.37 """
1.38 - Write the given 'term' and its position file 'offset' to the term
1.39 - information file. Return the offset after the term information was
1.40 - written to the file.
1.41 + Write the given 'term', its position file 'offset', and its 'frequency'
1.42 + to the term information file. Return the offset after the term
1.43 + information was written to the file.
1.44 """
1.45
1.46 # Too long terms are not currently supported.
1.47 @@ -326,6 +330,10 @@
1.48
1.49 self.write_number(offset - self.last_offset)
1.50
1.51 + # Write the frequency.
1.52 +
1.53 + self.write_number(frequency)
1.54 +
1.55 self.last_term = term
1.56 self.last_offset = offset
1.57
1.58 @@ -342,7 +350,8 @@
1.59 def read_term(self):
1.60
1.61 """
1.62 - Read a term and its position file offset from the term information file.
1.63 + Read a term, its position file offset, and its frequency from the term
1.64 + information file.
1.65 """
1.66
1.67 # Read the prefix length and term suffix.
1.68 @@ -356,7 +365,11 @@
1.69
1.70 self.last_offset += self.read_number()
1.71
1.72 - return self.last_term, self.last_offset
1.73 + # Read the frequency.
1.74 +
1.75 + frequency = self.read_number()
1.76 +
1.77 + return self.last_term, self.last_offset, frequency
1.78
1.79 def go_to_term(self, term, offset, info_offset):
1.80
1.81 @@ -377,15 +390,15 @@
1.82 TermWriter.reset(self)
1.83 self.last_info_offset = 0
1.84
1.85 - def write_term(self, term, offset, info_offset):
1.86 + def write_term(self, term, offset, frequency, info_offset):
1.87
1.88 """
1.89 - Write the given 'term' and its position file 'offset' to the term
1.90 - dictionary index file, along with the 'info_offset' in the term
1.91 - information file.
1.92 + Write the given 'term', its position file 'offset', and its 'frequency'
1.93 + to the term dictionary index file, along with the 'info_offset' in the
1.94 + term information file.
1.95 """
1.96
1.97 - TermWriter.write_term(self, term, offset)
1.98 + TermWriter.write_term(self, term, offset, frequency)
1.99
1.100 # Write the information file offset delta.
1.101
1.102 @@ -403,17 +416,17 @@
1.103 def read_term(self):
1.104
1.105 """
1.106 - Read a term, its position file offset, and its term information file
1.107 - offset from the term dictionary index file.
1.108 + Read a term, its position file offset, its frequency, and its term
1.109 + information file offset from the term dictionary index file.
1.110 """
1.111
1.112 - term, offset = TermReader.read_term(self)
1.113 + term, offset, frequency = TermReader.read_term(self)
1.114
1.115 # Read the offset delta.
1.116
1.117 self.last_info_offset += self.read_number()
1.118
1.119 - return term, offset, self.last_info_offset
1.120 + return term, offset, frequency, self.last_info_offset
1.121
1.122 class TermDictionaryWriter:
1.123
1.124 @@ -426,17 +439,18 @@
1.125 self.interval = interval
1.126 self.entry = 0
1.127
1.128 - def _write_term(self, term, offset):
1.129 + def _write_term(self, term, offset, frequency):
1.130
1.131 """
1.132 - Write the given 'term' and its position file 'offset' to the term
1.133 - information file and optionally to the index, making a dictionary entry.
1.134 + Write the given 'term', its position file 'offset', and its 'frequency'
1.135 + to the term information file and optionally to the index, making a
1.136 + dictionary entry.
1.137 """
1.138
1.139 - info_offset = self.info_writer.write_term(term, offset)
1.140 + info_offset = self.info_writer.write_term(term, offset, frequency)
1.141
1.142 if self.entry % self.interval == 0:
1.143 - self.index_writer.write_term(term, offset, info_offset)
1.144 + self.index_writer.write_term(term, offset, frequency, info_offset)
1.145
1.146 self.entry += 1
1.147
1.148 @@ -447,8 +461,8 @@
1.149 and positions at which the term is found.
1.150 """
1.151
1.152 - offset = self.position_writer.write_all_positions(doc_positions)
1.153 - self._write_term(term, offset)
1.154 + offset, frequency = self.position_writer.write_all_positions(doc_positions)
1.155 + self._write_term(term, offset, frequency)
1.156
1.157 def close(self):
1.158 self.info_writer.close()
1.159 @@ -478,7 +492,10 @@
1.160
1.161 def _find_term(self, term):
1.162
1.163 - "Find the position file offset of 'term' from the term dictionary."
1.164 + """
1.165 + Find the position file offset and frequency of 'term' from the term
1.166 + dictionary, returning them as a tuple or None if 'term' is not found.
1.167 + """
1.168
1.169 i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1
1.170
1.171 @@ -487,12 +504,12 @@
1.172 if i == -1:
1.173 return None
1.174
1.175 - found_term, offset, info_offset = self.terms[i]
1.176 + found_term, offset, frequency, info_offset = self.terms[i]
1.177
1.178 # Where the term is found immediately, return the offset.
1.179
1.180 if term == found_term:
1.181 - return offset
1.182 + return offset, frequency
1.183
1.184 # Otherwise, seek past the index term's entry in the information file
1.185 # and scan for the desired term.
1.186 @@ -501,14 +518,14 @@
1.187 self.info_reader.go_to_term(found_term, offset, info_offset)
1.188 try:
1.189 while term > found_term:
1.190 - found_term, offset = self.info_reader.read_term()
1.191 + found_term, offset, frequency = self.info_reader.read_term()
1.192 except EOFError:
1.193 pass
1.194
1.195 - # If the term is found, return the offset.
1.196 + # If the term is found, return the offset and frequency.
1.197
1.198 if term == found_term:
1.199 - return offset
1.200 + return offset, frequency
1.201 else:
1.202 return None
1.203
1.204 @@ -516,12 +533,24 @@
1.205
1.206 "Return the documents and positions at which the given 'term' is found."
1.207
1.208 - offset = self._find_term(term)
1.209 - if offset is None:
1.210 + t = self._find_term(term)
1.211 + if t is None:
1.212 return None
1.213 else:
1.214 + offset, frequency = t
1.215 return self.position_reader.read_all_positions(offset)
1.216
1.217 + def get_frequency(self, term):
1.218 +
1.219 + "Return the frequency of 'term', or None if it is not found."
1.220 +
1.221 + t = self._find_term(term)
1.222 + if t is None:
1.223 + return None
1.224 + else:
1.225 + offset, frequency = t
1.226 + return frequency
1.227 +
1.228 def close(self):
1.229 self.info_reader.close()
1.230 self.index_reader.close()
1.231 @@ -810,6 +839,9 @@
1.232 def find_positions(self, term):
1.233 return self.dict_reader.find_positions(term)
1.234
1.235 + def get_frequency(self, term):
1.236 + return self.dict_reader.get_frequency(term)
1.237 +
1.238 def get_fields(self, docnum):
1.239 return self.field_dict_reader.read_fields(docnum)
1.240