1.1 --- a/iixr/positions.py Thu Feb 03 01:26:35 2011 +0100
1.2 +++ b/iixr/positions.py Mon Feb 07 02:05:38 2011 +0100
1.3 @@ -3,7 +3,7 @@
1.4 """
1.5 Specific classes for storing position information.
1.6
1.7 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
1.8 +Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
1.9
1.10 This program is free software; you can redistribute it and/or modify it under
1.11 the terms of the GNU General Public License as published by the Free Software
1.12 @@ -18,8 +18,8 @@
1.13 with this program. If not, see <http://www.gnu.org/licenses/>.
1.14 """
1.15
1.16 +from iixr.data import *
1.17 from iixr.files import *
1.18 -from iixr.data import vint, vint_to_array
1.19
1.20 class PositionWriter(FileWriter):
1.21
1.22 @@ -27,7 +27,7 @@
1.23
1.24 def reset(self):
1.25 self.last_docnum = None
1.26 - self.docnum_size = None
1.27 + self.subtractor = None
1.28
1.29 def write_positions(self, docnum, positions):
1.30
1.31 @@ -35,39 +35,31 @@
1.32 Write for the document 'docnum' the given 'positions'.
1.33 """
1.34
1.35 - # Find the size of document number values.
1.36 -
1.37 - if self.docnum_size is None:
1.38 - self.docnum_size = self.get_value_size(docnum)
1.39 - self.last_docnum = self.get_initial_value(self.docnum_size)
1.40 -
1.41 - if docnum < self.last_docnum:
1.42 - raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
1.43 + if not positions:
1.44 + return
1.45
1.46 # Make sure that the positions are sorted.
1.47
1.48 positions.sort()
1.49
1.50 - # Find the size of position values.
1.51 -
1.52 - size = self.get_value_size(positions[0])
1.53 + # Calculate an ongoing delta.
1.54
1.55 - # Write the number of values per document number.
1.56 - # Write the document number delta.
1.57 - # Write the number of positions.
1.58 - # Write the number of values per position.
1.59 + if self.last_docnum is not None:
1.60 + if docnum < self.last_docnum:
1.61 + raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
1.62 +
1.63 + docnum_seq = self.subtractor(docnum, self.last_docnum)
1.64
1.65 - self.write_number(self.docnum_size)
1.66 - self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
1.67 - self.write_number(len(positions))
1.68 - self.write_number(size)
1.69 + # Or preserve the document number and prepare for future deltas.
1.70
1.71 - # Write the position deltas.
1.72 + else:
1.73 + self.subtractor = get_subtractor(docnum)
1.74 + docnum_seq = docnum
1.75
1.76 - last = self.get_initial_value(size)
1.77 -
1.78 - for position in positions:
1.79 - last = self.write_sequence(position, last, size)
1.80 + self.begin_record()
1.81 + self.write_sequence_value(docnum_seq)
1.82 + self.write_monotonic_sequence(positions)
1.83 + self.end_record()
1.84
1.85 self.last_docnum = docnum
1.86
1.87 @@ -77,6 +69,7 @@
1.88
1.89 def reset(self):
1.90 self.last_docnum = None
1.91 + self.adder = None
1.92
1.93 def read_positions(self):
1.94
1.95 @@ -84,38 +77,25 @@
1.96 Read positions, returning a document number and a list of positions.
1.97 """
1.98
1.99 - # Read the number of values per document number.
1.100 + self.begin_record()
1.101
1.102 - docnum_size = self.read_number()
1.103 -
1.104 - if self.last_docnum is None:
1.105 - self.last_docnum = self.get_initial_value(docnum_size)
1.106 + # Read the document number.
1.107
1.108 - # Read the document number delta and add it to the last number.
1.109 -
1.110 - self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)
1.111 + docnum = self.read_sequence_value()
1.112
1.113 - # Read the number of positions.
1.114 -
1.115 - npositions = self.read_number()
1.116 + # Calculate an ongoing delta.
1.117
1.118 - # Read the number of values per position.
1.119 -
1.120 - size = self.read_number()
1.121 + if self.last_docnum is not None:
1.122 + self.last_docnum = self.adder(docnum, self.last_docnum)
1.123
1.124 - # Read the position deltas, adding each previous position to get the
1.125 - # appropriate collection of absolute positions.
1.126 -
1.127 - i = 0
1.128 + # Or preserve the document number and prepare for future deltas.
1.129
1.130 - last = self.get_initial_value(size)
1.131 -
1.132 - positions = []
1.133 + else:
1.134 + self.adder = get_adder(docnum)
1.135 + self.last_docnum = docnum
1.136
1.137 - while i < npositions:
1.138 - last = self.read_sequence(last, size)
1.139 - positions.append(last)
1.140 - i += 1
1.141 + positions = self.read_monotonic_sequence()
1.142 + self.end_record()
1.143
1.144 return self.last_docnum, positions
1.145
1.146 @@ -125,7 +105,7 @@
1.147
1.148 def reset(self):
1.149 self.last_docnum = None
1.150 - self.docnum_size = None
1.151 + self.subtractor = None
1.152 self.last_pos_offset = 0
1.153
1.154 def write_positions(self, docnum, pos_offset, count):
1.155 @@ -137,20 +117,19 @@
1.156
1.157 # Find the size of document number values.
1.158
1.159 - if self.docnum_size is None:
1.160 - self.docnum_size = self.get_value_size(docnum)
1.161 - self.last_docnum = self.get_initial_value(self.docnum_size)
1.162 + if self.last_docnum is not None:
1.163 + docnum_seq = self.subtractor(docnum, self.last_docnum)
1.164 + else:
1.165 + self.subtractor = get_subtractor(docnum)
1.166 + docnum_seq = docnum
1.167
1.168 - # Write the number of values per document number.
1.169 - # Write the document number delta.
1.170 - # Write the position file offset delta.
1.171 - # Write the document count.
1.172 -
1.173 - self.write_number(self.docnum_size)
1.174 - self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
1.175 + self.begin_record()
1.176 + self.write_sequence_value(docnum_seq)
1.177 self.write_number(pos_offset - self.last_pos_offset)
1.178 self.write_number(count)
1.179 + self.end_record()
1.180
1.181 + self.last_docnum = docnum
1.182 self.last_pos_offset = pos_offset
1.183
1.184 class PositionIndexReader(FileReader):
1.185 @@ -159,6 +138,7 @@
1.186
1.187 def reset(self):
1.188 self.last_docnum = None
1.189 + self.adder = None
1.190 self.last_pos_offset = 0
1.191
1.192 def read_positions(self):
1.193 @@ -168,16 +148,17 @@
1.194 file, and the number of documents in a section of that file.
1.195 """
1.196
1.197 - # Read the number of values per document number.
1.198 + self.begin_record()
1.199
1.200 - docnum_size = self.read_number()
1.201 + # Read the document number.
1.202 +
1.203 + docnum = self.read_sequence_value()
1.204
1.205 - if self.last_docnum is None:
1.206 - self.last_docnum = self.get_initial_value(docnum_size)
1.207 -
1.208 - # Read the document number delta and add it to the last number.
1.209 -
1.210 - self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)
1.211 + if self.last_docnum is not None:
1.212 + self.last_docnum = self.adder(docnum, self.last_docnum)
1.213 + else:
1.214 + self.adder = get_adder(docnum)
1.215 + self.last_docnum = docnum
1.216
1.217 # Read the offset delta.
1.218
1.219 @@ -186,6 +167,7 @@
1.220 # Read the document count.
1.221
1.222 count = self.read_number()
1.223 + self.end_record()
1.224
1.225 return self.last_docnum, self.last_pos_offset, count
1.226