1.1 --- a/test.py Mon Aug 31 21:02:30 2009 +0200
1.2 +++ b/test.py Wed Sep 02 01:30:42 2009 +0200
1.3 @@ -38,15 +38,18 @@
1.4 all_doc_positions = [
1.5 [
1.6 (123, [1, 3, 5, 15, 25]),
1.7 - (124, [0, 100])
1.8 + (124, [0, 100]),
1.9 + (125, [11, 99, 199]),
1.10 + (130, [77, 78, 80, 82, 89])
1.11 ],
1.12 [
1.13 (78, [9]),
1.14 - (196, [10, 11])
1.15 + (196, [10, 11]),
1.16 + (197, [17, 21, 30])
1.17 ]
1.18 ]
1.19
1.20 -f = open("test", "wb")
1.21 +f = open("testP", "wb")
1.22 w = iixr.PositionWriter(f)
1.23 for doc_positions in all_doc_positions:
1.24 for docnum, positions in doc_positions:
1.25 @@ -54,7 +57,7 @@
1.26 w.reset()
1.27 w.close()
1.28
1.29 -f = open("test", "rb")
1.30 +f = open("testP", "rb")
1.31 r = iixr.PositionReader(f)
1.32 for doc_positions in all_doc_positions:
1.33 for docnum, positions in doc_positions:
1.34 @@ -64,20 +67,68 @@
1.35 r.reset()
1.36 r.close()
1.37
1.38 -f = open("test", "wb")
1.39 +# Test position index files.
1.40 +
1.41 +indexed_positions = [
1.42 + [
1.43 + (1234, 0, 100),
1.44 + (2345, 700, 100),
1.45 + (3456, 1900, 50)
1.46 + ],
1.47 + [
1.48 + (4567, 2800, 20)
1.49 + ]
1.50 + ]
1.51 +
1.52 +offsets = []
1.53 +f = open("testPI", "wb")
1.54 +w = iixr.PositionIndexWriter(f)
1.55 +for term_positions in indexed_positions:
1.56 + offset = None
1.57 + doc_frequency = 0
1.58 + w.reset()
1.59 + for docnum, pos_offset, count in term_positions:
1.60 + io = w.write_positions(docnum, pos_offset, count)
1.61 + if offset is None:
1.62 + offset = io
1.63 + doc_frequency += 1
1.64 + offsets.append((offset, doc_frequency))
1.65 +w.close()
1.66 +
1.67 +f = open("testPI", "rb")
1.68 +r = iixr.PositionIndexReader(f)
1.69 +offsets.reverse()
1.70 +indexed_positions.reverse()
1.71 +for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
1.72 + found_positions = r.read_term_positions(offset, doc_frequency)
1.73 + for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):
1.74 + print docnum == dn, docnum, dn
1.75 + print pos_offset == po, pos_offset, po
1.76 + print count == c, count, c
1.77 +r.close()
1.78 +
1.79 +# Test position dictionaries.
1.80 +
1.81 +f = open("testP", "wb")
1.82 w = iixr.PositionWriter(f)
1.83 +f2 = open("testPI", "wb")
1.84 +w2 = iixr.PositionIndexWriter(f2)
1.85 +wd = iixr.PositionDictionaryWriter(w, w2, 2)
1.86 offsets = []
1.87 for doc_positions in all_doc_positions:
1.88 - offset, frequency = w.write_term_positions(doc_positions)
1.89 - offsets.append(offset)
1.90 + offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
1.91 + offsets.append((offset, doc_frequency))
1.92 w.close()
1.93
1.94 -f = open("test", "rb")
1.95 +f = open("testP", "rb")
1.96 r = iixr.PositionReader(f)
1.97 +f2 = open("testPI", "rb")
1.98 +r2 = iixr.PositionIndexReader(f2)
1.99 +rd = iixr.PositionDictionaryReader(r, r2)
1.100 offsets.reverse()
1.101 all_doc_positions.reverse()
1.102 -for offset, doc_positions in zip(offsets, all_doc_positions):
1.103 - dp = list(r.read_term_positions(offset))
1.104 +for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
1.105 + dp = list(rd.read_term_positions(offset, doc_frequency))
1.106 print doc_positions == dp, doc_positions, dp
1.107 r.close()
1.108
1.109 @@ -166,55 +217,57 @@
1.110 # Test terms.
1.111
1.112 terms = [
1.113 - # term offset frequency
1.114 - ("aardvark", 100000123, 1),
1.115 - ("anteater", 100000456, 2),
1.116 - ("badger", 100000789, 13),
1.117 - ("bull", 1000001234, 59),
1.118 - ("bulldog", 1000002345, 99),
1.119 - ("cat", 1000003456, 89)
1.120 + # term offset frequency doc_frequency
1.121 + ("aardvark", 100000123, 1, 1),
1.122 + ("anteater", 100000456, 2, 1),
1.123 + ("badger", 100000789, 13, 7),
1.124 + ("bull", 1000001234, 59, 17),
1.125 + ("bulldog", 1000002345, 99, 80),
1.126 + ("cat", 1000003456, 89, 28)
1.127 ]
1.128
1.129 f = open("test", "wb")
1.130 w = iixr.TermWriter(f)
1.131 -for term, offset, frequency in terms:
1.132 - w.write_term(term, offset, frequency)
1.133 +for term, offset, frequency, doc_frequency in terms:
1.134 + w.write_term(term, offset, frequency, doc_frequency)
1.135 w.close()
1.136
1.137 f = open("test", "rb")
1.138 r = iixr.TermReader(f)
1.139 -for term, offset, frequency in terms:
1.140 - t, o, fr = r.read_term()
1.141 +for term, offset, frequency, doc_frequency in terms:
1.142 + t, o, fr, df = r.read_term()
1.143 print term == t, term, t
1.144 print offset == o, offset, o
1.145 print frequency == fr, frequency, fr
1.146 + print doc_frequency == df, doc_frequency, df
1.147 r.close()
1.148
1.149 # Test terms in index files.
1.150
1.151 indexed_terms = [
1.152 - # term offset frequency info_offset
1.153 - ("aardvark", 100000123, 1, 200000321),
1.154 - ("anteater", 100000456, 2, 200000654),
1.155 - ("badger", 100000789, 13, 200000987),
1.156 - ("bull", 1000001234, 59, 200004321),
1.157 - ("bulldog", 1000002345, 99, 200005432),
1.158 - ("cat", 1000003456, 89, 200006543)
1.159 + # term offset frequency doc_frequency info_offset
1.160 + ("aardvark", 100000123, 1, 1, 200000321),
1.161 + ("anteater", 100000456, 2, 1, 200000654),
1.162 + ("badger", 100000789, 13, 7, 200000987),
1.163 + ("bull", 1000001234, 59, 17, 200004321),
1.164 + ("bulldog", 1000002345, 99, 80, 200005432),
1.165 + ("cat", 1000003456, 89, 28, 200006543)
1.166 ]
1.167
1.168 f = open("test", "wb")
1.169 w = iixr.TermIndexWriter(f)
1.170 -for term, offset, frequency, info_offset in indexed_terms:
1.171 - w.write_term(term, offset, frequency, info_offset)
1.172 +for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
1.173 + w.write_term(term, offset, frequency, doc_frequency, info_offset)
1.174 w.close()
1.175
1.176 f = open("test", "rb")
1.177 r = iixr.TermIndexReader(f)
1.178 -for term, offset, frequency, info_offset in indexed_terms:
1.179 - t, o, fr, i = r.read_term()
1.180 +for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
1.181 + t, o, fr, df, i = r.read_term()
1.182 print term == t, term, t
1.183 print offset == o, offset, o
1.184 print frequency == fr, frequency, fr
1.185 + print doc_frequency == df, doc_frequency, df
1.186 print info_offset == i, info_offset, i
1.187 r.close()
1.188
1.189 @@ -224,26 +277,23 @@
1.190 w = iixr.TermWriter(f)
1.191 f2 = open("testI", "wb")
1.192 w2 = iixr.TermIndexWriter(f2)
1.193 -f3 = open("testP", "wb")
1.194 -w3 = iixr.PositionWriter(f3)
1.195 -wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
1.196 -for term, offset, frequency in terms:
1.197 - wd._write_term(term, offset, frequency)
1.198 +wd = iixr.TermDictionaryWriter(w, w2, None, 3)
1.199 +for term, offset, frequency, doc_frequency in terms:
1.200 + wd._write_term(term, offset, frequency, doc_frequency)
1.201 wd.close()
1.202
1.203 f = open("test", "rb")
1.204 r = iixr.TermReader(f)
1.205 f2 = open("testI", "rb")
1.206 r2 = iixr.TermIndexReader(f2)
1.207 -f3 = open("testP", "rb")
1.208 -r3 = iixr.PositionReader(f3)
1.209 -rd = iixr.TermDictionaryReader(r, r2, r3)
1.210 +rd = iixr.TermDictionaryReader(r, r2, None)
1.211 terms_reversed = terms[:]
1.212 terms_reversed.reverse()
1.213 -for term, offset, frequency in terms_reversed:
1.214 - o, fr = rd._find_term(term)
1.215 +for term, offset, frequency, doc_frequency in terms_reversed:
1.216 + o, fr, df = rd._find_term(term)
1.217 print offset == o, offset, o
1.218 print frequency == fr, frequency, fr
1.219 + print doc_frequency == df, doc_frequency, df
1.220 for term in ("dog", "dingo"):
1.221 t = rd._find_term(term)
1.222 print t is None, t
1.223 @@ -255,7 +305,7 @@
1.224 ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
1.225 ("anteater", [(1, [43, 44])]),
1.226 ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
1.227 - ("bull", [(6, [128]), (16, [12])]),
1.228 + ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
1.229 ("bulldog", [(43, [17, 19, 256, 512])]),
1.230 ("cat", [(123, [12, 145, 196]), (1200, [113])])
1.231 ]
1.232 @@ -266,7 +316,10 @@
1.233 w2 = iixr.TermIndexWriter(f2)
1.234 f3 = open("testP", "wb")
1.235 w3 = iixr.PositionWriter(f3)
1.236 -wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
1.237 +f4 = open("testPI", "wb")
1.238 +w4 = iixr.PositionIndexWriter(f4)
1.239 +wp = iixr.PositionDictionaryWriter(w3, w4, 2)
1.240 +wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
1.241 for term, doc_positions in terms_with_positions:
1.242 wd.write_term_positions(term, doc_positions)
1.243 wd.close()
1.244 @@ -277,7 +330,10 @@
1.245 r2 = iixr.TermIndexReader(f2)
1.246 f3 = open("testP", "rb")
1.247 r3 = iixr.PositionReader(f3)
1.248 -rd = iixr.TermDictionaryReader(r, r2, r3)
1.249 +f4 = open("testPI", "rb")
1.250 +r4 = iixr.PositionIndexReader(f4)
1.251 +rp = iixr.PositionDictionaryReader(r3, r4)
1.252 +rd = iixr.TermDictionaryReader(r, r2, rp)
1.253 terms_reversed = terms_with_positions[:]
1.254 terms_reversed.reverse()
1.255 for term, doc_positions in terms_reversed:
1.256 @@ -291,7 +347,7 @@
1.257
1.258 rd.rewind()
1.259 for term, doc_positions in terms_with_positions:
1.260 - t, fr, dp = rd.read_term()
1.261 + t, fr, df, dp = rd.read_term()
1.262 dp = list(dp)
1.263 print term == t, term, t
1.264 print doc_positions == dp, doc_positions, dp