#!/usr/bin/env python

import iixr

# Round-trip a handful of integers through FileWriter/FileReader.
#
# Each check in this section prints one line: match flag, expected value,
# actual value.  The line is built as a single string so it reads the same
# under the Python 2 print statement and the print function.

numbers = [12345678, 0, 1, 127, 128, 255, 256]

out_file = open("test", "wb")
writer = iixr.FileWriter(out_file)
for value in numbers:
    writer.write_number(value)
writer.close()

in_file = open("test", "rb")
reader = iixr.FileReader(in_file)
for value in numbers:
    got_value = reader.read_number()
    print("%s %s %s" % (value == got_value, value, got_value))
reader.close()

# Position data: each inner list is one group of (docnum, positions) pairs.

all_doc_positions = [
    [
        (123, [1, 3, 5, 15, 25]),
        (124, [0, 100]),
    ],
    [
        (78, [9]),
        (196, [10, 11]),
    ],
]

# Write every (docnum, positions) pair individually.  reset() is called
# between groups; note that docnums start over in the second group (124 is
# followed by 78) -- presumably the reason the writer state must be reset.

out_file = open("test", "wb")
writer = iixr.PositionWriter(out_file)
for group in all_doc_positions:
    for docnum, positions in group:
        writer.write_positions(docnum, positions)
    writer.reset()
writer.close()

# Read the pairs back in the same order, resetting the reader at the same
# points as the writer.

in_file = open("test", "rb")
reader = iixr.PositionReader(in_file)
for group in all_doc_positions:
    for docnum, positions in group:
        got_docnum, got_positions = reader.read_positions()
        print("%s %s %s" % (docnum == got_docnum, docnum, got_docnum))
        print("%s %s %s" % (
            positions == got_positions, positions, got_positions))
    reader.reset()
reader.close()

# Write each group with a single call, remembering the offset it returns.

out_file = open("test", "wb")
writer = iixr.PositionWriter(out_file)
offsets = []
for group in all_doc_positions:
    offset, frequency = writer.write_term_positions(group)
    offsets.append(offset)
writer.close()

# Fetch the groups in reverse order using the recorded offsets.

in_file = open("test", "rb")
reader = iixr.PositionReader(in_file)
offsets.reverse()
all_doc_positions.reverse()
for offset, group in zip(offsets, all_doc_positions):
    got_group = reader.read_term_positions(offset)
    print("%s %s %s" % (group == got_group, group, got_group))
reader.close()

# Test fields.
# Sample (docnum, field values) records; docnum 1234 has no fields at all.
#
# Each check in this section prints one line: match flag, expected value,
# actual value.  The line is built as a single string so it reads the same
# under the Python 2 print statement and the print function.

doc_fields = [
    (123, ["testing", "fields", "stored", "compressed"]),
    (456, ["fields", "for a second", "document"]),
    (789, ["field value"]),
    (1234, []),
    (2345, ["abc", "def"]),
    (3456, ["apple", "banana", "cherry"]),
    (4567, ["drue", "eple"]),
]

fields_file = open("testF", "wb")
field_writer = iixr.FieldWriter(fields_file)
for docnum, fields in doc_fields:
    field_writer.write_fields(docnum, fields)
field_writer.close()

fields_file = open("testF", "rb")
field_reader = iixr.FieldReader(fields_file)
for docnum, fields in doc_fields:
    got_docnum, got_fields = field_reader.read_fields()
    print("%s %s %s" % (docnum == got_docnum, docnum, got_docnum))
    print("%s %s %s" % (fields == got_fields, fields, got_fields))
field_reader.close()

# Test field index files.

indexed_docs = [
    (123, 100000987),
    (456, 100004321),
    (789, 100008765),
]

index_file = open("testFI", "wb")
index_writer = iixr.FieldIndexWriter(index_file)
for docnum, offset in indexed_docs:
    index_writer.write_document(docnum, offset)
index_writer.close()

index_file = open("testFI", "rb")
index_reader = iixr.FieldIndexReader(index_file)
for docnum, offset in indexed_docs:
    got_docnum, got_offset = index_reader.read_document()
    print("%s %s %s" % (docnum == got_docnum, docnum, got_docnum))
    print("%s %s %s" % (offset == got_offset, offset, got_offset))
index_reader.close()

# Test field dictionaries.
#
# The dictionary writer wraps a field writer and a field index writer; the
# records are then looked up by docnum in reverse order, followed by two
# docnums that were never written (a None result is expected for those).

fields_file = open("testF", "wb")
field_writer = iixr.FieldWriter(fields_file)
index_file = open("testFI", "wb")
index_writer = iixr.FieldIndexWriter(index_file)
dict_writer = iixr.FieldDictionaryWriter(field_writer, index_writer, 3)
for docnum, fields in doc_fields:
    dict_writer.write_fields(docnum, fields)
dict_writer.close()

fields_file = open("testF", "rb")
field_reader = iixr.FieldReader(fields_file)
index_file = open("testFI", "rb")
index_reader = iixr.FieldIndexReader(index_file)
dict_reader = iixr.FieldDictionaryReader(field_reader, index_reader)
doc_fields_reversed = list(reversed(doc_fields))
for docnum, fields in doc_fields_reversed:
    got_fields = dict_reader.read_fields(docnum)
    print("%s %s %s" % (fields == got_fields, fields, got_fields))
for docnum in (13579, 246810):
    got_fields = dict_reader.read_fields(docnum)
    print("%s %s" % (got_fields is None, got_fields))
dict_reader.close()

# Test terms.
# Sample (term, offset, frequency) records.
#
# Each check in this section prints one line: match flag, expected value,
# actual value.  The line is built as a single string so it reads the same
# under the Python 2 print statement and the print function.

terms = [
    # term        offset      frequency
    ("aardvark", 100000123, 1),
    ("anteater", 100000456, 2),
    ("badger", 100000789, 13),
    ("bull", 1000001234, 59),
    ("bulldog", 1000002345, 99),
    ("cat", 1000003456, 89),
]

term_file = open("test", "wb")
term_writer = iixr.TermWriter(term_file)
for term, offset, frequency in terms:
    term_writer.write_term(term, offset, frequency)
term_writer.close()

term_file = open("test", "rb")
term_reader = iixr.TermReader(term_file)
for term, offset, frequency in terms:
    got_term, got_offset, got_frequency = term_reader.read_term()
    print("%s %s %s" % (term == got_term, term, got_term))
    print("%s %s %s" % (offset == got_offset, offset, got_offset))
    print("%s %s %s" % (frequency == got_frequency, frequency, got_frequency))
term_reader.close()

# Test terms in index files.
#
# Same records as above, each extended with a fourth value (info_offset).

indexed_terms = [
    # term        offset      frequency  info_offset
    ("aardvark", 100000123, 1, 200000321),
    ("anteater", 100000456, 2, 200000654),
    ("badger", 100000789, 13, 200000987),
    ("bull", 1000001234, 59, 200004321),
    ("bulldog", 1000002345, 99, 200005432),
    ("cat", 1000003456, 89, 200006543),
]

index_file = open("test", "wb")
index_writer = iixr.TermIndexWriter(index_file)
for term, offset, frequency, info_offset in indexed_terms:
    index_writer.write_term(term, offset, frequency, info_offset)
index_writer.close()

index_file = open("test", "rb")
index_reader = iixr.TermIndexReader(index_file)
for term, offset, frequency, info_offset in indexed_terms:
    got_term, got_offset, got_frequency, got_info = index_reader.read_term()
    print("%s %s %s" % (term == got_term, term, got_term))
    print("%s %s %s" % (offset == got_offset, offset, got_offset))
    print("%s %s %s" % (frequency == got_frequency, frequency, got_frequency))
    print("%s %s %s" % (info_offset == got_info, info_offset, got_info))
index_reader.close()

# Test dictionaries with only term data.
# The (term, offset, frequency) records in `terms` (defined earlier in this
# script) are written through a TermDictionaryWriter built from a term
# writer, a term index writer and a position writer, then looked up again.
#
# Each check in this section prints one line: match flag, expected value,
# actual value.  The line is built as a single string so it reads the same
# under the Python 2 print statement and the print function.

term_file = open("test", "wb")
term_writer = iixr.TermWriter(term_file)
index_file = open("testI", "wb")
index_writer = iixr.TermIndexWriter(index_file)
position_file = open("testP", "wb")
position_writer = iixr.PositionWriter(position_file)
dict_writer = iixr.TermDictionaryWriter(
    term_writer, index_writer, position_writer, 3)
for term, offset, frequency in terms:
    dict_writer._write_term(term, offset, frequency)
dict_writer.close()

# Look the terms up in reverse order, then try two terms that were never
# written (a None result is expected for those).

term_file = open("test", "rb")
term_reader = iixr.TermReader(term_file)
index_file = open("testI", "rb")
index_reader = iixr.TermIndexReader(index_file)
position_file = open("testP", "rb")
position_reader = iixr.PositionReader(position_file)
dict_reader = iixr.TermDictionaryReader(
    term_reader, index_reader, position_reader)
terms_reversed = list(reversed(terms))
for term, offset, frequency in terms_reversed:
    got_offset, got_frequency = dict_reader._find_term(term)
    print("%s %s %s" % (offset == got_offset, offset, got_offset))
    print("%s %s %s" % (frequency == got_frequency, frequency, got_frequency))
for term in ("dog", "dingo"):
    found = dict_reader._find_term(term)
    print("%s %s" % (found is None, found))
dict_reader.close()

# Test dictionaries with term and position data.
#
# Each record is (term, [(docnum, positions), ...]).

terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])]),
]

term_file = open("test", "wb")
term_writer = iixr.TermWriter(term_file)
index_file = open("testI", "wb")
index_writer = iixr.TermIndexWriter(index_file)
position_file = open("testP", "wb")
position_writer = iixr.PositionWriter(position_file)
dict_writer = iixr.TermDictionaryWriter(
    term_writer, index_writer, position_writer, 3)
for term, doc_positions in terms_with_positions:
    dict_writer.write_term_positions(term, doc_positions)
dict_writer.close()

term_file = open("test", "rb")
term_reader = iixr.TermReader(term_file)
index_file = open("testI", "rb")
index_reader = iixr.TermIndexReader(index_file)
position_file = open("testP", "rb")
position_reader = iixr.PositionReader(position_file)
dict_reader = iixr.TermDictionaryReader(
    term_reader, index_reader, position_reader)
terms_reversed = list(reversed(terms_with_positions))
for term, doc_positions in terms_reversed:
    got_positions = dict_reader.find_positions(term)
    print("%s %s %s" % (
        doc_positions == got_positions, doc_positions, got_positions))
for term in ("dog", "dingo"):
    got_positions = dict_reader.find_positions(term)
    print("%s %s" % (got_positions is None, got_positions))

# Sequential access: rewind the same reader and read every term back in the
# order it was written.

dict_reader.rewind()
for term, doc_positions in terms_with_positions:
    got_term, got_frequency, got_positions = dict_reader.read_term()
    print("%s %s %s" % (term == got_term, term, got_term))
    print("%s %s %s" % (
        doc_positions == got_positions, doc_positions, got_positions))
dict_reader.close()

# Test high-level index operations.
#
# docs holds (docnum, text) pairs; doc_tests holds
# (term, expected frequency, expected [(docnum, positions), ...]) triples.

docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore"),
]

doc_tests = [
    ("Every", 2, [(2, [0]), (14, [0])]),
    ("good", 2, [(2, [1]), (13, [1])]),
    ("deserves", 2, [(2, [3]), (13, [3])]),
    ("sea", 2, [(36, [2, 6])]),
]

# Index each whitespace-separated word with its position in the text, and
# store the full text as the document's only field.

index = iixr.Index("test_index")
index_writer = index.get_writer(3)
for docnum, text in docs:
    for position, term in enumerate(text.split()):
        index_writer.add_position(term, docnum, position)
    index_writer.add_fields(docnum, [text])
index_writer.close()

index_reader = index.get_reader()
for term, frequency, doc_positions in doc_tests:
    got_positions = index_reader.find_positions(term)
    print("%s %s %s" % (
        doc_positions == got_positions, doc_positions, got_positions))
    got_frequency = index_reader.get_frequency(term)
    print("%s %s %s" % (frequency == got_frequency, frequency, got_frequency))
for docnum, text in docs:
    got_fields = index_reader.get_fields(docnum)
    print("%s %s %s" % (text == got_fields[0], text, got_fields[0]))
index.close()

# vim: tabstop=4 expandtab shiftwidth=4