1 #!/usr/bin/env python 2 3 import iixr 4 import os 5 6 # Remove old test files. 7 8 for filename in ("test", "testF", "testFI", "testI", "testP"): 9 try: 10 os.remove(filename) 11 except OSError: 12 pass 13 14 try: 15 for filename in os.listdir("test_index"): 16 os.remove(os.path.join("test_index", filename)) 17 os.rmdir("test_index") 18 except OSError: 19 pass 20 21 # Test basic data types. 22 23 numbers = [12345678, 0, 1, 127, 128, 255, 256] 24 25 f = open("test", "wb") 26 w = iixr.FileWriter(f) 27 for number in numbers: 28 w.write_number(number) 29 w.close() 30 31 f = open("test", "rb") 32 r = iixr.FileReader(f) 33 for number in numbers: 34 n = r.read_number() 35 print number == n, number, n 36 r.close() 37 38 # Test positions. 39 40 all_doc_positions = [ 41 [ 42 (123, [1, 3, 5, 15, 25]), 43 (124, [0, 100]), 44 (125, [11, 99, 199]), 45 (130, [77, 78, 80, 82, 89]) 46 ], 47 [ 48 (78, [9]), 49 (196, [10, 11]), 50 (197, [17, 21, 30]) 51 ] 52 ] 53 54 f = open("testP", "wb") 55 w = iixr.PositionWriter(f) 56 for doc_positions in all_doc_positions: 57 for docnum, positions in doc_positions: 58 w.write_positions(docnum, positions) 59 w.reset() 60 w.close() 61 62 f = open("testP", "rb") 63 r = iixr.PositionIterator(f, None) 64 for doc_positions in all_doc_positions: 65 for docnum, positions in doc_positions: 66 d, p = r.read_positions() 67 print docnum == d, docnum, d 68 print positions == p, positions, p 69 r.reset() 70 r.close() 71 72 # Test position index files. 73 74 indexed_positions = [ 75 [ 76 (1234, 0, 100), 77 (2345, 700, 100), 78 (3456, 1900, 50) 79 ], 80 [ 81 (4567, 2800, 20) 82 ] 83 ] 84 85 offsets = [] 86 f = open("testPI", "wb") 87 w = iixr.PositionIndexWriter(f) 88 for term_positions in indexed_positions: 89 offset = None 90 doc_frequency = 0 91 w.reset() 92 for docnum, pos_offset, count in term_positions: 93 io = w.write_positions(docnum, pos_offset, count) 94 if offset is None: 95 offset = io 96 doc_frequency += count 97 offsets.append((offset, doc_frequency)) 98 w.close() 99 100 r = iixr.PositionIndexOpener("testPI") 101 offsets.reverse() 102 indexed_positions.reverse() 103 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions): 104 found_positions = r.read_term_positions(offset, doc_frequency) 105 for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions): 106 print docnum == dn, docnum, dn 107 print pos_offset == po, pos_offset, po 108 print count == c, count, c 109 r.close() 110 111 # Test position dictionaries. 112 113 f = open("testP", "wb") 114 w = iixr.PositionWriter(f) 115 f2 = open("testPI", "wb") 116 w2 = iixr.PositionIndexWriter(f2) 117 wd = iixr.PositionDictionaryWriter(w, w2, 2) 118 offsets = [] 119 for doc_positions in all_doc_positions: 120 offset, frequency, doc_frequency = wd.write_term_positions(doc_positions) 121 offsets.append((offset, doc_frequency)) 122 wd.close() 123 124 r = iixr.PositionOpener("testP") 125 r2 = iixr.PositionIndexOpener("testPI") 126 rd = iixr.PositionDictionaryReader(r, r2) 127 offsets.reverse() 128 all_doc_positions.reverse() 129 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions): 130 dp = list(rd.read_term_positions(offset, doc_frequency)) 131 print doc_positions == dp, doc_positions, dp 132 rd.close() 133 134 # Test fields. 135 136 doc_fields = [ 137 (123, ["testing", "fields", "stored", "compressed"]), 138 (456, ["fields", "for a second", "document"]), 139 (789, ["field value"]), 140 (1234, []), 141 (2345, ["abc", "def"]), 142 (3456, ["apple", "banana", "cherry"]), 143 (4567, ["drue", "eple"]) 144 ] 145 146 f = open("testF", "wb") 147 w = iixr.FieldWriter(f) 148 for docnum, fields in doc_fields: 149 w.write_fields(docnum, list(enumerate(fields))) 150 w.close() 151 152 f = open("testF", "rb") 153 r = iixr.FieldReader(f) 154 for docnum, fields in doc_fields: 155 dn, df = r.read_fields() 156 print docnum == dn, docnum, dn 157 print list(enumerate(fields)) == df, list(enumerate(fields)), df 158 r.close() 159 160 # Test field index files. 161 162 indexed_docs = [ 163 (123, 100000987), 164 (456, 100004321), 165 (789, 100008765) 166 ] 167 168 f = open("testFI", "wb") 169 w = iixr.FieldIndexWriter(f) 170 for docnum, offset in indexed_docs: 171 w.write_document(docnum, offset) 172 w.close() 173 174 f = open("testFI", "rb") 175 r = iixr.FieldIndexReader(f) 176 for docnum, offset in indexed_docs: 177 dn, o = r.read_document() 178 print docnum == dn, docnum, dn 179 print offset == o, offset, o 180 r.close() 181 182 # Test field dictionaries. 183 184 f = open("testF", "wb") 185 w = iixr.FieldWriter(f) 186 f2 = open("testFI", "wb") 187 w2 = iixr.FieldIndexWriter(f2) 188 wd = iixr.FieldDictionaryWriter(w, w2, 3) 189 for docnum, fields in doc_fields: 190 wd.write_fields(docnum, list(enumerate(fields))) 191 wd.close() 192 193 f = open("testF", "rb") 194 r = iixr.FieldReader(f) 195 f2 = open("testFI", "rb") 196 r2 = iixr.FieldIndexReader(f2) 197 rd = iixr.FieldDictionaryReader(r, r2) 198 doc_fields_reversed = doc_fields[:] 199 doc_fields_reversed.reverse() 200 for docnum, fields in doc_fields_reversed: 201 df = dict(rd.get_fields(docnum)) 202 print dict(enumerate(fields)) == df, dict(enumerate(fields)), df 203 for docnum in (13579, 246810): 204 df = rd.get_fields(docnum) 205 print df is None, df 206 207 # (Test sequential access.) 208 209 rd.rewind() 210 for docnum, fields in doc_fields: 211 dn, df = rd.read_fields() 212 print docnum == dn, docnum, dn 213 print list(enumerate(fields)) == df, list(enumerate(fields)), df 214 rd.close() 215 216 # Test terms. 217 218 terms = [ 219 # term offset frequency doc_frequency 220 ("aardvark", 100000123, 1, 1), 221 ("anteater", 100000456, 2, 1), 222 ("badger", 100000789, 13, 7), 223 ("bull", 1000001234, 59, 17), 224 ("bulldog", 1000002345, 99, 80), 225 ("cat", 1000003456, 89, 28) 226 ] 227 228 f = open("test", "wb") 229 w = iixr.TermWriter(f) 230 for term, offset, frequency, doc_frequency in terms: 231 w.write_term(term, offset, frequency, doc_frequency) 232 w.close() 233 234 f = open("test", "rb") 235 r = iixr.TermReader(f) 236 for term, offset, frequency, doc_frequency in terms: 237 t, o, fr, df = r.read_term() 238 print term == t, term, t 239 print offset == o, offset, o 240 print frequency == fr, frequency, fr 241 print doc_frequency == df, doc_frequency, df 242 r.close() 243 244 # Test terms in index files. 245 246 indexed_terms = [ 247 # term offset frequency doc_frequency info_offset 248 ("aardvark", 100000123, 1, 1, 200000321), 249 ("anteater", 100000456, 2, 1, 200000654), 250 ("badger", 100000789, 13, 7, 200000987), 251 ("bull", 1000001234, 59, 17, 200004321), 252 ("bulldog", 1000002345, 99, 80, 200005432), 253 ("cat", 1000003456, 89, 28, 200006543) 254 ] 255 256 f = open("test", "wb") 257 w = iixr.TermIndexWriter(f) 258 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 259 w.write_term(term, offset, frequency, doc_frequency, info_offset) 260 w.close() 261 262 f = open("test", "rb") 263 r = iixr.TermIndexReader(f) 264 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 265 t, o, fr, df, i = r.read_term() 266 print term == t, term, t 267 print offset == o, offset, o 268 print frequency == fr, frequency, fr 269 print doc_frequency == df, doc_frequency, df 270 print info_offset == i, info_offset, i 271 r.close() 272 273 # Test dictionaries with only term data. 274 275 f = open("test", "wb") 276 w = iixr.TermWriter(f) 277 f2 = open("testI", "wb") 278 w2 = iixr.TermIndexWriter(f2) 279 f3 = open("testP", "wb") 280 w3 = iixr.PositionWriter(f3) 281 f4 = open("testPI", "wb") 282 w4 = iixr.PositionIndexWriter(f4) 283 wp = iixr.PositionDictionaryWriter(w3, w4, 2) 284 wd = iixr.TermDictionaryWriter(w, w2, wp, 3) 285 for term, offset, frequency, doc_frequency in terms: 286 wd._write_term(term, offset, frequency, doc_frequency) 287 wd.close() 288 289 f = open("test", "rb") 290 r = iixr.TermReader(f) 291 f2 = open("testI", "rb") 292 r2 = iixr.TermIndexReader(f2) 293 r3 = iixr.PositionOpener("testP") 294 r4 = iixr.PositionIndexOpener("testPI") 295 rp = iixr.PositionDictionaryReader(r3, r4) 296 rd = iixr.TermDictionaryReader(r, r2, rp) 297 terms_reversed = terms[:] 298 terms_reversed.reverse() 299 for term, offset, frequency, doc_frequency in terms_reversed: 300 o, fr, df = rd._find_term(term) 301 print offset == o, offset, o 302 print frequency == fr, frequency, fr 303 print doc_frequency == df, doc_frequency, df 304 for term in ("dog", "dingo"): 305 t = rd._find_term(term) 306 print t is None, t 307 308 # (Test term prefix searching.) 309 310 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"] 311 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"] 312 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"] 313 print rd.find_terms("d") == [], rd.find_terms("d"), [] 314 rd.close() 315 316 # Test dictionaries with term and position data. 317 318 terms_with_positions = [ 319 ("aardvark", [(1, [2, 45, 96]), (20, [13])]), 320 ("anteater", [(1, [43, 44])]), 321 ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), 322 ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]), 323 ("bulldog", [(43, [17, 19, 256, 512])]), 324 ("cat", [(123, [12, 145, 196]), (1200, [113])]) 325 ] 326 327 position_dict_tests = [ 328 ("badger", 19, [55, 1333]), 329 ("badger", 20, None), 330 ("bull", 6, [128]), 331 ("bull", 26, [1, 3, 5, 7, 9]), 332 ("cat", 111, None), 333 ("cat", 123, [12, 145, 196]), 334 ("cat", 1234, None) 335 ] 336 337 f = open("test", "wb") 338 w = iixr.TermWriter(f) 339 f2 = open("testI", "wb") 340 w2 = iixr.TermIndexWriter(f2) 341 f3 = open("testP", "wb") 342 w3 = iixr.PositionWriter(f3) 343 f4 = open("testPI", "wb") 344 w4 = iixr.PositionIndexWriter(f4) 345 wp = iixr.PositionDictionaryWriter(w3, w4, 2) 346 wd = iixr.TermDictionaryWriter(w, w2, wp, 3) 347 for term, doc_positions in terms_with_positions: 348 wd.write_term_positions(term, doc_positions) 349 wd.close() 350 351 f = open("test", "rb") 352 r = iixr.TermReader(f) 353 f2 = open("testI", "rb") 354 r2 = iixr.TermIndexReader(f2) 355 r3 = iixr.PositionOpener("testP") 356 r4 = iixr.PositionIndexOpener("testPI") 357 rp = iixr.PositionDictionaryReader(r3, r4) 358 rd = iixr.TermDictionaryReader(r, r2, rp) 359 terms_reversed = terms_with_positions[:] 360 terms_reversed.reverse() 361 for term, doc_positions in terms_reversed: 362 dp = list(rd.find_positions(term)) 363 print doc_positions == dp, doc_positions, dp 364 for term in ("aaa", "dog", "dingo"): 365 dp = rd.find_positions(term) 366 print dp is None, dp 367 368 # (Test iterators.) 369 370 for term, docnum, positions in position_dict_tests: 371 dp = rd.find_positions(term) 372 pos = dp.from_document(docnum) 373 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos 374 375 # (Test sequential access.) 376 377 rd.rewind() 378 for term, doc_positions in terms_with_positions: 379 t, fr, df, dp = rd.read_term() 380 dp = list(dp) 381 print term == t, term, t 382 print doc_positions == dp, doc_positions, dp 383 rd.close() 384 385 # Test high-level index operations (including merging). 386 387 docs = [ 388 (1, "The cat sat on the mat"), 389 (2, "Every good boy deserves football"), 390 (13, "One good turn deserves another"), 391 (14, "Every man for himself"), 392 (25, "Red sky at night shepherd's delight"), 393 (36, "She sells sea shells on the sea shore") 394 ] 395 396 doc_tests = [ 397 ("Every", 2, [(2, [0]), (14, [0])]), 398 ("good", 2, [(2, [1]), (13, [1])]), 399 ("deserves", 2, [(2, [3]), (13, [3])]), 400 ("sea", 2, [(36, [2, 6])]) 401 ] 402 403 position_tests = [ 404 ("Every", 14, [0]), 405 ("sea", 36, [2, 6]), 406 ("shells", 1, None), 407 ("shells", 37, None) 408 ] 409 410 index = iixr.Index("test_index") 411 wi = index.get_writer(3, 2, 6) 412 for docnum, text in docs: 413 doc = iixr.Document(docnum) 414 for position, term in enumerate(text.split()): 415 doc.add_position(term, position) 416 doc.add_field(123, text) 417 wi.add_document(doc) 418 wi.close() 419 420 rd = index.get_reader() 421 for term, frequency, doc_positions in doc_tests: 422 dp = list(rd.find_positions(term)) 423 print doc_positions == dp, doc_positions, dp 424 fr = rd.get_frequency(term) 425 print frequency == fr, frequency, fr 426 for docnum, text in docs: 427 df = dict(rd.get_fields(docnum)) 428 print df[123] == text, text, df[123] 429 for term, docnum, positions in position_tests: 430 dp = rd.find_positions(term) 431 pos = dp.from_document(docnum) 432 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos 433 index.close() 434 435 # vim: tabstop=4 expandtab shiftwidth=4