#!/usr/bin/env python

import iixr
import os

# Remove old test files.
# "testPI" is listed as well because the position index and position
# dictionary tests further down write to it.

for filename in ("test", "testF", "testFI", "testI", "testP", "testPI"):
    try:
        os.remove(filename)
    except OSError:
        # Best-effort cleanup: the file may simply not exist yet.
        pass

# NOTE(review): os.removedirs only removes empty directories, so a
# "test_index" directory that still contains files is left in place and the
# error is ignored -- confirm whether a recursive removal is wanted here.
try:
    os.removedirs("test_index")
except OSError:
    pass

# Test basic data types.
# Every check in this script prints the comparison result followed by the
# expected and the actual value. The line is pre-formatted into one string so
# that print(...) emits the same text whether print is a statement or a
# function.

numbers = [12345678, 0, 1, 127, 128, 255, 256]

f = open("test", "wb")
w = iixr.FileWriter(f)
for number in numbers:
    w.write_number(number)
w.close()

f = open("test", "rb")
r = iixr.FileReader(f)
for number in numbers:
    n = r.read_number()
    print("%s %s %s" % (number == n, number, n))
r.close()

# Test positions.
# Each inner list is written as one group of (docnum, positions) pairs, with
# the writer/reader being reset after every group.

all_doc_positions = [
    [
        (123, [1, 3, 5, 15, 25]),
        (124, [0, 100]),
        (125, [11, 99, 199]),
        (130, [77, 78, 80, 82, 89])
    ],
    [
        (78, [9]),
        (196, [10, 11]),
        (197, [17, 21, 30])
    ]
]

f = open("testP", "wb")
w = iixr.PositionWriter(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        w.write_positions(docnum, positions)
    w.reset()
w.close()

f = open("testP", "rb")
r = iixr.PositionReader(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        d, p = r.read_positions()
        print("%s %s %s" % (docnum == d, docnum, d))
        print("%s %s %s" % (positions == p, positions, p))
    r.reset()
r.close()

# Test position index files.
# Each inner list is written as one group; its entries are unpacked below as
# (docnum, pos_offset, count).

indexed_positions = [
    [
        (1234, 0, 100),
        (2345, 700, 100),
        (3456, 1900, 50)
    ],
    [
        (4567, 2800, 20)
    ]
]

# For every group, remember the value returned by the first write_positions
# call together with the sum of the group's counts; both are handed back to
# the reader below.
offsets = []
f = open("testPI", "wb")
w = iixr.PositionIndexWriter(f)
for term_positions in indexed_positions:
    offset = None
    doc_frequency = 0
    w.reset()
    for docnum, pos_offset, count in term_positions:
        io = w.write_positions(docnum, pos_offset, count)
        if offset is None:
            offset = io
        doc_frequency += count
    offsets.append((offset, doc_frequency))
w.close()

# Read the groups back in reverse order of writing.
f = open("testPI", "rb")
r = iixr.PositionIndexReader(f)
offsets.reverse()
indexed_positions.reverse()
for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
    found_positions = r.read_term_positions(offset, doc_frequency)
    for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):
        print("%s %s %s" % (docnum == dn, docnum, dn))
        print("%s %s %s" % (pos_offset == po, pos_offset, po))
        print("%s %s %s" % (count == c, count, c))
r.close()

# Test position dictionaries.

f = open("testP", "wb")
w = iixr.PositionWriter(f)
f2 = open("testPI", "wb")
w2 = iixr.PositionIndexWriter(f2)
wd = iixr.PositionDictionaryWriter(w, w2, 2)
offsets = []
for doc_positions in all_doc_positions:
    offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
    offsets.append((offset, doc_frequency))
# Close both underlying writers so that "testP" and "testPI" are complete on
# disk before they are reopened for reading.
# NOTE(review): if PositionDictionaryWriter offers its own close(), calling
# wd.close() instead may be preferable -- confirm against iixr.
w.close()
w2.close()

f = open("testP", "rb")
r = iixr.PositionReader(f)
f2 = open("testPI", "rb")
r2 = iixr.PositionIndexReader(f2)
rd = iixr.PositionDictionaryReader(r, r2)
offsets.reverse()
all_doc_positions.reverse()
for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
    dp = list(rd.read_term_positions(offset, doc_frequency))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
# Close both underlying readers, not just the position reader.
r.close()
r2.close()

# Test fields.
# (docnum, field values) pairs; the values are numbered with enumerate()
# before being written.

doc_fields = [
    (123, ["testing", "fields", "stored", "compressed"]),
    (456, ["fields", "for a second", "document"]),
    (789, ["field value"]),
    (1234, []),
    (2345, ["abc", "def"]),
    (3456, ["apple", "banana", "cherry"]),
    (4567, ["drue", "eple"])
]

# Write every document's numbered field values...
f = open("testF", "wb")
w = iixr.FieldWriter(f)
for expected_docnum, values in doc_fields:
    numbered = list(enumerate(values))
    w.write_fields(expected_docnum, numbered)
w.close()

# ...and read them back sequentially, printing the comparison result followed
# by the expected and the actual value.
f = open("testF", "rb")
r = iixr.FieldReader(f)
for expected_docnum, values in doc_fields:
    numbered = list(enumerate(values))
    found_docnum, found_fields = r.read_fields()
    print("%s %s %s" % (expected_docnum == found_docnum, expected_docnum, found_docnum))
    print("%s %s %s" % (numbered == found_fields, numbered, found_fields))
r.close()

# Test field index files.

indexed_docs = [
    (123, 100000987),
    (456, 100004321),
    (789, 100008765)
]

f = open("testFI", "wb")
w = iixr.FieldIndexWriter(f)
for entry in indexed_docs:
    expected_docnum, expected_offset = entry
    w.write_document(expected_docnum, expected_offset)
w.close()

f = open("testFI", "rb")
r = iixr.FieldIndexReader(f)
for entry in indexed_docs:
    expected_docnum, expected_offset = entry
    found_docnum, found_offset = r.read_document()
    print("%s %s %s" % (expected_docnum == found_docnum, expected_docnum, found_docnum))
    print("%s %s %s" % (expected_offset == found_offset, expected_offset, found_offset))
r.close()

# Test field dictionaries.
# Write all documents through a dictionary writer built on a field writer and
# a field index writer.

f = open("testF", "wb")
w = iixr.FieldWriter(f)
f2 = open("testFI", "wb")
w2 = iixr.FieldIndexWriter(f2)
wd = iixr.FieldDictionaryWriter(w, w2, 3)
for wanted_docnum, values in doc_fields:
    numbered = list(enumerate(values))
    wd.write_fields(wanted_docnum, numbered)
wd.close()

f = open("testF", "rb")
r = iixr.FieldReader(f)
f2 = open("testFI", "rb")
r2 = iixr.FieldIndexReader(f2)
rd = iixr.FieldDictionaryReader(r, r2)

# Look the documents up in reverse order of writing.
doc_fields_reversed = doc_fields[::-1]
for wanted_docnum, values in doc_fields_reversed:
    numbered = list(enumerate(values))
    found_fields = rd.get_fields(wanted_docnum)
    print("%s %s %s" % (numbered == found_fields, numbered, found_fields))

# Document numbers that were never written; the check prints whether the
# lookup result is None.
for missing_docnum in (13579, 246810):
    found_fields = rd.get_fields(missing_docnum)
    print("%s %s" % (found_fields is None, found_fields))

# (Test sequential access.)

rd.rewind()
for wanted_docnum, values in doc_fields:
    numbered = list(enumerate(values))
    found_docnum, found_fields = rd.read_fields()
    print("%s %s %s" % (wanted_docnum == found_docnum, wanted_docnum, found_docnum))
    print("%s %s %s" % (numbered == found_fields, numbered, found_fields))
rd.close()

# Test terms.

terms = [
    # term      offset      frequency   doc_frequency
    ("aardvark", 100000123, 1, 1),
    ("anteater", 100000456, 2, 1),
    ("badger", 100000789, 13, 7),
    ("bull", 1000001234, 59, 17),
    ("bulldog", 1000002345, 99, 80),
    ("cat", 1000003456, 89, 28)
]

f = open("test", "wb")
w = iixr.TermWriter(f)
for record in terms:
    w.write_term(*record)
w.close()

# Read the records back in order and compare them element by element.
f = open("test", "rb")
r = iixr.TermReader(f)
for record in terms:
    found = r.read_term()
    found_term, found_offset, found_frequency, found_doc_frequency = found
    wanted_term, wanted_offset, wanted_frequency, wanted_doc_frequency = record
    print("%s %s %s" % (wanted_term == found_term, wanted_term, found_term))
    print("%s %s %s" % (wanted_offset == found_offset, wanted_offset, found_offset))
    print("%s %s %s" % (wanted_frequency == found_frequency, wanted_frequency, found_frequency))
    print("%s %s %s" % (wanted_doc_frequency == found_doc_frequency, wanted_doc_frequency, found_doc_frequency))
r.close()

# Test terms in index files.
# Same sample terms as before, each extended by an info_offset value.

indexed_terms = [
    # term      offset      frequency   doc_frequency   info_offset
    ("aardvark", 100000123, 1, 1, 200000321),
    ("anteater", 100000456, 2, 1, 200000654),
    ("badger", 100000789, 13, 7, 200000987),
    ("bull", 1000001234, 59, 17, 200004321),
    ("bulldog", 1000002345, 99, 80, 200005432),
    ("cat", 1000003456, 89, 28, 200006543)
]

f = open("test", "wb")
w = iixr.TermIndexWriter(f)
for record in indexed_terms:
    w.write_term(*record)
w.close()

# Read the records back in order; each check prints the comparison result
# followed by the expected and the actual value.
f = open("test", "rb")
r = iixr.TermIndexReader(f)
for record in indexed_terms:
    wanted_term, wanted_offset, wanted_frequency, wanted_doc_frequency, wanted_info = record
    found_term, found_offset, found_frequency, found_doc_frequency, found_info = r.read_term()
    print("%s %s %s" % (wanted_term == found_term, wanted_term, found_term))
    print("%s %s %s" % (wanted_offset == found_offset, wanted_offset, found_offset))
    print("%s %s %s" % (wanted_frequency == found_frequency, wanted_frequency, found_frequency))
    print("%s %s %s" % (wanted_doc_frequency == found_doc_frequency, wanted_doc_frequency, found_doc_frequency))
    print("%s %s %s" % (wanted_info == found_info, wanted_info, found_info))
r.close()

# Test dictionaries with only term data.
# No position dictionary is supplied (None) to either the writer or the reader.

f = open("test", "wb")
w = iixr.TermWriter(f)
f2 = open("testI", "wb")
w2 = iixr.TermIndexWriter(f2)
wd = iixr.TermDictionaryWriter(w, w2, None, 3)
for record in terms:
    wd._write_term(*record)
wd.close()

f = open("test", "rb")
r = iixr.TermReader(f)
f2 = open("testI", "rb")
r2 = iixr.TermIndexReader(f2)
rd = iixr.TermDictionaryReader(r, r2, None)

# Look the terms up in reverse order of writing.
terms_reversed = terms[::-1]
for record in terms_reversed:
    wanted_term, wanted_offset, wanted_frequency, wanted_doc_frequency = record
    found_offset, found_frequency, found_doc_frequency = rd._find_term(wanted_term)
    print("%s %s %s" % (wanted_offset == found_offset, wanted_offset, found_offset))
    print("%s %s %s" % (wanted_frequency == found_frequency, wanted_frequency, found_frequency))
    print("%s %s %s" % (wanted_doc_frequency == found_doc_frequency, wanted_doc_frequency, found_doc_frequency))

# Terms that were never written; the check prints whether the lookup result
# is None.
for missing_term in ("dog", "dingo"):
    found = rd._find_term(missing_term)
    print("%s %s" % (found is None, found))
rd.close()

# Test dictionaries with term and position data.
# (term, [(docnum, positions), ...]) pairs.

terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])])
]

f = open("test", "wb")
w = iixr.TermWriter(f)
f2 = open("testI", "wb")
w2 = iixr.TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = iixr.PositionWriter(f3)
f4 = open("testPI", "wb")
w4 = iixr.PositionIndexWriter(f4)
# The position dictionary writer must wrap the position *writers* w3/w4; the
# readers r3/r4 are only created in the read phase below, so referring to
# them here would raise NameError.
wp = iixr.PositionDictionaryWriter(w3, w4, 2)
wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
for term, doc_positions in terms_with_positions:
    wd.write_term_positions(term, doc_positions)
wd.close()

f = open("test", "rb")
r = iixr.TermReader(f)
f2 = open("testI", "rb")
r2 = iixr.TermIndexReader(f2)
f3 = open("testP", "rb")
r3 = iixr.PositionReader(f3)
f4 = open("testPI", "rb")
r4 = iixr.PositionIndexReader(f4)
rp = iixr.PositionDictionaryReader(r3, r4)
rd = iixr.TermDictionaryReader(r, r2, rp)

# Look the terms up in reverse order of writing. Each check prints the
# comparison result followed by the expected and the actual value; the line
# is pre-formatted so that print(...) emits the same text whether print is a
# statement or a function.
terms_reversed = terms_with_positions[:]
terms_reversed.reverse()
for term, doc_positions in terms_reversed:
    dp = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))

# Terms that were never written; the check prints whether the lookup result
# is None.
for term in ("dog", "dingo"):
    dp = rd.find_positions(term)
    print("%s %s" % (dp is None, dp))

# (Test sequential access.)

rd.rewind()
for term, doc_positions in terms_with_positions:
    t, fr, df, dp = rd.read_term()
    dp = list(dp)
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
rd.close()

# Test high-level index operations (including merging).
# (docnum, text) pairs; each text is split on whitespace into terms.

docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore")
]

# (term, expected frequency, expected [(docnum, positions), ...]) triples.

doc_tests = [
    ("Every", 2, [(2, [0]), (14, [0])]),
    ("good", 2, [(2, [1]), (13, [1])]),
    ("deserves", 2, [(2, [3]), (13, [3])]),
    ("sea", 2, [(36, [2, 6])])
]

# Add every word of every document together with its position, and store the
# full text as field 123 of the document.
index = iixr.Index("test_index")
wi = index.get_writer(3, 6)
for docnum, text in docs:
    words = text.split()
    for position, word in enumerate(words):
        wi.add_position(word, docnum, position)
    wi.add_field(docnum, 123, text)
wi.close()

# Each check prints the comparison result followed by the expected and the
# actual value.
rd = index.get_reader()
for wanted_term, wanted_frequency, wanted_positions in doc_tests:
    found_positions = list(rd.find_positions(wanted_term))
    print("%s %s %s" % (wanted_positions == found_positions, wanted_positions, found_positions))
    found_frequency = rd.get_frequency(wanted_term)
    print("%s %s %s" % (wanted_frequency == found_frequency, wanted_frequency, found_frequency))

# The first stored field of every document is compared with (123, text).
for docnum, text in docs:
    stored = (123, text)
    found_fields = rd.get_fields(docnum)
    first_field = found_fields[0]
    print("%s %s %s" % (stored == first_field, stored, first_field))
index.close()

# vim: tabstop=4 expandtab shiftwidth=4