#!/usr/bin/env python

"""
Round-trip checks for the iixr readers and writers.

Each check prints whether the expected and actual values are equal, followed
by the expected value and the value read back.
"""

import os
import shutil

import iixr

# Remove old test files.

for filename in ("test", "testF", "testFI", "testI", "testP"):
    try:
        os.remove(filename)
    except OSError:
        pass

# os.removedirs only deletes empty directories, so an index directory that
# still holds files from an earlier run would survive it. Remove the whole
# tree instead; a missing directory is still ignored.
try:
    shutil.rmtree("test_index")
except OSError:
    pass

# Test basic data types.

numbers = [12345678, 0, 1, 127, 128, 255, 256]

# Write every number, then read them back in the same order.
f = open("test", "wb")
w = iixr.FileWriter(f)
for number in numbers:
    w.write_number(number)
w.close()

f = open("test", "rb")
r = iixr.FileReader(f)
for number in numbers:
    n = r.read_number()
    print("%s %s %s" % (number == n, number, n))
r.close()

# Test positions.

# Two groups of (document number, positions) pairs.
all_doc_positions = [
    [
        (123, [1, 3, 5, 15, 25]),
        (124, [0, 100])
    ],
    [
        (78, [9]),
        (196, [10, 11])
    ]
]

# NOTE(review): reset() is assumed to run once per group (outer loop), since
# the second group's document numbers start below the first's - confirm
# against iixr.
f = open("test", "wb")
w = iixr.PositionWriter(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        w.write_positions(docnum, positions)
    w.reset()
w.close()

f = open("test", "rb")
r = iixr.PositionReader(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        d, p = r.read_positions()
        print("%s %s %s" % (docnum == d, docnum, d))
        print("%s %s %s" % (positions == p, positions, p))
    r.reset()
r.close()

# Write each group as a unit, remembering the offset returned for it.
f = open("test", "wb")
w = iixr.PositionWriter(f)
offsets = []
for doc_positions in all_doc_positions:
    offset, frequency = w.write_term_positions(doc_positions)
    offsets.append(offset)
w.close()

# Read the groups back by offset, in the opposite order to writing.
f = open("test", "rb")
r = iixr.PositionReader(f)
offsets.reverse()
all_doc_positions.reverse()
for offset, doc_positions in zip(offsets, all_doc_positions):
    dp = list(r.read_term_positions(offset))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
r.close()

# Test fields.
# Documents paired with the field values stored for them (one has none).
doc_fields = [
    (123, ["testing", "fields", "stored", "compressed"]),
    (456, ["fields", "for a second", "document"]),
    (789, ["field value"]),
    (1234, []),
    (2345, ["abc", "def"]),
    (3456, ["apple", "banana", "cherry"]),
    (4567, ["drue", "eple"])
]

# Write every document's numbered fields, then read them back in order.
stream = open("testF", "wb")
writer = iixr.FieldWriter(stream)
for docnum, fields in doc_fields:
    numbered = list(enumerate(fields))
    writer.write_fields(docnum, numbered)
writer.close()

stream = open("testF", "rb")
reader = iixr.FieldReader(stream)
for docnum, fields in doc_fields:
    expected = list(enumerate(fields))
    found_docnum, found_fields = reader.read_fields()
    print("%s %s %s" % (docnum == found_docnum, docnum, found_docnum))
    print("%s %s %s" % (expected == found_fields, expected, found_fields))
reader.close()

# Test field index files.

# (document number, offset) pairs.
indexed_docs = [
    (123, 100000987),
    (456, 100004321),
    (789, 100008765)
]

stream = open("testFI", "wb")
writer = iixr.FieldIndexWriter(stream)
for docnum, offset in indexed_docs:
    writer.write_document(docnum, offset)
writer.close()

stream = open("testFI", "rb")
reader = iixr.FieldIndexReader(stream)
for docnum, offset in indexed_docs:
    found_docnum, found_offset = reader.read_document()
    print("%s %s %s" % (docnum == found_docnum, docnum, found_docnum))
    print("%s %s %s" % (offset == found_offset, offset, found_offset))
reader.close()

# Test field dictionaries.
# Write the documents through a dictionary writer built on a field writer
# and a field index writer.
field_stream = open("testF", "wb")
field_writer = iixr.FieldWriter(field_stream)
index_stream = open("testFI", "wb")
index_writer = iixr.FieldIndexWriter(index_stream)
dict_writer = iixr.FieldDictionaryWriter(field_writer, index_writer, 3)
for docnum, fields in doc_fields:
    dict_writer.write_fields(docnum, list(enumerate(fields)))
dict_writer.close()

field_stream = open("testF", "rb")
field_reader = iixr.FieldReader(field_stream)
index_stream = open("testFI", "rb")
index_reader = iixr.FieldIndexReader(index_stream)
dict_reader = iixr.FieldDictionaryReader(field_reader, index_reader)

# Look documents up in the opposite order to the one they were written in.
doc_fields_reversed = doc_fields[::-1]
for docnum, fields in doc_fields_reversed:
    expected = list(enumerate(fields))
    found = dict_reader.get_fields(docnum)
    print("%s %s %s" % (expected == found, expected, found))

# Document numbers that were never written should produce None.
for docnum in (13579, 246810):
    found = dict_reader.get_fields(docnum)
    print("%s %s" % (found is None, found))

# (Test sequential access.)

dict_reader.rewind()
for docnum, fields in doc_fields:
    expected = list(enumerate(fields))
    found_docnum, found = dict_reader.read_fields()
    print("%s %s %s" % (docnum == found_docnum, docnum, found_docnum))
    print("%s %s %s" % (expected == found, expected, found))
dict_reader.close()

# Test terms.

terms = [
    # term      offset      frequency
    ("aardvark", 100000123, 1),
    ("anteater", 100000456, 2),
    ("badger", 100000789, 13),
    ("bull", 1000001234, 59),
    ("bulldog", 1000002345, 99),
    ("cat", 1000003456, 89)
]

# Write all term records, then read them back in the same order.
term_stream = open("test", "wb")
term_writer = iixr.TermWriter(term_stream)
for term, offset, frequency in terms:
    term_writer.write_term(term, offset, frequency)
term_writer.close()

term_stream = open("test", "rb")
term_reader = iixr.TermReader(term_stream)
for term, offset, frequency in terms:
    found_term, found_offset, found_frequency = term_reader.read_term()
    print("%s %s %s" % (term == found_term, term, found_term))
    print("%s %s %s" % (offset == found_offset, offset, found_offset))
    print("%s %s %s" % (frequency == found_frequency, frequency, found_frequency))
term_reader.close()

# Test terms in index files.
indexed_terms = [
    # term      offset      frequency   info_offset
    ("aardvark", 100000123, 1, 200000321),
    ("anteater", 100000456, 2, 200000654),
    ("badger", 100000789, 13, 200000987),
    ("bull", 1000001234, 59, 200004321),
    ("bulldog", 1000002345, 99, 200005432),
    ("cat", 1000003456, 89, 200006543)
]

# Write all index records, then read them back in the same order.
stream = open("test", "wb")
writer = iixr.TermIndexWriter(stream)
for term, offset, frequency, info_offset in indexed_terms:
    writer.write_term(term, offset, frequency, info_offset)
writer.close()

stream = open("test", "rb")
reader = iixr.TermIndexReader(stream)
for term, offset, frequency, info_offset in indexed_terms:
    found_term, found_offset, found_frequency, found_info = reader.read_term()
    print("%s %s %s" % (term == found_term, term, found_term))
    print("%s %s %s" % (offset == found_offset, offset, found_offset))
    print("%s %s %s" % (frequency == found_frequency, frequency, found_frequency))
    print("%s %s %s" % (info_offset == found_info, info_offset, found_info))
reader.close()

# Test dictionaries with only term data.

# The dictionary writer is built on term, term index and position writers;
# only term records are written here.
term_stream = open("test", "wb")
term_writer = iixr.TermWriter(term_stream)
index_stream = open("testI", "wb")
index_writer = iixr.TermIndexWriter(index_stream)
position_stream = open("testP", "wb")
position_writer = iixr.PositionWriter(position_stream)
dict_writer = iixr.TermDictionaryWriter(term_writer, index_writer, position_writer, 3)
for term, offset, frequency in terms:
    dict_writer._write_term(term, offset, frequency)
dict_writer.close()

term_stream = open("test", "rb")
term_reader = iixr.TermReader(term_stream)
index_stream = open("testI", "rb")
index_reader = iixr.TermIndexReader(index_stream)
position_stream = open("testP", "rb")
position_reader = iixr.PositionReader(position_stream)
dict_reader = iixr.TermDictionaryReader(term_reader, index_reader, position_reader)

# Look terms up in the opposite order to the one they were written in.
terms_reversed = terms[::-1]
for term, offset, frequency in terms_reversed:
    found_offset, found_frequency = dict_reader._find_term(term)
    print("%s %s %s" % (offset == found_offset, offset, found_offset))
    print("%s %s %s" % (frequency == found_frequency, frequency, found_frequency))

# Terms that were never written should produce None.
for term in ("dog", "dingo"):
    found = dict_reader._find_term(term)
    print("%s %s" % (found is None, found))
dict_reader.close()

# Test dictionaries with term and position data.
# Each term is paired with its (document number, positions) entries.
terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])])
]

term_stream = open("test", "wb")
term_writer = iixr.TermWriter(term_stream)
index_stream = open("testI", "wb")
index_writer = iixr.TermIndexWriter(index_stream)
position_stream = open("testP", "wb")
position_writer = iixr.PositionWriter(position_stream)
dict_writer = iixr.TermDictionaryWriter(term_writer, index_writer, position_writer, 3)
for term, doc_positions in terms_with_positions:
    dict_writer.write_term_positions(term, doc_positions)
dict_writer.close()

term_stream = open("test", "rb")
term_reader = iixr.TermReader(term_stream)
index_stream = open("testI", "rb")
index_reader = iixr.TermIndexReader(index_stream)
position_stream = open("testP", "rb")
position_reader = iixr.PositionReader(position_stream)
dict_reader = iixr.TermDictionaryReader(term_reader, index_reader, position_reader)

# Look terms up in the opposite order to the one they were written in.
terms_reversed = terms_with_positions[::-1]
for term, doc_positions in terms_reversed:
    found = list(dict_reader.find_positions(term))
    print("%s %s %s" % (doc_positions == found, doc_positions, found))

# Terms that were never written should produce None.
for term in ("dog", "dingo"):
    found = dict_reader.find_positions(term)
    print("%s %s" % (found is None, found))

# (Test sequential access.)

dict_reader.rewind()
for term, doc_positions in terms_with_positions:
    found_term, found_frequency, found = dict_reader.read_term()
    found = list(found)
    print("%s %s %s" % (term == found_term, term, found_term))
    print("%s %s %s" % (doc_positions == found, doc_positions, found))
dict_reader.close()

# Test high-level index operations (including merging).
# (document number, text) pairs to be indexed.
docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore")
]

# (term, expected frequency, expected document positions) triples.
doc_tests = [
    ("Every", 2, [(2, [0]), (14, [0])]),
    ("good", 2, [(2, [1]), (13, [1])]),
    ("deserves", 2, [(2, [3]), (13, [3])]),
    ("sea", 2, [(36, [2, 6])])
]

index = iixr.Index("test_index")
index_writer = index.get_writer(3, 6)

# Index every whitespace-separated word by its position in the text and
# store the text itself as field 123.
# NOTE(review): add_field is assumed to run once per document (outer loop),
# not once per word - confirm.
for docnum, text in docs:
    words = text.split()
    for position, term in enumerate(words):
        index_writer.add_position(term, docnum, position)
    index_writer.add_field(docnum, 123, text)
index_writer.close()

index_reader = index.get_reader()
for term, frequency, doc_positions in doc_tests:
    found_positions = list(index_reader.find_positions(term))
    print("%s %s %s" % (doc_positions == found_positions, doc_positions, found_positions))
    found_frequency = index_reader.get_frequency(term)
    print("%s %s %s" % (frequency == found_frequency, frequency, found_frequency))

# The first stored field of each document should be (123, text).
for docnum, text in docs:
    found_fields = index_reader.get_fields(docnum)
    expected = (123, text)
    print("%s %s %s" % (expected == found_fields[0], expected, found_fields[0]))
index.close()

# vim: tabstop=4 expandtab shiftwidth=4