#!/usr/bin/env python

import iixr

# Every check in this script prints one line of the form
# "<matched> <expected> <actual>", where <matched> is True when the value
# read back equals the value written.  The lines are built with %-formatting
# so that the output is the same whether "print" is parsed as the Python 2
# statement or called as the Python 3 function.

# Test basic data types.

numbers = [12345678, 0, 1, 127, 128, 255, 256]

# Write every number to the "test" file...
f = open("test", "wb")
w = iixr.FileWriter(f)
for number in numbers:
    w.write_number(number)
w.close()

# ...and read them back in the same order.
f = open("test", "rb")
r = iixr.FileReader(f)
for number in numbers:
    n = r.read_number()
    print("%s %s %s" % (number == n, number, n))
r.close()

# Test positions.

# Two groups of (docnum, [positions]) entries.
all_doc_positions = [
    [
        (123, [1, 3, 5, 15, 25]),
        (124, [0, 100])
    ],
    [
        (78, [9]),
        (196, [10, 11])
    ]
]

# Write the groups one document at a time.  The writer is reset after each
# group; note that document numbers start again from a lower value in the
# second group.
# NOTE(review): reset() presumably clears state carried from one document to
# the next - confirm against iixr.
f = open("test", "wb")
w = iixr.PositionWriter(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        w.write_positions(docnum, positions)
    w.reset()
w.close()

# Read the groups back, resetting the reader at the same points.
f = open("test", "rb")
r = iixr.PositionReader(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        d, p = r.read_positions()
        print("%s %s %s" % (docnum == d, docnum, d))
        print("%s %s %s" % (positions == p, positions, p))
    r.reset()
r.close()

# Write each group in a single call, remembering the offset returned for it.
f = open("test", "wb")
w = iixr.PositionWriter(f)
offsets = []
for doc_positions in all_doc_positions:
    # The second returned value (a frequency) is not needed by this test.
    offset, _frequency = w.write_term_positions(doc_positions)
    offsets.append(offset)
w.close()

# Read the groups back by offset, in the opposite order to the one in which
# they were written.  Both lists are reversed in place so they stay paired.
f = open("test", "rb")
r = iixr.PositionReader(f)
offsets.reverse()
all_doc_positions.reverse()
for offset, doc_positions in zip(offsets, all_doc_positions):
    dp = r.read_term_positions(offset)
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
r.close()

# Test fields.

# (docnum, [field values]) entries; document 1234 has no fields at all.
doc_fields = [
    (123, ["testing", "fields", "stored", "compressed"]),
    (456, ["fields", "for a second", "document"]),
    (789, ["field value"]),
    (1234, []),
    (2345, ["abc", "def"]),
    (3456, ["apple", "banana", "cherry"]),
    (4567, ["drue", "eple"])
]

# Fields are written as (index, value) pairs produced by enumerate().
f = open("testF", "wb")
w = iixr.FieldWriter(f)
for docnum, fields in doc_fields:
    w.write_fields(docnum, list(enumerate(fields)))
w.close()

# Read the documents back in order.  Each check prints
# "<matched> <expected> <actual>".
f = open("testF", "rb")
r = iixr.FieldReader(f)
for docnum, fields in doc_fields:
    dn, df = r.read_fields()
    expected = list(enumerate(fields))
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (expected == df, expected, df))
r.close()

# Test field index files.

# (docnum, offset) entries.
indexed_docs = [
    (123, 100000987),
    (456, 100004321),
    (789, 100008765)
]

f = open("testFI", "wb")
w = iixr.FieldIndexWriter(f)
for docnum, offset in indexed_docs:
    w.write_document(docnum, offset)
w.close()

f = open("testFI", "rb")
r = iixr.FieldIndexReader(f)
for docnum, offset in indexed_docs:
    dn, o = r.read_document()
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (offset == o, offset, o))
r.close()

# Test field dictionaries.

# The dictionary writer wraps a field writer and a field index writer.
# NOTE(review): the trailing 3 is presumably the indexing interval - confirm
# against iixr.
f = open("testF", "wb")
w = iixr.FieldWriter(f)
f2 = open("testFI", "wb")
w2 = iixr.FieldIndexWriter(f2)
wd = iixr.FieldDictionaryWriter(w, w2, 3)
for docnum, fields in doc_fields:
    wd.write_fields(docnum, list(enumerate(fields)))
wd.close()

# Look the documents up by number, in the reverse of the order in which they
# were written.
f = open("testF", "rb")
r = iixr.FieldReader(f)
f2 = open("testFI", "rb")
r2 = iixr.FieldIndexReader(f2)
rd = iixr.FieldDictionaryReader(r, r2)
doc_fields_reversed = doc_fields[:]
doc_fields_reversed.reverse()
for docnum, fields in doc_fields_reversed:
    df = rd.get_fields(docnum)
    expected = list(enumerate(fields))
    print("%s %s %s" % (expected == df, expected, df))

# Document numbers that were never written are expected to yield None.
for docnum in (13579, 246810):
    df = rd.get_fields(docnum)
    print("%s %s" % (df is None, df))

# (Test sequential access.)

# Rewind the field dictionary reader and read every document in the order in
# which it was written.  Each check prints "<matched> <expected> <actual>".
rd.rewind()
for docnum, fields in doc_fields:
    dn, df = rd.read_fields()
    expected = list(enumerate(fields))
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (expected == df, expected, df))
rd.close()

# Test terms.

terms = [
    # term       offset      frequency
    ("aardvark", 100000123, 1),
    ("anteater", 100000456, 2),
    ("badger", 100000789, 13),
    ("bull", 1000001234, 59),
    ("bulldog", 1000002345, 99),
    ("cat", 1000003456, 89)
]

f = open("test", "wb")
w = iixr.TermWriter(f)
for term, offset, frequency in terms:
    w.write_term(term, offset, frequency)
w.close()

f = open("test", "rb")
r = iixr.TermReader(f)
for term, offset, frequency in terms:
    t, o, fr = r.read_term()
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (offset == o, offset, o))
    print("%s %s %s" % (frequency == fr, frequency, fr))
r.close()

# Test terms in index files.

indexed_terms = [
    # term       offset      frequency  info_offset
    ("aardvark", 100000123, 1, 200000321),
    ("anteater", 100000456, 2, 200000654),
    ("badger", 100000789, 13, 200000987),
    ("bull", 1000001234, 59, 200004321),
    ("bulldog", 1000002345, 99, 200005432),
    ("cat", 1000003456, 89, 200006543)
]

f = open("test", "wb")
w = iixr.TermIndexWriter(f)
for term, offset, frequency, info_offset in indexed_terms:
    w.write_term(term, offset, frequency, info_offset)
w.close()

f = open("test", "rb")
r = iixr.TermIndexReader(f)
for term, offset, frequency, info_offset in indexed_terms:
    t, o, fr, i = r.read_term()
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (offset == o, offset, o))
    print("%s %s %s" % (frequency == fr, frequency, fr))
    print("%s %s %s" % (info_offset == i, info_offset, i))
r.close()

# Test dictionaries with only term data.

# The dictionary writer wraps a term writer, a term index writer and a
# position writer.  Here only the term records are written, through the
# non-public _write_term() method.
# NOTE(review): the trailing 3 is presumably the indexing interval - confirm
# against iixr.
f = open("test", "wb")
w = iixr.TermWriter(f)
f2 = open("testI", "wb")
w2 = iixr.TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = iixr.PositionWriter(f3)
wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
for term, offset, frequency in terms:
    wd._write_term(term, offset, frequency)
wd.close()

# Look the terms up in the reverse of the order in which they were written.
# Each check prints "<matched> <expected> <actual>".
f = open("test", "rb")
r = iixr.TermReader(f)
f2 = open("testI", "rb")
r2 = iixr.TermIndexReader(f2)
f3 = open("testP", "rb")
r3 = iixr.PositionReader(f3)
rd = iixr.TermDictionaryReader(r, r2, r3)
terms_reversed = terms[:]
terms_reversed.reverse()
for term, offset, frequency in terms_reversed:
    o, fr = rd._find_term(term)
    print("%s %s %s" % (offset == o, offset, o))
    print("%s %s %s" % (frequency == fr, frequency, fr))

# Terms that were never written are expected to yield None.
for term in ("dog", "dingo"):
    t = rd._find_term(term)
    print("%s %s" % (t is None, t))
rd.close()

# Test dictionaries with term and position data.

# (term, [(docnum, [positions]), ...]) entries.
terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])])
]

f = open("test", "wb")
w = iixr.TermWriter(f)
f2 = open("testI", "wb")
w2 = iixr.TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = iixr.PositionWriter(f3)
wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
for term, doc_positions in terms_with_positions:
    wd.write_term_positions(term, doc_positions)
wd.close()

# Look the positions up by term, again in reverse order of writing.
f = open("test", "rb")
r = iixr.TermReader(f)
f2 = open("testI", "rb")
r2 = iixr.TermIndexReader(f2)
f3 = open("testP", "rb")
r3 = iixr.PositionReader(f3)
rd = iixr.TermDictionaryReader(r, r2, r3)
terms_reversed = terms_with_positions[:]
terms_reversed.reverse()
for term, doc_positions in terms_reversed:
    dp = rd.find_positions(term)
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))

# Terms that were never written are expected to yield None.
for term in ("dog", "dingo"):
    dp = rd.find_positions(term)
    print("%s %s" % (dp is None, dp))

# (Test sequential access.)

rd.rewind()
for term, doc_positions in terms_with_positions:
    # The frequency returned with each term is not checked here.
    t, fr, dp = rd.read_term()
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
rd.close()

# Test high-level index operations (including merging).

# (docnum, text) entries; each text is split on whitespace into terms.
docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore")
]

# (term, expected frequency, expected [(docnum, [positions]), ...]) entries.
doc_tests = [
    ("Every", 2, [(2, [0]), (14, [0])]),
    ("good", 2, [(2, [1]), (13, [1])]),
    ("deserves", 2, [(2, [3]), (13, [3])]),
    ("sea", 2, [(36, [2, 6])])
]

# Add every word of every document as a term position, and store the whole
# text as field 123 of its document.
# NOTE(review): the meaning of the (3, 6) arguments to get_writer() is not
# visible here; the section heading suggests they affect merging - confirm
# against iixr.
index = iixr.Index("test_index")
wi = index.get_writer(3, 6)
for docnum, text in docs:
    for position, term in enumerate(text.split()):
        wi.add_position(term, docnum, position)
    wi.add_field(docnum, 123, text)
wi.close()

rd = index.get_reader()
for term, frequency, doc_positions in doc_tests:
    dp = rd.find_positions(term)
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
    fr = rd.get_frequency(term)
    print("%s %s %s" % (frequency == fr, frequency, fr))

# Only the first stored field of each document is compared.
for docnum, text in docs:
    df = rd.get_fields(docnum)
    expected_field = (123, text)
    print("%s %s %s" % (expected_field == df[0], expected_field, df[0]))
index.close()

# vim: tabstop=4 expandtab shiftwidth=4