1 #!/usr/bin/env python 2 3 import iixr 4 5 # Test basic data types. 6 7 numbers = [12345678, 0, 1, 127, 128, 255, 256] 8 9 f = open("test", "wb") 10 w = iixr.FileWriter(f) 11 for number in numbers: 12 w.write_number(number) 13 w.close() 14 15 f = open("test", "rb") 16 r = iixr.FileReader(f) 17 for number in numbers: 18 n = r.read_number() 19 print number == n, number, n 20 r.close() 21 22 # Test positions. 23 24 all_doc_positions = [ 25 [ 26 (123, [1, 3, 5, 15, 25]), 27 (124, [0, 100]) 28 ], 29 [ 30 (78, [9]), 31 (196, [10, 11]) 32 ] 33 ] 34 35 f = open("test", "wb") 36 w = iixr.PositionWriter(f) 37 for doc_positions in all_doc_positions: 38 for docnum, positions in doc_positions: 39 w.write_positions(docnum, positions) 40 w.reset() 41 w.close() 42 43 f = open("test", "rb") 44 r = iixr.PositionReader(f) 45 for doc_positions in all_doc_positions: 46 for docnum, positions in doc_positions: 47 d, p = r.read_positions() 48 print docnum == d, docnum, d 49 print positions == p, positions, p 50 r.reset() 51 r.close() 52 53 f = open("test", "wb") 54 w = iixr.PositionWriter(f) 55 offsets = [] 56 for doc_positions in all_doc_positions: 57 offsets.append( 58 w.write_all_positions(doc_positions) 59 ) 60 w.close() 61 62 f = open("test", "rb") 63 r = iixr.PositionReader(f) 64 offsets.reverse() 65 all_doc_positions.reverse() 66 for offset, doc_positions in zip(offsets, all_doc_positions): 67 dp = r.read_all_positions(offset) 68 print doc_positions == dp, doc_positions, dp 69 r.close() 70 71 # Test fields. 72 73 doc_fields = [ 74 (123, ["testing", "fields", "stored", "compressed"]), 75 (456, ["fields", "for a second", "document"]), 76 (789, ["field value"]), 77 (1234, []), 78 (2345, ["abc", "def"]), 79 (3456, ["apple", "banana", "cherry"]), 80 (4567, ["drue", "eple"]) 81 ] 82 83 f = open("testF", "wb") 84 w = iixr.FieldWriter(f) 85 for docnum, fields in doc_fields: 86 w.write_fields(docnum, fields) 87 w.close() 88 89 f = open("testF", "rb") 90 r = iixr.FieldReader(f) 91 for docnum, fields in doc_fields: 92 dn, df = r.read_fields() 93 print docnum == dn, docnum, dn 94 print fields == df, fields, df 95 r.close() 96 97 # Test field index files. 98 99 indexed_docs = [ 100 (123, 100000987), 101 (456, 100004321), 102 (789, 100008765) 103 ] 104 105 f = open("testFI", "wb") 106 w = iixr.FieldIndexWriter(f) 107 for docnum, offset in indexed_docs: 108 w.write_document(docnum, offset) 109 w.close() 110 111 f = open("testFI", "rb") 112 r = iixr.FieldIndexReader(f) 113 for docnum, offset in indexed_docs: 114 dn, o = r.read_document() 115 print docnum == dn, docnum, dn 116 print offset == o, offset, o 117 r.close() 118 119 # Test field dictionaries. 120 121 f = open("testF", "wb") 122 w = iixr.FieldWriter(f) 123 f2 = open("testFI", "wb") 124 w2 = iixr.FieldIndexWriter(f2) 125 wd = iixr.FieldDictionaryWriter(w, w2, 3) 126 for docnum, fields in doc_fields: 127 wd.write_fields(docnum, fields) 128 wd.close() 129 130 f = open("testF", "rb") 131 r = iixr.FieldReader(f) 132 f2 = open("testFI", "rb") 133 r2 = iixr.FieldIndexReader(f2) 134 rd = iixr.FieldDictionaryReader(r, r2) 135 doc_fields_reversed = doc_fields[:] 136 doc_fields_reversed.reverse() 137 for docnum, fields in doc_fields_reversed: 138 df = rd.read_fields(docnum) 139 print fields == df, fields, df 140 for docnum in (13579, 246810): 141 df = rd.read_fields(docnum) 142 print df is None, df 143 rd.close() 144 145 # Test terms. 146 147 terms = [ 148 ("aardvark", 100000123), 149 ("anteater", 100000456), 150 ("badger", 100000789), 151 ("bull", 1000001234), 152 ("bulldog", 1000002345), 153 ("cat", 1000003456) 154 ] 155 156 f = open("test", "wb") 157 w = iixr.TermWriter(f) 158 for term, offset in terms: 159 w.write_term(term, offset) 160 w.close() 161 162 f = open("test", "rb") 163 r = iixr.TermReader(f) 164 for term, offset in terms: 165 t, o = r.read_term() 166 print term == t, term, t 167 print offset == o, offset, o 168 r.close() 169 170 # Test terms in index files. 171 172 indexed_terms = [ 173 ("aardvark", 100000123, 200000321), 174 ("anteater", 100000456, 200000654), 175 ("badger", 100000789, 200000987), 176 ("bull", 1000001234, 200004321), 177 ("bulldog", 1000002345, 200005432), 178 ("cat", 1000003456, 200006543) 179 ] 180 181 f = open("test", "wb") 182 w = iixr.TermIndexWriter(f) 183 for term, offset, info_offset in indexed_terms: 184 w.write_term(term, offset, info_offset) 185 w.close() 186 187 f = open("test", "rb") 188 r = iixr.TermIndexReader(f) 189 for term, offset, info_offset in indexed_terms: 190 t, o, i = r.read_term() 191 print term == t, term, t 192 print offset == o, offset, o 193 print info_offset == i, info_offset, i 194 r.close() 195 196 # Test dictionaries with only term data. 197 198 f = open("test", "wb") 199 w = iixr.TermWriter(f) 200 f2 = open("testI", "wb") 201 w2 = iixr.TermIndexWriter(f2) 202 f3 = open("testP", "wb") 203 w3 = iixr.PositionWriter(f3) 204 wd = iixr.TermDictionaryWriter(w, w2, w3, 3) 205 for term, offset in terms: 206 wd._write_term(term, offset) 207 wd.close() 208 209 f = open("test", "rb") 210 r = iixr.TermReader(f) 211 f2 = open("testI", "rb") 212 r2 = iixr.TermIndexReader(f2) 213 f3 = open("testP", "rb") 214 r3 = iixr.PositionReader(f3) 215 rd = iixr.TermDictionaryReader(r, r2, r3) 216 terms_reversed = terms[:] 217 terms_reversed.reverse() 218 for term, offset in terms_reversed: 219 o = rd._find_term(term) 220 print offset == o, offset, o 221 for term in ("dog", "dingo"): 222 o = rd._find_term(term) 223 print o is None, o 224 rd.close() 225 226 # Test dictionaries with term and position data. 227 228 terms_with_positions = [ 229 ("aardvark", [(1, [2, 45, 96]), (20, [13])]), 230 ("anteater", [(1, [43, 44])]), 231 ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), 232 ("bull", [(6, [128]), (16, [12])]), 233 ("bulldog", [(43, [17, 19, 256, 512])]), 234 ("cat", [(123, [12, 145, 196]), (1200, [113])]) 235 ] 236 237 f = open("test", "wb") 238 w = iixr.TermWriter(f) 239 f2 = open("testI", "wb") 240 w2 = iixr.TermIndexWriter(f2) 241 f3 = open("testP", "wb") 242 w3 = iixr.PositionWriter(f3) 243 wd = iixr.TermDictionaryWriter(w, w2, w3, 3) 244 for term, doc_positions in terms_with_positions: 245 wd.write_term_positions(term, doc_positions) 246 wd.close() 247 248 f = open("test", "rb") 249 r = iixr.TermReader(f) 250 f2 = open("testI", "rb") 251 r2 = iixr.TermIndexReader(f2) 252 f3 = open("testP", "rb") 253 r3 = iixr.PositionReader(f3) 254 rd = iixr.TermDictionaryReader(r, r2, r3) 255 terms_reversed = terms_with_positions[:] 256 terms_reversed.reverse() 257 for term, doc_positions in terms_reversed: 258 dp = rd.find_positions(term) 259 print doc_positions == dp, doc_positions, dp 260 for term in ("dog", "dingo"): 261 dp = rd.find_positions(term) 262 print dp is None, dp 263 rd.close() 264 265 # Test high-level index operations. 266 267 docs = [ 268 (1, "The cat sat on the mat"), 269 (2, "Every good boy deserves football"), 270 (13, "One good turn deserves another"), 271 (14, "Every man for himself"), 272 (25, "Red sky at night shepherd's delight"), 273 (36, "She sells sea shells on the sea shore") 274 ] 275 276 doc_tests = [ 277 ("Every", [(2, [0]), (14, [0])]), 278 ("good", [(2, [1]), (13, [1])]), 279 ("deserves", [(2, [3]), (13, [3])]), 280 ("sea", [(36, [2, 6])]) 281 ] 282 283 index = iixr.Index("test_index") 284 wi = index.get_writer(3) 285 for docnum, text in docs: 286 for position, term in enumerate(text.split()): 287 wi.add_position(term, docnum, position) 288 wi.close() 289 290 rd = index.get_reader() 291 for term, doc_positions in doc_tests: 292 dp = rd.find_positions(term) 293 print doc_positions == dp, doc_positions, dp 294 index.close() 295 296 # vim: tabstop=4 expandtab shiftwidth=4