#!/usr/bin/env python
paul@0 | 2 | |
import os
import shutil

import iixr
paul@18 | 5 | |
# Remove old test files.

def remove_old_test_files(filenames=("test", "testF", "testFI", "testI", "testP"),
                          index_dir="test_index"):
    """
    Delete scratch files and the index directory left by an earlier run.

    'filenames' are plain files removed individually; 'index_dir' is a
    directory removed together with everything inside it. Anything that does
    not exist (or cannot be removed) is skipped silently, since cleanup is
    best-effort.
    """

    for filename in filenames:
        try:
            os.remove(filename)
        except OSError:
            pass

    # os.removedirs only deletes empty directories, so a populated index
    # directory from a previous run would survive; rmtree also removes the
    # directory's contents.
    try:
        shutil.rmtree(index_dir)
    except OSError:
        pass

remove_old_test_files()
paul@0 | 18 | |
# Test basic data types.

# Values written with FileWriter.write_number and read back with
# FileReader.read_number.
numbers = [12345678, 0, 1, 127, 128, 255, 256]

# Write every number to the scratch file.
f = open("test", "wb")
w = iixr.FileWriter(f)
for number in numbers:
    w.write_number(number)
w.close()

# Read the numbers back in the same order. Each output line shows whether
# the value matched, then the expected and the actual value.
f = open("test", "rb")
r = iixr.FileReader(f)
for number in numbers:
    n = r.read_number()
    print("%s %s %s" % (number == n, number, n))
r.close()
paul@0 | 35 | |
# Test positions.

# Two groups of (docnum, positions) pairs; each group is written and read
# as a unit below.
all_doc_positions = [
    [
        (123, [1, 3, 5, 15, 25]),
        (124, [0, 100]),
    ],
    [
        (78, [9]),
        (196, [10, 11]),
    ],
]

# Write each document's positions individually, resetting the writer after
# every group.
f = open("test", "wb")
w = iixr.PositionWriter(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        w.write_positions(docnum, positions)
    w.reset()
w.close()

# Read the records back in the same order, resetting the reader after every
# group, and compare document numbers and position lists.
f = open("test", "rb")
r = iixr.PositionReader(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        d, p = r.read_positions()
        print("%s %s %s" % (docnum == d, docnum, d))
        print("%s %s %s" % (positions == p, positions, p))
    r.reset()
r.close()

# Write each group in one call, remembering the offset reported for it.
# (The frequency returned alongside the offset is not checked here.)
f = open("test", "wb")
w = iixr.PositionWriter(f)
offsets = []
for doc_positions in all_doc_positions:
    offset, frequency = w.write_term_positions(doc_positions)
    offsets.append(offset)
w.close()

# Fetch the groups by offset in the reverse of the order they were written.
f = open("test", "rb")
r = iixr.PositionReader(f)
offsets.reverse()
all_doc_positions.reverse()
for offset, doc_positions in zip(offsets, all_doc_positions):
    dp = list(r.read_term_positions(offset))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
r.close()
paul@0 | 83 | |
# Test fields.

# (docnum, field values) pairs; the values are numbered with enumerate
# before being written, giving (field number, value) pairs.
doc_fields = [
    (123, ["testing", "fields", "stored", "compressed"]),
    (456, ["fields", "for a second", "document"]),
    (789, ["field value"]),
    (1234, []),
    (2345, ["abc", "def"]),
    (3456, ["apple", "banana", "cherry"]),
    (4567, ["drue", "eple"]),
]

# Write the numbered fields of every document.
f = open("testF", "wb")
w = iixr.FieldWriter(f)
for docnum, fields in doc_fields:
    w.write_fields(docnum, list(enumerate(fields)))
w.close()

# Read the documents back in order and compare both the document number and
# the numbered fields.
f = open("testF", "rb")
r = iixr.FieldReader(f)
for docnum, fields in doc_fields:
    dn, df = r.read_fields()
    expected = list(enumerate(fields))
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (expected == df, expected, df))
r.close()
paul@8 | 109 | |
# Test field index files.

# (docnum, offset) pairs to store in the field index.
indexed_docs = [
    (123, 100000987),
    (456, 100004321),
    (789, 100008765),
]

# Write one index entry per document.
f = open("testFI", "wb")
w = iixr.FieldIndexWriter(f)
for docnum, offset in indexed_docs:
    w.write_document(docnum, offset)
w.close()

# Read the entries back in order and compare document numbers and offsets.
f = open("testFI", "rb")
r = iixr.FieldIndexReader(f)
for docnum, offset in indexed_docs:
    dn, o = r.read_document()
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (offset == o, offset, o))
r.close()
paul@9 | 131 | |
# Test field dictionaries.

# Build a field dictionary from a field writer and a field index writer.
# NOTE(review): the third argument (3) is presumably the indexing interval;
# confirm against iixr.FieldDictionaryWriter.
f = open("testF", "wb")
w = iixr.FieldWriter(f)
f2 = open("testFI", "wb")
w2 = iixr.FieldIndexWriter(f2)
wd = iixr.FieldDictionaryWriter(w, w2, 3)
for docnum, fields in doc_fields:
    wd.write_fields(docnum, list(enumerate(fields)))
wd.close()

# Look the documents up by number, in the reverse of the order written.
f = open("testF", "rb")
r = iixr.FieldReader(f)
f2 = open("testFI", "rb")
r2 = iixr.FieldIndexReader(f2)
rd = iixr.FieldDictionaryReader(r, r2)
doc_fields_reversed = doc_fields[::-1]
for docnum, fields in doc_fields_reversed:
    df = rd.get_fields(docnum)
    expected = list(enumerate(fields))
    print("%s %s %s" % (expected == df, expected, df))

# Document numbers that were never written are expected to give None.
for docnum in (13579, 246810):
    df = rd.get_fields(docnum)
    print("%s %s" % (df is None, df))

# (Test sequential access.)

rd.rewind()
for docnum, fields in doc_fields:
    dn, df = rd.read_fields()
    expected = list(enumerate(fields))
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (expected == df, expected, df))
rd.close()
paul@9 | 165 | |
# Test terms.

terms = [
    # term       offset      frequency
    ("aardvark", 100000123,  1),
    ("anteater", 100000456,  2),
    ("badger",   100000789,  13),
    ("bull",     1000001234, 59),
    ("bulldog",  1000002345, 99),
    ("cat",      1000003456, 89),
]

# Write every term record.
f = open("test", "wb")
w = iixr.TermWriter(f)
for term, offset, frequency in terms:
    w.write_term(term, offset, frequency)
w.close()

# Read the records back in order and compare each component.
f = open("test", "rb")
r = iixr.TermReader(f)
for term, offset, frequency in terms:
    t, o, fr = r.read_term()
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (offset == o, offset, o))
    print("%s %s %s" % (frequency == fr, frequency, fr))
r.close()
paul@2 | 192 | |
# Test terms in index files.

indexed_terms = [
    # term       offset      frequency  info_offset
    ("aardvark", 100000123,  1,         200000321),
    ("anteater", 100000456,  2,         200000654),
    ("badger",   100000789,  13,        200000987),
    ("bull",     1000001234, 59,        200004321),
    ("bulldog",  1000002345, 99,        200005432),
    ("cat",      1000003456, 89,        200006543),
]

# Write every index record, including its info offset.
f = open("test", "wb")
w = iixr.TermIndexWriter(f)
for term, offset, frequency, info_offset in indexed_terms:
    w.write_term(term, offset, frequency, info_offset)
w.close()

# Read the records back in order and compare all four components.
f = open("test", "rb")
r = iixr.TermIndexReader(f)
for term, offset, frequency, info_offset in indexed_terms:
    t, o, fr, i = r.read_term()
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (offset == o, offset, o))
    print("%s %s %s" % (frequency == fr, frequency, fr))
    print("%s %s %s" % (info_offset == i, info_offset, i))
r.close()
paul@3 | 220 | |
# Test dictionaries with only term data.

# Build a term dictionary from term, term index and position writers; only
# the term records are written here, through the private _write_term method.
# NOTE(review): the fourth argument (3) is presumably the indexing interval;
# confirm against iixr.TermDictionaryWriter.
f = open("test", "wb")
w = iixr.TermWriter(f)
f2 = open("testI", "wb")
w2 = iixr.TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = iixr.PositionWriter(f3)
wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
for term, offset, frequency in terms:
    wd._write_term(term, offset, frequency)
wd.close()

# Look the terms up in the reverse of the order written.
f = open("test", "rb")
r = iixr.TermReader(f)
f2 = open("testI", "rb")
r2 = iixr.TermIndexReader(f2)
f3 = open("testP", "rb")
r3 = iixr.PositionReader(f3)
rd = iixr.TermDictionaryReader(r, r2, r3)
terms_reversed = terms[::-1]
for term, offset, frequency in terms_reversed:
    o, fr = rd._find_term(term)
    print("%s %s %s" % (offset == o, offset, o))
    print("%s %s %s" % (frequency == fr, frequency, fr))

# Terms that were never written are expected to give None.
for term in ("dog", "dingo"):
    t = rd._find_term(term)
    print("%s %s" % (t is None, t))
rd.close()
paul@5 | 251 | |
# Test dictionaries with term and position data.

# (term, [(docnum, positions), ...]) pairs.
terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])]),
]

# Write each term together with its document positions.
f = open("test", "wb")
w = iixr.TermWriter(f)
f2 = open("testI", "wb")
w2 = iixr.TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = iixr.PositionWriter(f3)
wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
for term, doc_positions in terms_with_positions:
    wd.write_term_positions(term, doc_positions)
wd.close()

# Look the terms up in the reverse of the order written and compare the
# positions found for each.
f = open("test", "rb")
r = iixr.TermReader(f)
f2 = open("testI", "rb")
r2 = iixr.TermIndexReader(f2)
f3 = open("testP", "rb")
r3 = iixr.PositionReader(f3)
rd = iixr.TermDictionaryReader(r, r2, r3)
terms_reversed = terms_with_positions[::-1]
for term, doc_positions in terms_reversed:
    dp = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))

# Terms that were never written are expected to give None.
for term in ("dog", "dingo"):
    dp = rd.find_positions(term)
    print("%s %s" % (dp is None, dp))

# (Test sequential access.)

rd.rewind()
for term, doc_positions in terms_with_positions:
    # The frequency returned with each term is not checked here.
    t, fr, dp = rd.read_term()
    dp = list(dp)
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
rd.close()
paul@3 | 299 | |
# Test high-level index operations (including merging).

# (docnum, text) pairs; each text is split on whitespace into terms.
docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore"),
]

# (term, expected frequency, expected [(docnum, positions), ...]) triples.
doc_tests = [
    ("Every", 2, [(2, [0]), (14, [0])]),
    ("good", 2, [(2, [1]), (13, [1])]),
    ("deserves", 2, [(2, [3]), (13, [3])]),
    ("sea", 2, [(36, [2, 6])]),
]

# Index every word position of every document, and store the whole text as
# field number 123 of the document.
# NOTE(review): the meaning of the get_writer arguments (3, 6) is not visible
# here, presumably an indexing interval and a flush interval; confirm against
# iixr.Index.get_writer.
index = iixr.Index("test_index")
wi = index.get_writer(3, 6)
for docnum, text in docs:
    for position, term in enumerate(text.split()):
        wi.add_position(term, docnum, position)
    wi.add_field(docnum, 123, text)
wi.close()

# Check the positions and frequency reported for each test term.
rd = index.get_reader()
for term, frequency, doc_positions in doc_tests:
    dp = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
    fr = rd.get_frequency(term)
    print("%s %s %s" % (frequency == fr, frequency, fr))

# Check that the first stored field of each document is the indexed text.
for docnum, text in docs:
    df = rd.get_fields(docnum)
    print("%s %s %s" % ((123, text) == df[0], (123, text), df[0]))
index.close()
paul@6 | 336 | |
# vim: tabstop=4 expandtab shiftwidth=4