paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@0 | 3 | import iixr |
paul@0 | 4 | |
paul@9 | 5 | # Test basic data types. |
paul@9 | 6 | |
paul@5 | 7 | numbers = [12345678, 0, 1, 127, 128, 255, 256] |
paul@0 | 8 | |
paul@0 | 9 | f = open("test", "wb") |
paul@0 | 10 | w = iixr.FileWriter(f) |
paul@0 | 11 | for number in numbers: |
paul@0 | 12 | w.write_number(number) |
paul@0 | 13 | w.close() |
paul@0 | 14 | |
paul@3 | 15 | f = open("test", "rb") |
paul@0 | 16 | r = iixr.FileReader(f) |
paul@0 | 17 | for number in numbers: |
paul@0 | 18 | n = r.read_number() |
paul@0 | 19 | print number == n, number, n |
paul@0 | 20 | r.close() |
paul@0 | 21 | |
paul@9 | 22 | # Test positions. |
paul@9 | 23 | |
paul@0 | 24 | all_doc_positions = [ |
paul@0 | 25 | [ |
paul@0 | 26 | (123, [1, 3, 5, 15, 25]), |
paul@0 | 27 | (124, [0, 100]) |
paul@0 | 28 | ], |
paul@0 | 29 | [ |
paul@0 | 30 | (78, [9]), |
paul@0 | 31 | (196, [10, 11]) |
paul@0 | 32 | ] |
paul@0 | 33 | ] |
paul@0 | 34 | |
paul@0 | 35 | f = open("test", "wb") |
paul@0 | 36 | w = iixr.PositionWriter(f) |
paul@0 | 37 | for doc_positions in all_doc_positions: |
paul@0 | 38 | for docnum, positions in doc_positions: |
paul@0 | 39 | w.write_positions(docnum, positions) |
paul@0 | 40 | w.reset() |
paul@0 | 41 | w.close() |
paul@0 | 42 | |
paul@3 | 43 | f = open("test", "rb") |
paul@0 | 44 | r = iixr.PositionReader(f) |
paul@0 | 45 | for doc_positions in all_doc_positions: |
paul@0 | 46 | for docnum, positions in doc_positions: |
paul@0 | 47 | d, p = r.read_positions() |
paul@0 | 48 | print docnum == d, docnum, d |
paul@0 | 49 | print positions == p, positions, p |
paul@0 | 50 | r.reset() |
paul@0 | 51 | r.close() |
paul@0 | 52 | |
paul@0 | 53 | f = open("test", "wb") |
paul@0 | 54 | w = iixr.PositionWriter(f) |
paul@0 | 55 | offsets = [] |
paul@0 | 56 | for doc_positions in all_doc_positions: |
paul@12 | 57 | offset, frequency = w.write_term_positions(doc_positions) |
paul@11 | 58 | offsets.append(offset) |
paul@0 | 59 | w.close() |
paul@0 | 60 | |
paul@3 | 61 | f = open("test", "rb") |
paul@0 | 62 | r = iixr.PositionReader(f) |
paul@0 | 63 | offsets.reverse() |
paul@0 | 64 | all_doc_positions.reverse() |
paul@0 | 65 | for offset, doc_positions in zip(offsets, all_doc_positions): |
paul@12 | 66 | dp = r.read_term_positions(offset) |
paul@0 | 67 | print doc_positions == dp, doc_positions, dp |
paul@0 | 68 | r.close() |
paul@0 | 69 | |
paul@9 | 70 | # Test fields. |
paul@9 | 71 | |
paul@8 | 72 | doc_fields = [ |
paul@9 | 73 | (123, ["testing", "fields", "stored", "compressed"]), |
paul@9 | 74 | (456, ["fields", "for a second", "document"]), |
paul@9 | 75 | (789, ["field value"]), |
paul@9 | 76 | (1234, []), |
paul@9 | 77 | (2345, ["abc", "def"]), |
paul@9 | 78 | (3456, ["apple", "banana", "cherry"]), |
paul@9 | 79 | (4567, ["drue", "eple"]) |
paul@8 | 80 | ] |
paul@8 | 81 | |
paul@8 | 82 | f = open("testF", "wb") |
paul@8 | 83 | w = iixr.FieldWriter(f) |
paul@9 | 84 | for docnum, fields in doc_fields: |
paul@13 | 85 | w.write_fields(docnum, list(enumerate(fields))) |
paul@8 | 86 | w.close() |
paul@8 | 87 | |
paul@8 | 88 | f = open("testF", "rb") |
paul@8 | 89 | r = iixr.FieldReader(f) |
paul@9 | 90 | for docnum, fields in doc_fields: |
paul@9 | 91 | dn, df = r.read_fields() |
paul@9 | 92 | print docnum == dn, docnum, dn |
paul@13 | 93 | print list(enumerate(fields)) == df, list(enumerate(fields)), df |
paul@8 | 94 | r.close() |
paul@8 | 95 | |
paul@9 | 96 | # Test field index files. |
paul@9 | 97 | |
paul@9 | 98 | indexed_docs = [ |
paul@9 | 99 | (123, 100000987), |
paul@9 | 100 | (456, 100004321), |
paul@9 | 101 | (789, 100008765) |
paul@9 | 102 | ] |
paul@9 | 103 | |
paul@9 | 104 | f = open("testFI", "wb") |
paul@9 | 105 | w = iixr.FieldIndexWriter(f) |
paul@9 | 106 | for docnum, offset in indexed_docs: |
paul@9 | 107 | w.write_document(docnum, offset) |
paul@9 | 108 | w.close() |
paul@9 | 109 | |
paul@9 | 110 | f = open("testFI", "rb") |
paul@9 | 111 | r = iixr.FieldIndexReader(f) |
paul@9 | 112 | for docnum, offset in indexed_docs: |
paul@9 | 113 | dn, o = r.read_document() |
paul@9 | 114 | print docnum == dn, docnum, dn |
paul@9 | 115 | print offset == o, offset, o |
paul@9 | 116 | r.close() |
paul@9 | 117 | |
paul@9 | 118 | # Test field dictionaries. |
paul@9 | 119 | |
paul@9 | 120 | f = open("testF", "wb") |
paul@9 | 121 | w = iixr.FieldWriter(f) |
paul@9 | 122 | f2 = open("testFI", "wb") |
paul@9 | 123 | w2 = iixr.FieldIndexWriter(f2) |
paul@9 | 124 | wd = iixr.FieldDictionaryWriter(w, w2, 3) |
paul@9 | 125 | for docnum, fields in doc_fields: |
paul@13 | 126 | wd.write_fields(docnum, list(enumerate(fields))) |
paul@9 | 127 | wd.close() |
paul@9 | 128 | |
paul@9 | 129 | f = open("testF", "rb") |
paul@9 | 130 | r = iixr.FieldReader(f) |
paul@9 | 131 | f2 = open("testFI", "rb") |
paul@9 | 132 | r2 = iixr.FieldIndexReader(f2) |
paul@9 | 133 | rd = iixr.FieldDictionaryReader(r, r2) |
paul@9 | 134 | doc_fields_reversed = doc_fields[:] |
paul@9 | 135 | doc_fields_reversed.reverse() |
paul@9 | 136 | for docnum, fields in doc_fields_reversed: |
paul@13 | 137 | df = rd.get_fields(docnum) |
paul@13 | 138 | print list(enumerate(fields)) == df, list(enumerate(fields)), df |
paul@9 | 139 | for docnum in (13579, 246810): |
paul@13 | 140 | df = rd.get_fields(docnum) |
paul@9 | 141 | print df is None, df |
paul@13 | 142 | |
paul@13 | 143 | # (Test sequential access.) |
paul@13 | 144 | |
paul@13 | 145 | rd.rewind() |
paul@13 | 146 | for docnum, fields in doc_fields: |
paul@13 | 147 | dn, df = rd.read_fields() |
paul@13 | 148 | print docnum == dn, docnum, dn |
paul@13 | 149 | print list(enumerate(fields)) == df, list(enumerate(fields)), df |
paul@9 | 150 | rd.close() |
paul@9 | 151 | |
paul@9 | 152 | # Test terms. |
paul@9 | 153 | |
paul@2 | 154 | terms = [ |
paul@11 | 155 | # term offset frequency |
paul@11 | 156 | ("aardvark", 100000123, 1), |
paul@11 | 157 | ("anteater", 100000456, 2), |
paul@11 | 158 | ("badger", 100000789, 13), |
paul@11 | 159 | ("bull", 1000001234, 59), |
paul@11 | 160 | ("bulldog", 1000002345, 99), |
paul@11 | 161 | ("cat", 1000003456, 89) |
paul@2 | 162 | ] |
paul@2 | 163 | |
paul@2 | 164 | f = open("test", "wb") |
paul@2 | 165 | w = iixr.TermWriter(f) |
paul@11 | 166 | for term, offset, frequency in terms: |
paul@11 | 167 | w.write_term(term, offset, frequency) |
paul@2 | 168 | w.close() |
paul@2 | 169 | |
paul@3 | 170 | f = open("test", "rb") |
paul@2 | 171 | r = iixr.TermReader(f) |
paul@11 | 172 | for term, offset, frequency in terms: |
paul@11 | 173 | t, o, fr = r.read_term() |
paul@2 | 174 | print term == t, term, t |
paul@2 | 175 | print offset == o, offset, o |
paul@11 | 176 | print frequency == fr, frequency, fr |
paul@2 | 177 | r.close() |
paul@2 | 178 | |
paul@9 | 179 | # Test terms in index files. |
paul@9 | 180 | |
paul@3 | 181 | indexed_terms = [ |
paul@11 | 182 | # term offset frequency info_offset |
paul@11 | 183 | ("aardvark", 100000123, 1, 200000321), |
paul@11 | 184 | ("anteater", 100000456, 2, 200000654), |
paul@11 | 185 | ("badger", 100000789, 13, 200000987), |
paul@11 | 186 | ("bull", 1000001234, 59, 200004321), |
paul@11 | 187 | ("bulldog", 1000002345, 99, 200005432), |
paul@11 | 188 | ("cat", 1000003456, 89, 200006543) |
paul@3 | 189 | ] |
paul@3 | 190 | |
paul@3 | 191 | f = open("test", "wb") |
paul@3 | 192 | w = iixr.TermIndexWriter(f) |
paul@11 | 193 | for term, offset, frequency, info_offset in indexed_terms: |
paul@11 | 194 | w.write_term(term, offset, frequency, info_offset) |
paul@3 | 195 | w.close() |
paul@3 | 196 | |
paul@3 | 197 | f = open("test", "rb") |
paul@3 | 198 | r = iixr.TermIndexReader(f) |
paul@11 | 199 | for term, offset, frequency, info_offset in indexed_terms: |
paul@11 | 200 | t, o, fr, i = r.read_term() |
paul@3 | 201 | print term == t, term, t |
paul@3 | 202 | print offset == o, offset, o |
paul@11 | 203 | print frequency == fr, frequency, fr |
paul@3 | 204 | print info_offset == i, info_offset, i |
paul@3 | 205 | r.close() |
paul@3 | 206 | |
paul@9 | 207 | # Test dictionaries with only term data. |
paul@9 | 208 | |
paul@3 | 209 | f = open("test", "wb") |
paul@3 | 210 | w = iixr.TermWriter(f) |
paul@3 | 211 | f2 = open("testI", "wb") |
paul@3 | 212 | w2 = iixr.TermIndexWriter(f2) |
paul@5 | 213 | f3 = open("testP", "wb") |
paul@5 | 214 | w3 = iixr.PositionWriter(f3) |
paul@5 | 215 | wd = iixr.TermDictionaryWriter(w, w2, w3, 3) |
paul@11 | 216 | for term, offset, frequency in terms: |
paul@11 | 217 | wd._write_term(term, offset, frequency) |
paul@5 | 218 | wd.close() |
paul@3 | 219 | |
paul@3 | 220 | f = open("test", "rb") |
paul@3 | 221 | r = iixr.TermReader(f) |
paul@3 | 222 | f2 = open("testI", "rb") |
paul@3 | 223 | r2 = iixr.TermIndexReader(f2) |
paul@5 | 224 | f3 = open("testP", "rb") |
paul@5 | 225 | r3 = iixr.PositionReader(f3) |
paul@5 | 226 | rd = iixr.TermDictionaryReader(r, r2, r3) |
paul@3 | 227 | terms_reversed = terms[:] |
paul@3 | 228 | terms_reversed.reverse() |
paul@11 | 229 | for term, offset, frequency in terms_reversed: |
paul@11 | 230 | o, fr = rd._find_term(term) |
paul@3 | 231 | print offset == o, offset, o |
paul@11 | 232 | print frequency == fr, frequency, fr |
paul@3 | 233 | for term in ("dog", "dingo"): |
paul@11 | 234 | t = rd._find_term(term) |
paul@11 | 235 | print t is None, t |
paul@5 | 236 | rd.close() |
paul@5 | 237 | |
paul@9 | 238 | # Test dictionaries with term and position data. |
paul@9 | 239 | |
paul@5 | 240 | terms_with_positions = [ |
paul@5 | 241 | ("aardvark", [(1, [2, 45, 96]), (20, [13])]), |
paul@5 | 242 | ("anteater", [(1, [43, 44])]), |
paul@5 | 243 | ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), |
paul@5 | 244 | ("bull", [(6, [128]), (16, [12])]), |
paul@5 | 245 | ("bulldog", [(43, [17, 19, 256, 512])]), |
paul@5 | 246 | ("cat", [(123, [12, 145, 196]), (1200, [113])]) |
paul@5 | 247 | ] |
paul@5 | 248 | |
paul@5 | 249 | f = open("test", "wb") |
paul@5 | 250 | w = iixr.TermWriter(f) |
paul@5 | 251 | f2 = open("testI", "wb") |
paul@5 | 252 | w2 = iixr.TermIndexWriter(f2) |
paul@5 | 253 | f3 = open("testP", "wb") |
paul@5 | 254 | w3 = iixr.PositionWriter(f3) |
paul@5 | 255 | wd = iixr.TermDictionaryWriter(w, w2, w3, 3) |
paul@5 | 256 | for term, doc_positions in terms_with_positions: |
paul@5 | 257 | wd.write_term_positions(term, doc_positions) |
paul@5 | 258 | wd.close() |
paul@5 | 259 | |
paul@5 | 260 | f = open("test", "rb") |
paul@5 | 261 | r = iixr.TermReader(f) |
paul@5 | 262 | f2 = open("testI", "rb") |
paul@5 | 263 | r2 = iixr.TermIndexReader(f2) |
paul@5 | 264 | f3 = open("testP", "rb") |
paul@5 | 265 | r3 = iixr.PositionReader(f3) |
paul@5 | 266 | rd = iixr.TermDictionaryReader(r, r2, r3) |
paul@5 | 267 | terms_reversed = terms_with_positions[:] |
paul@5 | 268 | terms_reversed.reverse() |
paul@5 | 269 | for term, doc_positions in terms_reversed: |
paul@5 | 270 | dp = rd.find_positions(term) |
paul@5 | 271 | print doc_positions == dp, doc_positions, dp |
paul@5 | 272 | for term in ("dog", "dingo"): |
paul@5 | 273 | dp = rd.find_positions(term) |
paul@5 | 274 | print dp is None, dp |
paul@12 | 275 | |
paul@12 | 276 | # (Test sequential access.) |
paul@12 | 277 | |
paul@12 | 278 | rd.rewind() |
paul@12 | 279 | for term, doc_positions in terms_with_positions: |
paul@12 | 280 | t, fr, dp = rd.read_term() |
paul@12 | 281 | print term == t, term, t |
paul@12 | 282 | print doc_positions == dp, doc_positions, dp |
paul@5 | 283 | rd.close() |
paul@3 | 284 | |
paul@9 | 285 | # Test high-level index operations. |
paul@9 | 286 | |
paul@6 | 287 | docs = [ |
paul@6 | 288 | (1, "The cat sat on the mat"), |
paul@6 | 289 | (2, "Every good boy deserves football"), |
paul@6 | 290 | (13, "One good turn deserves another"), |
paul@6 | 291 | (14, "Every man for himself"), |
paul@6 | 292 | (25, "Red sky at night shepherd's delight"), |
paul@6 | 293 | (36, "She sells sea shells on the sea shore") |
paul@6 | 294 | ] |
paul@6 | 295 | |
paul@6 | 296 | doc_tests = [ |
paul@11 | 297 | ("Every", 2, [(2, [0]), (14, [0])]), |
paul@11 | 298 | ("good", 2, [(2, [1]), (13, [1])]), |
paul@11 | 299 | ("deserves", 2, [(2, [3]), (13, [3])]), |
paul@11 | 300 | ("sea", 2, [(36, [2, 6])]) |
paul@6 | 301 | ] |
paul@6 | 302 | |
paul@7 | 303 | index = iixr.Index("test_index") |
paul@7 | 304 | wi = index.get_writer(3) |
paul@6 | 305 | for docnum, text in docs: |
paul@6 | 306 | for position, term in enumerate(text.split()): |
paul@6 | 307 | wi.add_position(term, docnum, position) |
paul@13 | 308 | wi.add_field(docnum, 123, text) |
paul@6 | 309 | wi.close() |
paul@6 | 310 | |
paul@7 | 311 | rd = index.get_reader() |
paul@11 | 312 | for term, frequency, doc_positions in doc_tests: |
paul@6 | 313 | dp = rd.find_positions(term) |
paul@6 | 314 | print doc_positions == dp, doc_positions, dp |
paul@11 | 315 | fr = rd.get_frequency(term) |
paul@11 | 316 | print frequency == fr, frequency, fr |
paul@10 | 317 | for docnum, text in docs: |
paul@10 | 318 | df = rd.get_fields(docnum) |
paul@13 | 319 | print (123, text) == df[0], (123, text), df[0] |
paul@7 | 320 | index.close() |
paul@6 | 321 | |
paul@0 | 322 | # vim: tabstop=4 expandtab shiftwidth=4 |