paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@0 | 3 | import iixr |
paul@0 | 4 | |
paul@5 | 5 | numbers = [12345678, 0, 1, 127, 128, 255, 256] |
paul@0 | 6 | |
paul@0 | 7 | f = open("test", "wb") |
paul@0 | 8 | w = iixr.FileWriter(f) |
paul@0 | 9 | for number in numbers: |
paul@0 | 10 | w.write_number(number) |
paul@0 | 11 | w.close() |
paul@0 | 12 | |
paul@3 | 13 | f = open("test", "rb") |
paul@0 | 14 | r = iixr.FileReader(f) |
paul@0 | 15 | for number in numbers: |
paul@0 | 16 | n = r.read_number() |
paul@0 | 17 | print number == n, number, n |
paul@0 | 18 | r.close() |
paul@0 | 19 | |
paul@0 | 20 | all_doc_positions = [ |
paul@0 | 21 | [ |
paul@0 | 22 | (123, [1, 3, 5, 15, 25]), |
paul@0 | 23 | (124, [0, 100]) |
paul@0 | 24 | ], |
paul@0 | 25 | [ |
paul@0 | 26 | (78, [9]), |
paul@0 | 27 | (196, [10, 11]) |
paul@0 | 28 | ] |
paul@0 | 29 | ] |
paul@0 | 30 | |
paul@0 | 31 | f = open("test", "wb") |
paul@0 | 32 | w = iixr.PositionWriter(f) |
paul@0 | 33 | for doc_positions in all_doc_positions: |
paul@0 | 34 | for docnum, positions in doc_positions: |
paul@0 | 35 | w.write_positions(docnum, positions) |
paul@0 | 36 | w.reset() |
paul@0 | 37 | w.close() |
paul@0 | 38 | |
paul@3 | 39 | f = open("test", "rb") |
paul@0 | 40 | r = iixr.PositionReader(f) |
paul@0 | 41 | for doc_positions in all_doc_positions: |
paul@0 | 42 | for docnum, positions in doc_positions: |
paul@0 | 43 | d, p = r.read_positions() |
paul@0 | 44 | print docnum == d, docnum, d |
paul@0 | 45 | print positions == p, positions, p |
paul@0 | 46 | r.reset() |
paul@0 | 47 | r.close() |
paul@0 | 48 | |
paul@0 | 49 | f = open("test", "wb") |
paul@0 | 50 | w = iixr.PositionWriter(f) |
paul@0 | 51 | offsets = [] |
paul@0 | 52 | for doc_positions in all_doc_positions: |
paul@0 | 53 | offsets.append( |
paul@0 | 54 | w.write_all_positions(doc_positions) |
paul@0 | 55 | ) |
paul@0 | 56 | w.close() |
paul@0 | 57 | |
paul@3 | 58 | f = open("test", "rb") |
paul@0 | 59 | r = iixr.PositionReader(f) |
paul@0 | 60 | offsets.reverse() |
paul@0 | 61 | all_doc_positions.reverse() |
paul@0 | 62 | for offset, doc_positions in zip(offsets, all_doc_positions): |
paul@0 | 63 | dp = r.read_all_positions(offset) |
paul@0 | 64 | print doc_positions == dp, doc_positions, dp |
paul@0 | 65 | r.close() |
paul@0 | 66 | |
paul@2 | 67 | terms = [ |
paul@2 | 68 | ("aardvark", 100000123), |
paul@2 | 69 | ("anteater", 100000456), |
paul@2 | 70 | ("badger", 100000789), |
paul@2 | 71 | ("bull", 1000001234), |
paul@2 | 72 | ("bulldog", 1000002345), |
paul@2 | 73 | ("cat", 1000003456) |
paul@2 | 74 | ] |
paul@2 | 75 | |
paul@2 | 76 | f = open("test", "wb") |
paul@2 | 77 | w = iixr.TermWriter(f) |
paul@2 | 78 | for term, offset in terms: |
paul@2 | 79 | w.write_term(term, offset) |
paul@2 | 80 | w.close() |
paul@2 | 81 | |
paul@3 | 82 | f = open("test", "rb") |
paul@2 | 83 | r = iixr.TermReader(f) |
paul@2 | 84 | for term, offset in terms: |
paul@2 | 85 | t, o = r.read_term() |
paul@2 | 86 | print term == t, term, t |
paul@2 | 87 | print offset == o, offset, o |
paul@2 | 88 | r.close() |
paul@2 | 89 | |
paul@3 | 90 | indexed_terms = [ |
paul@3 | 91 | ("aardvark", 100000123, 200000321), |
paul@3 | 92 | ("anteater", 100000456, 200000654), |
paul@3 | 93 | ("badger", 100000789, 200000987), |
paul@3 | 94 | ("bull", 1000001234, 200004321), |
paul@3 | 95 | ("bulldog", 1000002345, 200005432), |
paul@3 | 96 | ("cat", 1000003456, 200006543) |
paul@3 | 97 | ] |
paul@3 | 98 | |
paul@3 | 99 | f = open("test", "wb") |
paul@3 | 100 | w = iixr.TermIndexWriter(f) |
paul@3 | 101 | for term, offset, info_offset in indexed_terms: |
paul@3 | 102 | w.write_term(term, offset, info_offset) |
paul@3 | 103 | w.close() |
paul@3 | 104 | |
paul@3 | 105 | f = open("test", "rb") |
paul@3 | 106 | r = iixr.TermIndexReader(f) |
paul@3 | 107 | for term, offset, info_offset in indexed_terms: |
paul@3 | 108 | t, o, i = r.read_term() |
paul@3 | 109 | print term == t, term, t |
paul@3 | 110 | print offset == o, offset, o |
paul@3 | 111 | print info_offset == i, info_offset, i |
paul@3 | 112 | r.close() |
paul@3 | 113 | |
paul@3 | 114 | f = open("test", "wb") |
paul@3 | 115 | w = iixr.TermWriter(f) |
paul@3 | 116 | f2 = open("testI", "wb") |
paul@3 | 117 | w2 = iixr.TermIndexWriter(f2) |
paul@5 | 118 | f3 = open("testP", "wb") |
paul@5 | 119 | w3 = iixr.PositionWriter(f3) |
paul@5 | 120 | wd = iixr.TermDictionaryWriter(w, w2, w3, 3) |
paul@3 | 121 | for term, offset in terms: |
paul@5 | 122 | wd.write_term(term, offset) |
paul@5 | 123 | wd.close() |
paul@3 | 124 | |
paul@3 | 125 | f = open("test", "rb") |
paul@3 | 126 | r = iixr.TermReader(f) |
paul@3 | 127 | f2 = open("testI", "rb") |
paul@3 | 128 | r2 = iixr.TermIndexReader(f2) |
paul@5 | 129 | f3 = open("testP", "rb") |
paul@5 | 130 | r3 = iixr.PositionReader(f3) |
paul@5 | 131 | rd = iixr.TermDictionaryReader(r, r2, r3) |
paul@3 | 132 | terms_reversed = terms[:] |
paul@3 | 133 | terms_reversed.reverse() |
paul@3 | 134 | for term, offset in terms_reversed: |
paul@6 | 135 | o = rd.find_term(term) |
paul@3 | 136 | print offset == o, offset, o |
paul@3 | 137 | for term in ("dog", "dingo"): |
paul@6 | 138 | o = rd.find_term(term) |
paul@3 | 139 | print o is None, o |
paul@5 | 140 | rd.close() |
paul@5 | 141 | |
paul@5 | 142 | terms_with_positions = [ |
paul@5 | 143 | ("aardvark", [(1, [2, 45, 96]), (20, [13])]), |
paul@5 | 144 | ("anteater", [(1, [43, 44])]), |
paul@5 | 145 | ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), |
paul@5 | 146 | ("bull", [(6, [128]), (16, [12])]), |
paul@5 | 147 | ("bulldog", [(43, [17, 19, 256, 512])]), |
paul@5 | 148 | ("cat", [(123, [12, 145, 196]), (1200, [113])]) |
paul@5 | 149 | ] |
paul@5 | 150 | |
paul@5 | 151 | f = open("test", "wb") |
paul@5 | 152 | w = iixr.TermWriter(f) |
paul@5 | 153 | f2 = open("testI", "wb") |
paul@5 | 154 | w2 = iixr.TermIndexWriter(f2) |
paul@5 | 155 | f3 = open("testP", "wb") |
paul@5 | 156 | w3 = iixr.PositionWriter(f3) |
paul@5 | 157 | wd = iixr.TermDictionaryWriter(w, w2, w3, 3) |
paul@5 | 158 | for term, doc_positions in terms_with_positions: |
paul@5 | 159 | wd.write_term_positions(term, doc_positions) |
paul@5 | 160 | wd.close() |
paul@5 | 161 | |
paul@5 | 162 | f = open("test", "rb") |
paul@5 | 163 | r = iixr.TermReader(f) |
paul@5 | 164 | f2 = open("testI", "rb") |
paul@5 | 165 | r2 = iixr.TermIndexReader(f2) |
paul@5 | 166 | f3 = open("testP", "rb") |
paul@5 | 167 | r3 = iixr.PositionReader(f3) |
paul@5 | 168 | rd = iixr.TermDictionaryReader(r, r2, r3) |
paul@5 | 169 | terms_reversed = terms_with_positions[:] |
paul@5 | 170 | terms_reversed.reverse() |
paul@5 | 171 | for term, doc_positions in terms_reversed: |
paul@5 | 172 | dp = rd.find_positions(term) |
paul@5 | 173 | print doc_positions == dp, doc_positions, dp |
paul@5 | 174 | for term in ("dog", "dingo"): |
paul@5 | 175 | dp = rd.find_positions(term) |
paul@5 | 176 | print dp is None, dp |
paul@5 | 177 | rd.close() |
paul@3 | 178 | |
paul@6 | 179 | docs = [ |
paul@6 | 180 | (1, "The cat sat on the mat"), |
paul@6 | 181 | (2, "Every good boy deserves football"), |
paul@6 | 182 | (13, "One good turn deserves another"), |
paul@6 | 183 | (14, "Every man for himself"), |
paul@6 | 184 | (25, "Red sky at night shepherd's delight"), |
paul@6 | 185 | (36, "She sells sea shells on the sea shore") |
paul@6 | 186 | ] |
paul@6 | 187 | |
paul@6 | 188 | doc_tests = [ |
paul@6 | 189 | ("Every", [(2, [0]), (14, [0])]), |
paul@6 | 190 | ("good", [(2, [1]), (13, [1])]), |
paul@6 | 191 | ("deserves", [(2, [3]), (13, [3])]), |
paul@6 | 192 | ("sea", [(36, [2, 6])]) |
paul@6 | 193 | ] |
paul@6 | 194 | |
paul@7 | 195 | index = iixr.Index("test_index") |
paul@7 | 196 | wi = index.get_writer(3) |
paul@6 | 197 | for docnum, text in docs: |
paul@6 | 198 | for position, term in enumerate(text.split()): |
paul@6 | 199 | wi.add_position(term, docnum, position) |
paul@6 | 200 | wi.close() |
paul@6 | 201 | |
paul@7 | 202 | rd = index.get_reader() |
paul@6 | 203 | for term, doc_positions in doc_tests: |
paul@6 | 204 | dp = rd.find_positions(term) |
paul@6 | 205 | print doc_positions == dp, doc_positions, dp |
paul@7 | 206 | index.close() |
paul@6 | 207 | |
paul@0 | 208 | # vim: tabstop=4 expandtab shiftwidth=4 |