1.1 --- a/test.py	Mon Aug 31 21:02:30 2009 +0200
     1.2 +++ b/test.py	Wed Sep 02 01:30:42 2009 +0200
     1.3 @@ -38,15 +38,18 @@
     1.4  all_doc_positions = [
     1.5      [
     1.6          (123, [1, 3, 5, 15, 25]),
     1.7 -        (124, [0, 100])
     1.8 +        (124, [0, 100]),
     1.9 +        (125, [11, 99, 199]),
    1.10 +        (130, [77, 78, 80, 82, 89])
    1.11      ],
    1.12      [
    1.13          (78, [9]),
    1.14 -        (196, [10, 11])
    1.15 +        (196, [10, 11]),
    1.16 +        (197, [17, 21, 30])
    1.17      ]
    1.18      ]
    1.19  
    1.20 -f = open("test", "wb")
    1.21 +f = open("testP", "wb")
    1.22  w = iixr.PositionWriter(f)
    1.23  for doc_positions in all_doc_positions:
    1.24      for docnum, positions in doc_positions:
    1.25 @@ -54,7 +57,7 @@
    1.26      w.reset()
    1.27  w.close()
    1.28  
    1.29 -f = open("test", "rb")
    1.30 +f = open("testP", "rb")
    1.31  r = iixr.PositionReader(f)
    1.32  for doc_positions in all_doc_positions:
    1.33      for docnum, positions in doc_positions:
    1.34 @@ -64,20 +67,68 @@
    1.35      r.reset()
    1.36  r.close()
    1.37  
    1.38 -f = open("test", "wb")
    1.39 +# Test position index files.
    1.40 +
    1.41 +indexed_positions = [
    1.42 +    [
    1.43 +        (1234, 0, 100),
    1.44 +        (2345, 700, 100),
    1.45 +        (3456, 1900, 50)
    1.46 +    ],
    1.47 +    [
    1.48 +        (4567, 2800, 20)
    1.49 +    ]
    1.50 +    ]
    1.51 +
    1.52 +offsets = []
    1.53 +f = open("testPI", "wb")
    1.54 +w = iixr.PositionIndexWriter(f)
    1.55 +for term_positions in indexed_positions:
    1.56 +    offset = None
    1.57 +    doc_frequency = 0
    1.58 +    w.reset()
    1.59 +    for docnum, pos_offset, count in term_positions:
    1.60 +        io = w.write_positions(docnum, pos_offset, count)
    1.61 +        if offset is None:
    1.62 +            offset = io
    1.63 +        doc_frequency += count
    1.64 +    offsets.append((offset, doc_frequency))
    1.65 +w.close()
    1.66 +
    1.67 +f = open("testPI", "rb")
    1.68 +r = iixr.PositionIndexReader(f)
    1.69 +offsets.reverse()
    1.70 +indexed_positions.reverse()
    1.71 +for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
    1.72 +    found_positions = r.read_term_positions(offset, doc_frequency)
    1.73 +    for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):
    1.74 +        print docnum == dn, docnum, dn
    1.75 +        print pos_offset == po, pos_offset, po
    1.76 +        print count == c, count, c
    1.77 +r.close()
    1.78 +
    1.79 +# Test position dictionaries.
    1.80 +
    1.81 +f = open("testP", "wb")
    1.82  w = iixr.PositionWriter(f)
    1.83 +f2 = open("testPI", "wb")
    1.84 +w2 = iixr.PositionIndexWriter(f2)
    1.85 +wd = iixr.PositionDictionaryWriter(w, w2, 2)
    1.86  offsets = []
    1.87  for doc_positions in all_doc_positions:
    1.88 -    offset, frequency = w.write_term_positions(doc_positions)
    1.89 -    offsets.append(offset)
    1.90 +    offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
    1.91 +    offsets.append((offset, doc_frequency))
    1.92  w.close()
    1.93  
    1.94 -f = open("test", "rb")
    1.95 +f = open("testP", "rb")
    1.96  r = iixr.PositionReader(f)
    1.97 +f2 = open("testPI", "rb")
    1.98 +r2 = iixr.PositionIndexReader(f2)
    1.99 +rd = iixr.PositionDictionaryReader(r, r2)
   1.100  offsets.reverse()
   1.101  all_doc_positions.reverse()
   1.102 -for offset, doc_positions in zip(offsets, all_doc_positions):
   1.103 -    dp = list(r.read_term_positions(offset))
   1.104 +for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
   1.105 +    dp = list(rd.read_term_positions(offset, doc_frequency))
   1.106      print doc_positions == dp, doc_positions, dp
   1.107  r.close()
   1.108  
   1.109 @@ -166,55 +217,57 @@
   1.110  # Test terms.
   1.111  
   1.112  terms = [
   1.113 -    # term       offset      frequency
   1.114 -    ("aardvark",  100000123,  1),
   1.115 -    ("anteater",  100000456,  2),
   1.116 -    ("badger",    100000789, 13),
   1.117 -    ("bull",     1000001234, 59),
   1.118 -    ("bulldog",  1000002345, 99),
   1.119 -    ("cat",      1000003456, 89)
   1.120 +    # term       offset      frequency  doc_frequency
   1.121 +    ("aardvark",  100000123,  1,         1),
   1.122 +    ("anteater",  100000456,  2,         1),
   1.123 +    ("badger",    100000789, 13,         7),
   1.124 +    ("bull",     1000001234, 59,        17),
   1.125 +    ("bulldog",  1000002345, 99,        80),
   1.126 +    ("cat",      1000003456, 89,        28)
   1.127      ]
   1.128  
   1.129  f = open("test", "wb")
   1.130  w = iixr.TermWriter(f)
   1.131 -for term, offset, frequency in terms:
   1.132 -    w.write_term(term, offset, frequency)
   1.133 +for term, offset, frequency, doc_frequency in terms:
   1.134 +    w.write_term(term, offset, frequency, doc_frequency)
   1.135  w.close()
   1.136  
   1.137  f = open("test", "rb")
   1.138  r = iixr.TermReader(f)
   1.139 -for term, offset, frequency in terms:
   1.140 -    t, o, fr = r.read_term()
   1.141 +for term, offset, frequency, doc_frequency in terms:
   1.142 +    t, o, fr, df = r.read_term()
   1.143      print term == t, term, t
   1.144      print offset == o, offset, o
   1.145      print frequency == fr, frequency, fr
   1.146 +    print doc_frequency == df, doc_frequency, df
   1.147  r.close()
   1.148  
   1.149  # Test terms in index files.
   1.150  
   1.151  indexed_terms = [
   1.152 -    # term       offset      frequency  info_offset
   1.153 -    ("aardvark",  100000123,  1,        200000321),
   1.154 -    ("anteater",  100000456,  2,        200000654),
   1.155 -    ("badger",    100000789, 13,        200000987),
   1.156 -    ("bull",     1000001234, 59,        200004321),
   1.157 -    ("bulldog",  1000002345, 99,        200005432),
   1.158 -    ("cat",      1000003456, 89,        200006543)
   1.159 +    # term       offset      frequency  doc_frequency   info_offset
   1.160 +    ("aardvark",  100000123,  1,         1,             200000321),
   1.161 +    ("anteater",  100000456,  2,         1,             200000654),
   1.162 +    ("badger",    100000789, 13,         7,             200000987),
   1.163 +    ("bull",     1000001234, 59,        17,             200004321),
   1.164 +    ("bulldog",  1000002345, 99,        80,             200005432),
   1.165 +    ("cat",      1000003456, 89,        28,             200006543)
   1.166      ]
   1.167  
   1.168  f = open("test", "wb")
   1.169  w = iixr.TermIndexWriter(f)
   1.170 -for term, offset, frequency, info_offset in indexed_terms:
   1.171 -    w.write_term(term, offset, frequency, info_offset)
   1.172 +for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
   1.173 +    w.write_term(term, offset, frequency, doc_frequency, info_offset)
   1.174  w.close()
   1.175  
   1.176  f = open("test", "rb")
   1.177  r = iixr.TermIndexReader(f)
   1.178 -for term, offset, frequency, info_offset in indexed_terms:
   1.179 -    t, o, fr, i = r.read_term()
   1.180 +for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
   1.181 +    t, o, fr, df, i = r.read_term()
   1.182      print term == t, term, t
   1.183      print offset == o, offset, o
   1.184      print frequency == fr, frequency, fr
   1.185 +    print doc_frequency == df, doc_frequency, df
   1.186      print info_offset == i, info_offset, i
   1.187  r.close()
   1.188  
   1.189 @@ -224,26 +277,23 @@
   1.190  w = iixr.TermWriter(f)
   1.191  f2 = open("testI", "wb")
   1.192  w2 = iixr.TermIndexWriter(f2)
   1.193 -f3 = open("testP", "wb")
   1.194 -w3 = iixr.PositionWriter(f3)
   1.195 -wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
   1.196 -for term, offset, frequency in terms:
   1.197 -    wd._write_term(term, offset, frequency)
   1.198 +wd = iixr.TermDictionaryWriter(w, w2, None, 3)
   1.199 +for term, offset, frequency, doc_frequency in terms:
   1.200 +    wd._write_term(term, offset, frequency, doc_frequency)
   1.201  wd.close()
   1.202  
   1.203  f = open("test", "rb")
   1.204  r = iixr.TermReader(f)
   1.205  f2 = open("testI", "rb")
   1.206  r2 = iixr.TermIndexReader(f2)
   1.207 -f3 = open("testP", "rb")
   1.208 -r3 = iixr.PositionReader(f3)
   1.209 -rd = iixr.TermDictionaryReader(r, r2, r3)
   1.210 +rd = iixr.TermDictionaryReader(r, r2, None)
   1.211  terms_reversed = terms[:]
   1.212  terms_reversed.reverse()
   1.213 -for term, offset, frequency in terms_reversed:
   1.214 -    o, fr = rd._find_term(term)
   1.215 +for term, offset, frequency, doc_frequency in terms_reversed:
   1.216 +    o, fr, df = rd._find_term(term)
   1.217      print offset == o, offset, o
   1.218      print frequency == fr, frequency, fr
   1.219 +    print doc_frequency == df, doc_frequency, df
   1.220  for term in ("dog", "dingo"):
   1.221      t = rd._find_term(term)
   1.222      print t is None, t
   1.223 @@ -255,7 +305,7 @@
   1.224      ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),
   1.225      ("anteater",  [(1, [43, 44])]),
   1.226      ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
   1.227 -    ("bull",      [(6, [128]), (16, [12])]),
   1.228 +    ("bull",      [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
   1.229      ("bulldog",   [(43, [17, 19, 256, 512])]),
   1.230      ("cat",       [(123, [12, 145, 196]), (1200, [113])])
   1.231      ]
   1.232 @@ -266,7 +316,10 @@
   1.233  w2 = iixr.TermIndexWriter(f2)
   1.234  f3 = open("testP", "wb")
   1.235  w3 = iixr.PositionWriter(f3)
   1.236 -wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
   1.237 +f4 = open("testPI", "wb")
   1.238 +w4 = iixr.PositionIndexWriter(f4)
   1.239 +wp = iixr.PositionDictionaryWriter(w3, w4, 2)
   1.240 +wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
   1.241  for term, doc_positions in terms_with_positions:
   1.242      wd.write_term_positions(term, doc_positions)
   1.243  wd.close()
   1.244 @@ -277,7 +330,10 @@
   1.245  r2 = iixr.TermIndexReader(f2)
   1.246  f3 = open("testP", "rb")
   1.247  r3 = iixr.PositionReader(f3)
   1.248 -rd = iixr.TermDictionaryReader(r, r2, r3)
   1.249 +f4 = open("testPI", "rb")
   1.250 +r4 = iixr.PositionIndexReader(f4)
   1.251 +rp = iixr.PositionDictionaryReader(r3, r4)
   1.252 +rd = iixr.TermDictionaryReader(r, r2, rp)
   1.253  terms_reversed = terms_with_positions[:]
   1.254  terms_reversed.reverse()
   1.255  for term, doc_positions in terms_reversed:
   1.256 @@ -291,7 +347,7 @@
   1.257  
   1.258  rd.rewind()
   1.259  for term, doc_positions in terms_with_positions:
   1.260 -    t, fr, dp = rd.read_term()
   1.261 +    t, fr, df, dp = rd.read_term()
   1.262      dp = list(dp)
   1.263      print term == t, term, t
   1.264      print doc_positions == dp, doc_positions, dp