1.1 --- a/iixr.py Tue Aug 25 22:44:15 2009 +0200
1.2 +++ b/iixr.py Tue Aug 25 23:53:20 2009 +0200
1.3 @@ -19,6 +19,7 @@
1.4 """
1.5
1.6 from os.path import commonprefix # to find common string prefixes
1.7 +from bisect import bisect_right # to find terms in the dictionary index
1.8
1.9 # Foundation classes.
1.10
1.11 @@ -104,7 +105,7 @@
1.12
1.13 "Read a number from the file."
1.14
1.15 - nbytes = ord(self.f.read(1))
1.16 + nbytes = self.read_unsigned_byte()
1.17
1.18 # Read each byte, adding it to the number.
1.19
1.20 @@ -126,7 +127,11 @@
1.21
1.22 "Read a number from the file, consuming a single byte."
1.23
1.24 - return ord(self.f.read(1))
1.25 + s = self.f.read(1)
1.26 + if not s:
1.27 + raise EOFError
1.28 +
1.29 + return ord(s)
1.30
1.31 def read_string(self):
1.32
1.33 @@ -263,7 +268,8 @@
1.34
1.35 """
1.36 Write the given 'term' and its position file 'offset' to the term
1.37 - information file.
1.38 + information file. Return the offset after the term information was
1.39 + written to the file.
1.40 """
1.41
1.42 # Too long terms are not currently supported.
1.43 @@ -286,6 +292,8 @@
1.44 self.last_term = term
1.45 self.last_offset = offset
1.46
1.47 + return self.f.tell()
1.48 +
1.49 class TermReader(FileReader):
1.50
1.51 "Reading term information from files."
1.52 @@ -313,4 +321,146 @@
1.53
1.54 return self.last_term, self.last_offset
1.55
def go_to_term(self, term, offset, info_offset):

    """
    Position the underlying file at 'info_offset' and record 'term' and
    'offset' as the most recently read entry, so that reading can continue
    from the entry following 'term'.
    """

    self.f.seek(info_offset)

    # Later entries are interpreted relative to the last term and offset.

    self.last_term, self.last_offset = term, offset
1.63 +
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Reset the inherited writer state and the information file offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, info_offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        dictionary index file, followed by the 'info_offset' of the entry in
        the term information file.
        """

        TermWriter.write_term(self, term, offset)

        # Store only the distance from the previously written information
        # file offset, then remember the new offset for the next entry.

        delta = info_offset - self.last_info_offset
        self.write_number(delta)
        self.last_info_offset = info_offset
1.86 +
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Reset the inherited reader state and the information file offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read the next index entry, returning a tuple containing the term, its
        position file offset, and its term information file offset.
        """

        term, offset = TermReader.read_term(self)

        # The information file offset is stored as a delta from the previous
        # entry's offset; accumulate it to obtain the absolute offset.

        info_offset = self.last_info_offset + self.read_number()
        self.last_info_offset = info_offset

        return term, offset, info_offset
1.109 +
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, interval):

        """
        Initialise the writer with the given 'info_writer' (for the term
        information file), 'index_writer' (for the dictionary index file) and
        'interval' (the number of information file entries per index entry).
        """

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.interval = interval

        # The number of terms written so far.

        self.entry = 0

    def write_term(self, term, offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        information file and optionally to the index, making a dictionary entry.
        """

        info_offset = self.info_writer.write_term(term, offset)

        # The first term and every 'interval'-th term after it is also written
        # to the index, together with the offset returned by the information
        # writer.

        if self.entry % self.interval == 0:
            self.index_writer.write_term(term, offset, info_offset)

        self.entry += 1

    def close(self):

        """
        Close both underlying writers. The index writer is closed even if
        closing the information writer raises an exception.
        """

        try:
            self.info_writer.close()
        finally:
            self.index_writer.close()
1.137 +
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader):

        """
        Initialise the reader with the given 'info_reader' (for the term
        information file) and 'index_reader' (for the dictionary index file).
        The complete index is read into memory immediately.
        """

        self.info_reader = info_reader
        self.index_reader = index_reader

        # Read index entries until the index reader signals the end of its
        # file by raising EOFError.

        self.terms = []
        try:
            while True:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # Large numbers for ordering purposes: the offsets of the last index
        # entry are used to build search keys that sort after any index entry
        # with the same term (presumably offsets only grow through the files).
        # An empty index has no last entry, so zero is used instead.

        if self.terms:
            self.max_offset = self.terms[-1][1]
            self.max_info_offset = self.terms[-1][2]
        else:
            self.max_offset = 0
            self.max_info_offset = 0

    def find(self, term):

        """
        Find the position file offset of 'term' from the term dictionary.
        Return None if the term is not present.
        """

        # Get the entry position providing the term or one preceding it.

        key = (term, self.max_offset, self.max_info_offset)
        i = bisect_right(self.terms, key) - 1

        # The term sorts before every index entry (or the index is empty).

        if i == -1:
            return None

        found_term, offset, info_offset = self.terms[i]

        # Where the term is found immediately, return the offset.

        if term == found_term:
            return offset

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term, stopping at the first term not
        # ordered before it or at the end of the file.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset = self.info_reader.read_term()
        except EOFError:
            pass

        # If the term is found, return the offset.

        if term == found_term:
            return offset

        return None

    def close(self):

        """
        Close both underlying readers. The index reader is closed even if
        closing the information reader raises an exception.
        """

        try:
            self.info_reader.close()
        finally:
            self.index_reader.close()
1.197 +
1.198 # vim: tabstop=4 expandtab shiftwidth=4