#!/usr/bin/env python

"""
A simple (and sane) text indexing library.

Numbers are stored as a one-byte length followed by that many payload bytes,
least significant byte first. Position lists are stored as delta-encoded
numbers on top of that encoding.
"""

# Foundation classes.

class File(object):

    "A basic file abstraction wrapping an already opened file object."

    def __init__(self, f):

        "Wrap the file object 'f' and initialise any per-file state."

        self.f = f
        self.reset()

    def reset(self):

        "Reset any state held for the file. Subclasses override this hook."

        pass

    def close(self):

        "Close the underlying file object."

        self.f.close()

class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding: one byte
        giving the number of payload bytes, then the payload bytes from least
        to most significant. Zero is written as a single zero payload byte.

        Raise ValueError if 'number' is negative or needs more than 255 bytes.
        Nothing is written to the file when an error is raised.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Collect the payload from least to most significant byte, working on
        # a copy so that the original value remains available for reporting.

        payload = bytearray()
        remaining = number

        while remaining:
            payload.append(remaining & 255)
            remaining >>= 8

        # Special case: one byte containing zero.

        if not payload:
            payload.append(0)

        # Too large numbers are not supported: the length must fit in a byte.

        if len(payload) > 255:
            raise ValueError("Number %r is too large." % number)

        # Prefix the payload with its length and write the whole record in one
        # call. bytes(bytearray) is a byte string on both Python 2 and 3.

        payload.insert(0, len(payload))
        self.f.write(bytes(payload))

class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a number from the file, as written by FileWriter.write_number,
        and return it.

        Truncated input is not handled specially: a missing length byte
        surfaces as a TypeError from ord, and a short payload as an IndexError.
        """

        nbytes = ord(self.f.read(1))

        # Read the payload in one call; a bytearray yields integers when
        # indexed on both Python 2 and 3.

        payload = bytearray(self.f.read(nbytes))

        # Combine each byte, least significant first. Indexing by the declared
        # length (rather than iterating over what was actually read) makes a
        # short read fail instead of silently producing a wrong number.

        number = 0

        for i in range(nbytes):
            number |= payload[i] << (8 * i)

        return number

# Specific classes.

class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.

        The document number is stored as a delta from the previously written
        document number, followed by the number of positions and then each
        position as a delta from the position before it.

        Raise ValueError if 'docnum' is less than the previous document number
        or if 'positions' is not in non-decreasing order starting from a
        non-negative value. Validation happens before anything is written, so
        a rejected call does not leave a partial record in the file.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Compute the position deltas up front so that bad input is rejected
        # before any part of the record reaches the file.

        deltas = []
        last = 0

        for position in positions:
            delta = position - last
            if delta < 0:
                raise ValueError("Position %r is less than previous position %r." % (position, last))
            deltas.append(delta)
            last = position

        # Write the document number delta.

        self.write_number(docnum - self.last_docnum)

        # Write the number of positions.

        self.write_number(len(deltas))

        # Write the position deltas.

        for delta in deltas:
            self.write_number(delta)

        self.last_docnum = docnum

    def write_all_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file, returning the offset at which they
        were stored.
        """

        # Reset the writer and record the current file offset.

        self.reset()
        offset = self.f.tell()

        # Write the number of documents.

        self.write_number(len(doc_positions))

        # Write the positions.

        for docnum, positions in doc_positions:
            self.write_positions(docnum, positions)

        return offset

class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        # Read the document number delta and add it to the last number.

        self.last_docnum += self.read_number()

        # Read the number of positions.

        npositions = self.read_number()

        # Read the position deltas, adding each previous position to get the
        # appropriate collection of absolute positions.

        last = 0
        positions = []

        for _ in range(npositions):
            last += self.read_number()
            positions.append(last)

        return self.last_docnum, positions

    def read_all_positions(self, offset):

        """
        Read all positions from 'offset', seeking to that position in the file
        before reading. Return a list of (document number, position list)
        tuples in the order in which they were stored.
        """

        self.reset()
        self.f.seek(offset)

        # Read the number of documents.

        ndocuments = self.read_number()

        # Read all records.

        return [self.read_positions() for _ in range(ndocuments)]

# vim: tabstop=4 expandtab shiftwidth=4