1 #!/usr/bin/env python 2 3 """ 4 A simple (and sane) text indexing library. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from os.path import commonprefix # to find common string prefixes 22 23 # Foundation classes. 24 25 class File: 26 27 "A basic file abstraction." 28 29 def __init__(self, f): 30 self.f = f 31 self.reset() 32 33 def reset(self): 34 pass 35 36 def close(self): 37 self.f.close() 38 39 class FileWriter(File): 40 41 "Writing basic data types to files." 42 43 def write_number(self, number): 44 45 "Write 'number' to the file using a variable length encoding." 46 47 # Negative numbers are not supported. 48 49 if number < 0: 50 raise ValueError, "Number %r is negative." % number 51 52 # Special case: one byte containing zero. 53 54 elif number == 0: 55 self.f.write(chr(1) + chr(0)) 56 return 57 58 # Write the number from least to most significant digits. 59 60 nbytes = 0 61 bytes = [] 62 63 while number != 0: 64 lsd = number & 255 65 bytes.append(chr(lsd)) 66 number = number >> 8 67 nbytes += 1 68 69 # Too large numbers are not supported. 70 71 if nbytes > 255: 72 raise ValueError, "Number %r is too large." % number 73 74 bytes.insert(0, chr(nbytes)) 75 record = "".join(bytes) 76 self.f.write(record) 77 78 def write_unsigned_byte(self, number): 79 80 "Write 'number' to the file using a single byte." 81 82 if not (0 <= number <= 255): 83 raise ValueError, "Number %r is out of range." % number 84 85 self.f.write(chr(number)) 86 87 def write_string(self, s): 88 89 "Write 's' to the file, recording its length." 90 91 length = len(s) 92 93 if not (0 <= length <= 255): 94 raise ValueError, "String %r is too long." % s 95 96 self.write_unsigned_byte(length) 97 self.f.write(s) 98 99 class FileReader(File): 100 101 "Reading basic data types from files." 102 103 def read_number(self): 104 105 "Read a number from the file." 106 107 nbytes = ord(self.f.read(1)) 108 109 # Read each byte, adding it to the number. 110 111 bytes = self.f.read(nbytes) 112 113 i = 0 114 shift = 0 115 number = 0 116 117 while i < nbytes: 118 csd = ord(bytes[i]) 119 number += (csd << shift) 120 shift += 8 121 i += 1 122 123 return number 124 125 def read_unsigned_byte(self): 126 127 "Read a number from the file, consuming a single byte." 128 129 return ord(self.f.read(1)) 130 131 def read_string(self): 132 133 "Read a string from the file." 134 135 length = self.read_unsigned_byte() 136 return self.f.read(length) 137 138 # Specific classes. 139 140 class PositionWriter(FileWriter): 141 142 "Writing position information to files." 143 144 def reset(self): 145 self.last_docnum = 0 146 147 def write_positions(self, docnum, positions): 148 149 "Write for the document 'docnum' the given 'positions'." 150 151 if docnum < self.last_docnum: 152 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 153 154 # Write the document number delta. 155 156 self.write_number(docnum - self.last_docnum) 157 158 # Write the number of positions. 159 160 self.write_number(len(positions)) 161 162 # Write the position deltas. 163 164 last = 0 165 for position in positions: 166 pos = position - last 167 self.write_number(pos) 168 last = position 169 170 self.last_docnum = docnum 171 172 def write_all_positions(self, doc_positions): 173 174 """ 175 Write all 'doc_positions' - a collection of tuples of the form (document 176 number, position list) - to the file, returning the offset at which they 177 were stored. 178 """ 179 180 # Reset the writer and record the current file offset. 181 182 self.reset() 183 offset = self.f.tell() 184 185 # Write the number of documents. 186 187 self.write_number(len(doc_positions)) 188 189 # Write the positions. 190 191 for docnum, positions in doc_positions: 192 self.write_positions(docnum, positions) 193 194 return offset 195 196 class PositionReader(FileReader): 197 198 "Reading position information from files." 199 200 def reset(self): 201 self.last_docnum = 0 202 203 def read_positions(self): 204 205 "Read positions, returning a document number and a list of positions." 206 207 # Read the document number delta and add it to the last number. 208 209 self.last_docnum += self.read_number() 210 211 # Read the number of positions. 212 213 npositions = self.read_number() 214 215 # Read the position deltas, adding each previous position to get the 216 # appropriate collection of absolute positions. 217 218 i = 0 219 last = 0 220 positions = [] 221 222 while i < npositions: 223 last += self.read_number() 224 positions.append(last) 225 i += 1 226 227 return self.last_docnum, positions 228 229 def read_all_positions(self, offset): 230 231 """ 232 Read all positions from 'offset', seeking to that position in the file 233 before reading. 234 """ 235 236 self.reset() 237 self.f.seek(offset) 238 239 # Read the number of documents. 240 241 ndocuments = self.read_number() 242 243 # Read all records. 244 245 i = 0 246 doc_positions = [] 247 248 while i < ndocuments: 249 doc_positions.append(self.read_positions()) 250 i += 1 251 252 return doc_positions 253 254 class TermWriter(FileWriter): 255 256 "Writing term information to files." 257 258 def reset(self): 259 self.last_term = "" 260 self.last_offset = 0 261 262 def write_term(self, term, offset): 263 264 """ 265 Write the given 'term' and its position file 'offset' to the term 266 information file. 267 """ 268 269 # Too long terms are not currently supported. 270 271 if len(term) > 255: 272 raise ValueError, "Term %r is too long." % term 273 274 # Write the prefix length and term suffix. 275 276 common = len(commonprefix([self.last_term, term])) 277 suffix = term[common:] 278 279 self.write_unsigned_byte(common) 280 self.write_string(suffix) 281 282 # Write the offset delta. 283 284 self.write_number(offset - self.last_offset) 285 286 self.last_term = term 287 self.last_offset = offset 288 289 class TermReader(FileReader): 290 291 "Reading term information from files." 292 293 def reset(self): 294 self.last_term = "" 295 self.last_offset = 0 296 297 def read_term(self): 298 299 """ 300 Read a term and its position file offset from the term information file. 301 """ 302 303 # Read the prefix length and term suffix. 304 305 common = self.read_unsigned_byte() 306 suffix = self.read_string() 307 308 self.last_term = self.last_term[:common] + suffix 309 310 # Read the offset delta. 311 312 self.last_offset += self.read_number() 313 314 return self.last_term, self.last_offset 315 316 # vim: tabstop=4 expandtab shiftwidth=4