#!/usr/bin/env python

"""
A simple (and sane) text indexing library.

Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# Foundation classes.

class File:

    "A basic file abstraction wrapping an already opened file object."

    def __init__(self, f):

        """
        Initialise the abstraction with the file object 'f' and reset any
        state maintained by subclasses.
        """

        self.f = f
        self.reset()

    def reset(self):

        "Reset any per-stream state. The base class has none."

        pass

    def close(self):

        "Close the underlying file object."

        self.f.close()

class FileWriter(File):

    "Writing basic data types to files opened for binary writing."

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding.

        The record consists of one length byte (1..255) followed by that many
        bytes holding the number, least significant byte first. Zero is
        stored as a length of one followed by a single zero byte.

        A ValueError is raised if 'number' is negative or needs more than 255
        bytes. Nothing is written to the file in either case.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Special case: one byte containing zero. The general loop below
        # would otherwise produce an empty record for zero.

        if number == 0:
            self.f.write(b"\x01\x00")
            return

        # Collect the number from least to most significant digits. A copy of
        # the number is consumed so that the original value remains available
        # for the error message.

        digits = bytearray()
        remaining = number

        while remaining:
            digits.append(remaining & 255)
            remaining >>= 8

        # Too large numbers are not supported: the length must fit in a byte.

        if len(digits) > 255:
            raise ValueError("Number %r is too large." % number)

        # Emit the length byte and the digits in a single write.

        self.f.write(bytes(bytearray([len(digits)]) + digits))

class FileReader(File):

    "Reading basic data types from files opened for binary reading."

    def read_number(self):

        """
        Read a number from the file, as written by FileWriter.write_number,
        and return it.

        An EOFError is raised if the file ends before the complete record has
        been read.
        """

        # Read the length byte.

        header = bytearray(self.f.read(1))

        if not header:
            raise EOFError("End of file reached when reading a number.")

        nbytes = header[0]

        # Read the digits, refusing to decode a truncated record.

        digits = bytearray(self.f.read(nbytes))

        if len(digits) < nbytes:
            raise EOFError("End of file reached when reading a number.")

        # Combine each byte, least significant first, into the number.

        number = 0

        for index, digit in enumerate(digits):
            number |= digit << (8 * index)

        return number

# Specific classes.

class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.

        The document number is stored as a delta from the previously written
        document number, and each position as a delta from the position before
        it (the first one relative to zero).

        A ValueError is raised if 'docnum' is less than the previous document
        number. A ValueError is also raised by the number encoding if a
        position is less than its predecessor, in which case part of the
        record has already been written.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Write the document number delta.

        self.write_number(docnum - self.last_docnum)

        # Write the number of positions.

        self.write_number(len(positions))

        # Write the position deltas.

        previous = 0

        for position in positions:
            self.write_number(position - previous)
            previous = position

        self.last_docnum = docnum

    def write_all_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file, returning the offset at which they
        were stored.
        """

        # Reset the writer and record the current file offset.

        self.reset()
        offset = self.f.tell()

        # Write the number of documents.

        self.write_number(len(doc_positions))

        # Write the positions.

        for docnum, positions in doc_positions:
            self.write_positions(docnum, positions)

        return offset

class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Forget the last document number so that deltas start from zero."

        self.last_docnum = 0

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        # Read the document number delta and add it to the last number.

        self.last_docnum += self.read_number()

        # Read the number of positions.

        npositions = self.read_number()

        # Read the position deltas, adding each previous position to get the
        # appropriate collection of absolute positions.

        current = 0
        positions = []

        for _ in range(npositions):
            current += self.read_number()
            positions.append(current)

        return self.last_docnum, positions

    def read_all_positions(self, offset):

        """
        Read all positions from 'offset', seeking to that position in the file
        before reading. A list of (document number, position list) tuples is
        returned.
        """

        self.reset()
        self.f.seek(offset)

        # Read the number of documents.

        ndocuments = self.read_number()

        # Read all records.

        return [self.read_positions() for _ in range(ndocuments)]

# vim: tabstop=4 expandtab shiftwidth=4