1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/iixr/files.py Tue Sep 15 00:15:11 2009 +0200
1.3 @@ -0,0 +1,264 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Generic file access.
1.8 +
1.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 +
1.20 +You should have received a copy of the GNU General Public License along
1.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 +"""
1.23 +
1.24 +from iixr.data import vint
1.25 +import bz2, zlib
1.26 +
1.27 +# Constants.
1.28 +
# Amount of pending output which, once reached, causes a writer to flush.
WRITE_CACHE_SIZE = 100000

# Minimum amount requested from the file whenever a reader refills its cache.
READ_CACHE_SIZE = 10000

# Size of the consumed part of a reader's cache beyond which it is discarded.
READ_CACHE_RESIZE = 5000

# Compression functions in order of preference, each with the flag recorded
# in the file to identify it.
compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress)
]

# Decompression functions keyed by the flag found in the file.
decompressors = {
    "b" : bz2.decompress,
    "z" : zlib.decompress
}
1.35 +
class File:

    "A basic file abstraction."

    def __init__(self, f):

        "Initialise the abstraction around the open file-like object 'f'."

        self.f = f
        self.reset()

    def reset(self):

        "To be used to reset the state of the reader or writer between records."

        pass

    def rewind(self):

        "Seek to offset zero and reset any per-record state."

        self.seek(0)
        self.reset()

    def seek(self, offset):

        "To be defined by readers."

        pass

    def flush(self):

        "To be defined by writers."

        pass

    def close(self):

        """
        Flush any pending data and close the underlying file. Closing an
        already closed object does nothing.

        Should flushing fail, the underlying file is still closed and the
        flush error is propagated to the caller.
        """

        if self.f is not None:

            # Release the file handle even if flushing raises an exception,
            # so that a failed flush cannot leak the open file.

            try:
                self.flush()
            finally:
                f, self.f = self.f, None
                f.close()
1.71 +
class FileWriter(File):

    "Buffered writing of numbers and strings to files."

    def __init__(self, f):

        "Initialise the writer around the file 'f' with an empty cache."

        File.__init__(self, f)
        self.cache = []
        self.cache_length = 0

    def write_number(self, number):

        "Write 'number' to the file using a variable length encoding."

        encoded = vint(number)
        self.write(encoded)

    def write_string(self, s, compress=0):

        """
        Write 's' to the file preceded by its length. If 'compress' is set to
        a true value, a flag is written first and the data is stored in
        compressed form where this makes it shorter.
        """

        # Unicode objects are stored as UTF-8 encoded strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        # Without compression, no flag is written at all. With compression,
        # the flag identifies the first compressor producing a shorter string,
        # or "-" if none of them does.

        flag = ""

        if compress:
            flag = "-"
            for candidate, compressor in compressors:
                packed = compressor(s)
                if len(packed) < len(s):
                    flag, s = candidate, packed
                    break

        # The record consists of the flag, the data length and the data.

        self.write(flag + vint(len(s)) + s)

    # Cache-affected methods.

    def write(self, s):

        "Add 's' to the cache, flushing once the cache is large enough."

        self.cache.append(s)
        pending = self.cache_length + len(s)
        self.cache_length = pending

        if pending >= WRITE_CACHE_SIZE:
            self.flush()

    def tell(self):

        "Return the file position, taking account of cached data."

        position = self.f.tell()
        return position + self.cache_length

    def flush(self):

        "Write all cached data to the file and empty the cache."

        data = "".join(self.cache)
        self.f.write(data)
        self.cache = []
        self.cache_length = 0
1.136 +
class FileReader(File):

    "Reading basic data types from files."

    def __init__(self, f):

        "Initialise the reader around the file 'f' with an empty cache."

        File.__init__(self, f)
        self.reset_cache()

    def reset_cache(self):

        "Discard all cached data."

        # The cache holds data already read from the file; 'cache_start' is
        # the index within it of the next character to be returned.

        self.cache = ""
        self.cache_length = 0
        self.cache_start = 0

    def read_number(self):

        """
        Read a number from the file. The number is stored seven bits per byte,
        least significant bits first, with the top bit set on every byte
        except the last. EOFError is raised if the data ends prematurely.
        """

        # Read each byte, adding it to the number.

        shift = 0
        number = 0
        read = self.read

        try:
            csd = ord(read(1))
            while csd & 128:
                number += ((csd & 127) << shift)
                shift += 7
                csd = ord(read(1))
            else:
                number += (csd << shift)

        # At the end of the file, read returns an empty string which ord
        # rejects with a TypeError.

        except TypeError:
            raise EOFError

        return number

    def read_string(self, decompress=0):

        """
        Read a string from the file, decompressing the stored data if
        'decompress' is set to a true value. The result is a Unicode object
        decoded from UTF-8.
        """

        # A compression flag is only present if compression was requested
        # when the string was written.

        if decompress:
            flag = self.read(1)
        else:
            flag = "-"

        length = self.read_number()
        s = self.read(length)

        # Perform decompression if applicable.

        if flag != "-":
            fn = decompressors[flag]
            s = fn(s)

        # Convert strings to Unicode objects.

        return unicode(s, "utf-8")

    # Cache-affected methods.

    def read(self, n):

        """
        Return the next 'n' characters, or fewer if the end of the file is
        reached first.
        """

        needed = n - (self.cache_length - self.cache_start)

        # Read the needed number of characters, if possible.

        if needed > 0:
            s = self.f.read(max(needed, READ_CACHE_SIZE))
            self.cache += s
            self.cache_length += len(s)

        # Get the end of the requested block.

        next_start = self.cache_start + n
        s = self.cache[self.cache_start:next_start]

        # Reposition the pointer to the cache.

        self._seek_cache(len(s))
        return s

    def tell(self):

        "Return the position of the next character to be read."

        # The underlying file is positioned at the end of the cached data.

        return self.f.tell() - self.cache_length + self.cache_start

    def seek(self, offset):

        "Move to the absolute position 'offset' in the file."

        delta = offset - self.tell()
        remaining = self.cache_length - self.cache_start

        # If the target lies within the unread cached data, only the cache
        # pointer moves. The underlying file must stay positioned at the end
        # of the cached data, otherwise tell would report the wrong position
        # and the next cache refill would read data already held.

        if 0 <= delta < remaining:
            self._seek_cache(delta)

        # Otherwise, the cached data is of no use: reposition the file itself
        # and start with an empty cache.

        else:
            self.f.seek(offset)
            self.reset_cache()

    def _seek_cache(self, delta):

        "Advance the cache pointer by 'delta' characters."

        next_start = self.cache_start + delta

        # If everything cached has been consumed, empty the cache.

        if next_start > 0 and next_start >= len(self.cache):
            self.reset_cache()

        # If the cache is too big, resize it.

        elif next_start > READ_CACHE_RESIZE:
            self.cache = self.cache[next_start:]
            self.cache_length = len(self.cache)
            self.cache_start = 0

        # Otherwise, just reference the next part of the cache.

        else:
            self.cache_start = next_start
1.253 +
class FileOpener:

    "Opening files using their filenames."

    def __init__(self, filename):

        "Remember the 'filename' of the file to be opened."

        self.filename = filename

    def open(self, mode):

        "Return a newly opened file for the stored filename using 'mode'."

        handle = open(self.filename, mode)
        return handle

    def close(self):

        "Do nothing: opened files are closed by those using them."

        return None
1.266 +
1.267 +# vim: tabstop=4 expandtab shiftwidth=4