1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/iixr/fields.py	Tue Sep 15 00:15:11 2009 +0200
     1.3 @@ -0,0 +1,256 @@
     1.4 +#!/usr/bin/env python
     1.5 +
     1.6 +"""
     1.7 +Specific classes for storing document information.
     1.8 +
     1.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
    1.10 +
    1.11 +This program is free software; you can redistribute it and/or modify it under
    1.12 +the terms of the GNU General Public License as published by the Free Software
    1.13 +Foundation; either version 3 of the License, or (at your option) any later
    1.14 +version.
    1.15 +
    1.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
    1.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    1.18 +PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    1.19 +
    1.20 +You should have received a copy of the GNU General Public License along
    1.21 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    1.22 +"""
    1.23 +
    1.24 +from iixr.files import *
    1.25 +from bisect import bisect_right  # to find terms in the dictionary index
    1.26 +
    1.27 +class FieldWriter(FileWriter):
    1.28 +
    1.29 +    "Writing field data to files."
    1.30 +
    1.31 +    def reset(self):
    1.32 +        self.last_docnum = 0
    1.33 +
    1.34 +    def write_fields(self, docnum, fields):
    1.35 +
    1.36 +        """
    1.37 +        Write for the given 'docnum', a list of 'fields' (integer, string pairs
    1.38 +        representing field identifiers and values respectively).
    1.39 +        Return the offset at which the fields are stored.
    1.40 +        """
    1.41 +
    1.42 +        offset = self.tell()
    1.43 +
    1.44 +        # Write the document number delta.
    1.45 +
    1.46 +        self.write_number(docnum - self.last_docnum)
    1.47 +
    1.48 +        # Write the number of fields.
    1.49 +
    1.50 +        self.write_number(len(fields))
    1.51 +
    1.52 +        # Write the fields themselves.
    1.53 +
    1.54 +        for i, field in fields:
    1.55 +            self.write_number(i)
    1.56 +            self.write_string(field, 1) # compress
    1.57 +
    1.58 +        self.last_docnum = docnum
    1.59 +        return offset
    1.60 +
    1.61 +class FieldReader(FileReader):
    1.62 +
    1.63 +    "Reading field data from files."
    1.64 +
    1.65 +    def reset(self):
    1.66 +        self.last_docnum = 0
    1.67 +
    1.68 +    def read_fields(self):
    1.69 +
    1.70 +        """
    1.71 +        Read fields from the file, returning a tuple containing the document
    1.72 +        number and a list of field (identifier, value) pairs.
    1.73 +        """
    1.74 +
    1.75 +        # Read the document number.
    1.76 +
    1.77 +        self.last_docnum += self.read_number()
    1.78 +
    1.79 +        # Read the number of fields.
    1.80 +
    1.81 +        nfields = self.read_number()
    1.82 +
    1.83 +        # Collect the fields.
    1.84 +
    1.85 +        fields = []
    1.86 +        i = 0
    1.87 +
    1.88 +        while i < nfields:
    1.89 +            identifier = self.read_number()
    1.90 +            value = self.read_string(1) # decompress
    1.91 +            fields.append((identifier, value))
    1.92 +            i += 1
    1.93 +
    1.94 +        return self.last_docnum, fields
    1.95 +
    1.96 +    def read_document_fields(self, docnum, offset):
    1.97 +
    1.98 +        """
    1.99 +        Read fields for 'docnum' at the given 'offset'. This permits the
   1.100 +        retrieval of details for the specified document, as well as scanning for
   1.101 +        later documents.
   1.102 +        """
   1.103 +
   1.104 +        self.seek(offset)
   1.105 +        bad_docnum, fields = self.read_fields()
   1.106 +        self.last_docnum = docnum
   1.107 +        return docnum, fields
   1.108 +
   1.109 +class FieldIndexWriter(FileWriter):
   1.110 +
   1.111 +    "Writing field index details to files."
   1.112 +
   1.113 +    def reset(self):
   1.114 +        self.last_docnum = 0
   1.115 +        self.last_offset = 0
   1.116 +
   1.117 +    def write_document(self, docnum, offset):
   1.118 +
   1.119 +        """
   1.120 +        Write for the given 'docnum', the 'offset' at which the fields for the
   1.121 +        document are stored in the fields file.
   1.122 +        """
   1.123 +
   1.124 +        # Write the document number and offset deltas.
   1.125 +
   1.126 +        self.write_number(docnum - self.last_docnum)
   1.127 +        self.write_number(offset - self.last_offset)
   1.128 +
   1.129 +        self.last_docnum = docnum
   1.130 +        self.last_offset = offset
   1.131 +
   1.132 +class FieldIndexReader(FileReader):
   1.133 +
   1.134 +    "Reading field index details from files."
   1.135 +
   1.136 +    def reset(self):
   1.137 +        self.last_docnum = 0
   1.138 +        self.last_offset = 0
   1.139 +
   1.140 +    def read_document(self):
   1.141 +
   1.142 +        "Read a document number and field file offset."
   1.143 +
   1.144 +        # Read the document number delta and offset.
   1.145 +
   1.146 +        self.last_docnum += self.read_number()
   1.147 +        self.last_offset += self.read_number()
   1.148 +
   1.149 +        return self.last_docnum, self.last_offset
   1.150 +
   1.151 +class FieldDictionaryWriter:
   1.152 +
   1.153 +    "Writing field dictionary details."
   1.154 +
   1.155 +    def __init__(self, field_writer, field_index_writer, interval):
   1.156 +        self.field_writer = field_writer
   1.157 +        self.field_index_writer = field_index_writer
   1.158 +        self.interval = interval
   1.159 +        self.entry = 0
   1.160 +
   1.161 +    def write_fields(self, docnum, fields):
   1.162 +
   1.163 +        "Write details of the document with the given 'docnum' and 'fields'."
   1.164 +
   1.165 +        offset = self.field_writer.write_fields(docnum, fields)
   1.166 +
   1.167 +        if self.entry % self.interval == 0:
   1.168 +            self.field_index_writer.write_document(docnum, offset)
   1.169 +
   1.170 +        self.entry += 1
   1.171 +
   1.172 +    def close(self):
   1.173 +        self.field_writer.close()
   1.174 +        self.field_index_writer.close()
   1.175 +
   1.176 +class FieldDictionaryReader:
   1.177 +
   1.178 +    "Reading field dictionary details."
   1.179 +
   1.180 +    def __init__(self, field_reader, field_index_reader):
   1.181 +        self.field_reader = field_reader
   1.182 +        self.field_index_reader = field_index_reader
   1.183 +
   1.184 +        self.docs = []
   1.185 +        try:
   1.186 +            while 1:
   1.187 +                self.docs.append(self.field_index_reader.read_document())
   1.188 +        except EOFError:
   1.189 +            pass
   1.190 +
   1.191 +        # Large numbers for ordering purposes.
   1.192 +
   1.193 +        if self.docs:
   1.194 +            self.max_offset = self.docs[-1][1]
   1.195 +        else:
   1.196 +            self.max_offset = None
   1.197 +
   1.198 +    # Iterator convenience methods.
   1.199 +
   1.200 +    def __iter__(self):
   1.201 +        self.rewind()
   1.202 +        return self
   1.203 +
   1.204 +    def next(self):
   1.205 +        try:
   1.206 +            return self.read_fields()
   1.207 +        except EOFError:
   1.208 +            raise StopIteration
   1.209 +
   1.210 +    # Sequential access methods.
   1.211 +
   1.212 +    def rewind(self):
   1.213 +        self.field_reader.rewind()
   1.214 +
   1.215 +    def read_fields(self):
   1.216 +
   1.217 +        "Return the next document number and fields."
   1.218 +
   1.219 +        return self.field_reader.read_fields()
   1.220 +
   1.221 +    # Random access methods.
   1.222 +
   1.223 +    def get_fields(self, docnum):
   1.224 +
   1.225 +        "Read the fields of the document with the given 'docnum'."
   1.226 +
   1.227 +        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
   1.228 +
   1.229 +        # Get the entry position providing the term or one preceding it.
   1.230 +
   1.231 +        if i == -1:
   1.232 +            return None
   1.233 +
   1.234 +        found_docnum, offset = self.docs[i]
   1.235 +
   1.236 +        # Read from the fields file.
   1.237 +
   1.238 +        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
   1.239 +
   1.240 +        # Scan for the document, if necessary.
   1.241 +
   1.242 +        try:
   1.243 +            while docnum > found_docnum:
   1.244 +                found_docnum, fields = self.field_reader.read_fields()
   1.245 +        except EOFError:
   1.246 +            pass
   1.247 +
   1.248 +        # If the document is found, return the fields.
   1.249 +
   1.250 +        if docnum == found_docnum:
   1.251 +            return fields
   1.252 +        else:
   1.253 +            return None
   1.254 +
   1.255 +    def close(self):
   1.256 +        self.field_reader.close()
   1.257 +        self.field_index_reader.close()
   1.258 +
   1.259 +# vim: tabstop=4 expandtab shiftwidth=4