1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/iixr/index.py Tue Sep 15 00:15:11 2009 +0200
1.3 @@ -0,0 +1,326 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +High-level classes.
1.8 +
1.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 +
1.20 +You should have received a copy of the GNU General Public License along
1.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 +"""
1.23 +
1.24 +from iixr.filesystem import *
1.25 +from os import listdir, mkdir # index and partition discovery
1.26 +from os.path import exists
1.27 +
1.28 +try:
1.29 + set
1.30 +except NameError:
1.31 + from sets import Set as set
1.32 +
1.33 +# Constants.
1.34 +
1.35 +TERM_INTERVAL = 100
1.36 +DOCUMENT_INTERVAL = 100
1.37 +FIELD_INTERVAL = 100
1.38 +FLUSH_INTERVAL = 10000
1.39 +
1.40 +# High-level classes.
1.41 +
1.42 +class Document:
1.43 +
1.44 + "A container of document information."
1.45 +
1.46 + def __init__(self, docnum):
1.47 + self.docnum = docnum
1.48 + self.fields = []
1.49 + self.terms = {}
1.50 +
1.51 + def add_position(self, term, position):
1.52 +
1.53 + """
1.54 + Add a position entry for the given 'term', indicating the given
1.55 + 'position'.
1.56 + """
1.57 +
1.58 + self.terms.setdefault(term, []).append(position)
1.59 +
1.60 + def add_field(self, identifier, value):
1.61 +
1.62 + "Add a field having the given 'identifier' and 'value'."
1.63 +
1.64 + self.fields.append((identifier, unicode(value))) # convert to string
1.65 +
1.66 + def set_fields(self, fields):
1.67 +
1.68 + """
1.69 + Set the document's 'fields': a list of tuples each containing an integer
1.70 + identifier and a string value.
1.71 + """
1.72 +
1.73 + self.fields = fields
1.74 +
1.75 +class IndexWriter:
1.76 +
1.77 + """
1.78 + Building term information and writing it to the term and field dictionaries.
1.79 + """
1.80 +
1.81 + def __init__(self, pathname, interval, doc_interval, flush_interval):
1.82 + self.pathname = pathname
1.83 + self.interval = interval
1.84 + self.doc_interval = doc_interval
1.85 + self.flush_interval = flush_interval
1.86 +
1.87 + self.dict_partition = 0
1.88 + self.field_dict_partition = 0
1.89 +
1.90 + self.terms = {}
1.91 + self.docs = {}
1.92 +
1.93 + self.doc_counter = 0
1.94 +
1.95 + def add_document(self, doc):
1.96 +
1.97 + """
1.98 + Add the given document 'doc', updating the document counter and flushing
1.99 + terms and fields if appropriate.
1.100 + """
1.101 +
1.102 + for term, positions in doc.terms.items():
1.103 + self.terms.setdefault(term, {})[doc.docnum] = positions
1.104 +
1.105 + self.docs[doc.docnum] = doc.fields
1.106 +
1.107 + self.doc_counter += 1
1.108 + if self.flush_interval and self.doc_counter >= self.flush_interval:
1.109 + self.flush_terms()
1.110 + self.flush_fields()
1.111 + self.doc_counter = 0
1.112 +
1.113 + def get_term_writer(self):
1.114 +
1.115 + "Return a term dictionary writer for the current partition."
1.116 +
1.117 + return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)
1.118 +
1.119 + def get_field_writer(self):
1.120 +
1.121 + "Return a field dictionary writer for the current partition."
1.122 +
1.123 + return get_field_writer(self.pathname, self.field_dict_partition, self.interval)
1.124 +
1.125 + def flush_terms(self):
1.126 +
1.127 + "Flush terms into the current term dictionary partition."
1.128 +
1.129 + # Get the terms in order.
1.130 +
1.131 + all_terms = self.terms
1.132 + terms = all_terms.keys()
1.133 + terms.sort()
1.134 +
1.135 + dict_writer = self.get_term_writer()
1.136 +
1.137 + for term in terms:
1.138 + doc_positions = all_terms[term].items()
1.139 + dict_writer.write_term_positions(term, doc_positions)
1.140 +
1.141 + dict_writer.close()
1.142 +
1.143 + self.terms = {}
1.144 + self.dict_partition += 1
1.145 +
1.146 + def flush_fields(self):
1.147 +
1.148 + "Flush fields into the current term dictionary partition."
1.149 +
1.150 + # Get the documents in order.
1.151 +
1.152 + docs = self.docs.items()
1.153 + docs.sort()
1.154 +
1.155 + field_dict_writer = self.get_field_writer()
1.156 +
1.157 + for docnum, fields in docs:
1.158 + field_dict_writer.write_fields(docnum, fields)
1.159 +
1.160 + field_dict_writer.close()
1.161 +
1.162 + self.docs = {}
1.163 + self.field_dict_partition += 1
1.164 +
1.165 + def close(self):
1.166 + if self.terms:
1.167 + self.flush_terms()
1.168 + if self.docs:
1.169 + self.flush_fields()
1.170 +
1.171 +class IndexReader:
1.172 +
1.173 + "Accessing the term and field dictionaries."
1.174 +
1.175 + def __init__(self, pathname):
1.176 + self.dict_reader = get_term_reader(pathname, "merged")
1.177 + self.field_dict_reader = get_field_reader(pathname, "merged")
1.178 +
1.179 + def find_terms(self, term):
1.180 + return self.dict_reader.find_terms(term)
1.181 +
1.182 + def find_positions(self, term):
1.183 + return self.dict_reader.find_positions(term)
1.184 +
1.185 + def get_frequency(self, term):
1.186 + return self.dict_reader.get_frequency(term)
1.187 +
1.188 + def get_document_frequency(self, term):
1.189 + return self.dict_reader.get_document_frequency(term)
1.190 +
1.191 + def get_fields(self, docnum):
1.192 + return self.field_dict_reader.get_fields(docnum)
1.193 +
1.194 + def close(self):
1.195 + self.dict_reader.close()
1.196 + self.field_dict_reader.close()
1.197 +
1.198 +class Index:
1.199 +
1.200 + "An inverted index solution encapsulating the various components."
1.201 +
1.202 + def __init__(self, pathname):
1.203 + self.pathname = pathname
1.204 + self.reader = None
1.205 + self.writer = None
1.206 +
1.207 + def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):
1.208 +
1.209 + """
1.210 + Return a writer, optionally using the given indexing 'interval',
1.211 + 'doc_interval' and 'flush_interval'.
1.212 + """
1.213 +
1.214 + if not exists(self.pathname):
1.215 + mkdir(self.pathname)
1.216 +
1.217 + self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
1.218 + return self.writer
1.219 +
1.220 + def get_reader(self, partition=0):
1.221 +
1.222 + "Return a reader for the index."
1.223 +
1.224 + # Ensure that only one partition exists.
1.225 +
1.226 + self.merge()
1.227 + return self._get_reader(partition)
1.228 +
1.229 + def _get_reader(self, partition):
1.230 +
1.231 + "Return a reader for the index."
1.232 +
1.233 + if not exists(self.pathname):
1.234 + raise OSError, "Index path %r does not exist." % self.pathname
1.235 +
1.236 + self.reader = IndexReader(self.pathname)
1.237 + return self.reader
1.238 +
1.239 + def merge(self):
1.240 +
1.241 + "Merge/optimise index partitions."
1.242 +
1.243 + self.merge_terms()
1.244 + self.merge_fields()
1.245 +
1.246 + def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):
1.247 +
1.248 + """
1.249 + Merge term dictionaries using the given indexing 'interval' and
1.250 + 'doc_interval'.
1.251 + """
1.252 +
1.253 + readers = []
1.254 + partitions = set()
1.255 +
1.256 + for filename in listdir(self.pathname):
1.257 + if filename.startswith("terms-"): # 6 character prefix
1.258 + partition = filename[6:]
1.259 + readers.append(get_term_reader(self.pathname, partition))
1.260 + partitions.add(partition)
1.261 +
1.262 + # Write directly to a dictionary.
1.263 +
1.264 + if len(readers) > 1:
1.265 + if "merged" in partitions:
1.266 + rename_term_files(self.pathname, "merged", "old-merged")
1.267 + partitions.remove("merged")
1.268 + partitions.add("old-merged")
1.269 +
1.270 + writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
1.271 + merger = TermDictionaryMerger(writer, readers)
1.272 + merger.merge()
1.273 + merger.close()
1.274 +
1.275 + # Remove old files.
1.276 +
1.277 + for partition in partitions:
1.278 + remove_term_files(self.pathname, partition)
1.279 +
1.280 + elif len(readers) == 1:
1.281 + partition = list(partitions)[0]
1.282 + if partition != "merged":
1.283 + rename_term_files(self.pathname, partition, "merged")
1.284 +
1.285 + def merge_fields(self, interval=FIELD_INTERVAL):
1.286 +
1.287 + "Merge field dictionaries using the given indexing 'interval'."
1.288 +
1.289 + readers = []
1.290 + partitions = set()
1.291 +
1.292 + for filename in listdir(self.pathname):
1.293 + if filename.startswith("fields-"): # 7 character prefix
1.294 + partition = filename[7:]
1.295 + readers.append(get_field_reader(self.pathname, partition))
1.296 + partitions.add(partition)
1.297 +
1.298 + # Write directly to a dictionary.
1.299 +
1.300 + if len(readers) > 1:
1.301 + if "merged" in partitions:
1.302 + rename_field_files(self.pathname, "merged", "old-merged")
1.303 + partitions.remove("merged")
1.304 + partitions.add("old-merged")
1.305 +
1.306 + writer = get_field_writer(self.pathname, "merged", interval)
1.307 + merger = FieldDictionaryMerger(writer, readers)
1.308 + merger.merge()
1.309 + merger.close()
1.310 +
1.311 + # Remove old files.
1.312 +
1.313 + for partition in partitions:
1.314 + remove_field_files(self.pathname, partition)
1.315 +
1.316 + elif len(readers) == 1:
1.317 + partition = list(partitions)[0]
1.318 + if partition != "merged":
1.319 + rename_field_files(self.pathname, partition, "merged")
1.320 +
1.321 + def close(self):
1.322 + if self.reader is not None:
1.323 + self.reader.close()
1.324 + self.reader = None
1.325 + if self.writer is not None:
1.326 + self.writer.close()
1.327 + self.writer = None
1.328 +
1.329 +# vim: tabstop=4 expandtab shiftwidth=4