paul@44 | 1 | #!/usr/bin/env python |
paul@44 | 2 | |
paul@44 | 3 | """ |
paul@44 | 4 | High-level classes. |
paul@44 | 5 | |
paul@85 | 6 | Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk> |
paul@44 | 7 | |
paul@44 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@44 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@44 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@44 | 11 | version. |
paul@44 | 12 | |
paul@44 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT ANY |
paul@44 | 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A |
paul@44 | 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. |
paul@44 | 16 | |
paul@44 | 17 | You should have received a copy of the GNU General Public License along |
paul@44 | 18 | with this program. If not, see <http://www.gnu.org/licenses/>. |
paul@44 | 19 | """ |
paul@44 | 20 | |
paul@44 | 21 | from iixr.filesystem import * |
paul@47 | 22 | from iixr.merging import * |
paul@73 | 23 | from itertools import islice |
paul@76 | 24 | from os import mkdir # index discovery |
paul@44 | 25 | from os.path import exists |
paul@44 | 26 | |
paul@44 | 27 | # Constants. |
paul@44 | 28 | |
paul@44 | 29 | TERM_INTERVAL = 100 |
paul@44 | 30 | DOCUMENT_INTERVAL = 100 |
paul@44 | 31 | FIELD_INTERVAL = 100 |
paul@44 | 32 | FLUSH_INTERVAL = 10000 |
paul@85 | 33 | POSITIONS_FLUSH_INTERVAL = 1000000 |
paul@73 | 34 | OPEN_PARTITIONS = 20 |
paul@44 | 35 | |
paul@44 | 36 | # High-level classes. |
paul@44 | 37 | |
paul@44 | 38 | class Document: |
paul@44 | 39 | |
paul@44 | 40 | "A container of document information." |
paul@44 | 41 | |
paul@63 | 42 | def __init__(self, docnum, fields=None): |
paul@44 | 43 | self.docnum = docnum |
paul@63 | 44 | self.fields = fields or [] |
paul@44 | 45 | self.terms = {} |
paul@63 | 46 | self.field_dict = None |
paul@44 | 47 | |
paul@44 | 48 | def add_position(self, term, position): |
paul@44 | 49 | |
paul@44 | 50 | """ |
paul@44 | 51 | Add a position entry for the given 'term', indicating the given |
paul@44 | 52 | 'position'. |
paul@44 | 53 | """ |
paul@44 | 54 | |
paul@44 | 55 | self.terms.setdefault(term, []).append(position) |
paul@44 | 56 | |
paul@44 | 57 | def add_field(self, identifier, value): |
paul@44 | 58 | |
paul@44 | 59 | "Add a field having the given 'identifier' and 'value'." |
paul@44 | 60 | |
paul@44 | 61 | self.fields.append((identifier, unicode(value))) # convert to string |
paul@44 | 62 | |
paul@44 | 63 | def set_fields(self, fields): |
paul@44 | 64 | |
paul@44 | 65 | """ |
paul@44 | 66 | Set the document's 'fields': a list of tuples each containing an integer |
paul@44 | 67 | identifier and a string value. |
paul@44 | 68 | """ |
paul@44 | 69 | |
paul@44 | 70 | self.fields = fields |
paul@44 | 71 | |
paul@63 | 72 | def _ensure_dict(self): |
paul@63 | 73 | if self.field_dict is None: |
paul@63 | 74 | self.field_dict = dict(self.fields) |
paul@63 | 75 | |
paul@63 | 76 | def keys(self): |
paul@63 | 77 | self._ensure_dict() |
paul@63 | 78 | return self.field_dict.keys() |
paul@63 | 79 | |
paul@63 | 80 | def __getitem__(self, key): |
paul@63 | 81 | self._ensure_dict() |
paul@63 | 82 | return self.field_dict[key] |
paul@63 | 83 | |
paul@44 | 84 | class IndexWriter: |
paul@44 | 85 | |
paul@44 | 86 | """ |
paul@44 | 87 | Building term information and writing it to the term and field dictionaries. |
paul@44 | 88 | """ |
paul@44 | 89 | |
paul@85 | 90 | def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval, positions_flush_interval): |
paul@44 | 91 | self.pathname = pathname |
paul@44 | 92 | self.interval = interval |
paul@44 | 93 | self.doc_interval = doc_interval |
paul@66 | 94 | self.field_interval = field_interval |
paul@44 | 95 | self.flush_interval = flush_interval |
paul@85 | 96 | self.positions_flush_interval = positions_flush_interval |
paul@44 | 97 | |
paul@76 | 98 | self.dict_partition = get_next_partition(get_term_partitions(self.pathname)) |
paul@76 | 99 | self.field_dict_partition = get_next_partition(get_field_partitions(self.pathname)) |
paul@44 | 100 | |
paul@44 | 101 | self.terms = {} |
paul@63 | 102 | self.docs = [] |
paul@44 | 103 | |
paul@44 | 104 | self.doc_counter = 0 |
paul@85 | 105 | self.position_counter = 0 |
paul@44 | 106 | |
paul@44 | 107 | def add_document(self, doc): |
paul@44 | 108 | |
paul@44 | 109 | """ |
paul@44 | 110 | Add the given document 'doc', updating the document counter and flushing |
paul@44 | 111 | terms and fields if appropriate. |
paul@44 | 112 | """ |
paul@44 | 113 | |
paul@85 | 114 | docnum = doc.docnum |
paul@85 | 115 | |
paul@44 | 116 | for term, positions in doc.terms.items(): |
paul@85 | 117 | self.terms.setdefault(term, {})[docnum] = positions |
paul@85 | 118 | self.position_counter += len(positions) |
paul@44 | 119 | |
paul@85 | 120 | self.docs.append((docnum, doc.fields)) |
paul@44 | 121 | |
paul@44 | 122 | self.doc_counter += 1 |
paul@85 | 123 | |
paul@85 | 124 | if self.flush_interval and self.doc_counter >= self.flush_interval or \ |
paul@85 | 125 | self.positions_flush_interval and self.position_counter >= self.positions_flush_interval: |
paul@85 | 126 | |
paul@44 | 127 | self.flush_terms() |
paul@44 | 128 | self.flush_fields() |
paul@44 | 129 | self.doc_counter = 0 |
paul@85 | 130 | self.position_counter = 0 |
paul@44 | 131 | |
paul@44 | 132 | def get_term_writer(self): |
paul@44 | 133 | |
paul@44 | 134 | "Return a term dictionary writer for the current partition." |
paul@44 | 135 | |
paul@44 | 136 | return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval) |
paul@44 | 137 | |
paul@44 | 138 | def get_field_writer(self): |
paul@44 | 139 | |
paul@44 | 140 | "Return a field dictionary writer for the current partition." |
paul@44 | 141 | |
paul@65 | 142 | return get_field_writer(self.pathname, self.field_dict_partition, self.field_interval) |
paul@44 | 143 | |
paul@44 | 144 | def flush_terms(self): |
paul@44 | 145 | |
paul@44 | 146 | "Flush terms into the current term dictionary partition." |
paul@44 | 147 | |
paul@44 | 148 | # Get the terms in order. |
paul@44 | 149 | |
paul@44 | 150 | all_terms = self.terms |
paul@44 | 151 | terms = all_terms.keys() |
paul@44 | 152 | terms.sort() |
paul@44 | 153 | |
paul@44 | 154 | dict_writer = self.get_term_writer() |
paul@44 | 155 | |
paul@44 | 156 | for term in terms: |
paul@44 | 157 | doc_positions = all_terms[term].items() |
paul@44 | 158 | dict_writer.write_term_positions(term, doc_positions) |
paul@44 | 159 | |
paul@44 | 160 | dict_writer.close() |
paul@44 | 161 | |
paul@44 | 162 | self.terms = {} |
paul@44 | 163 | self.dict_partition += 1 |
paul@44 | 164 | |
paul@44 | 165 | def flush_fields(self): |
paul@44 | 166 | |
paul@44 | 167 | "Flush fields into the current term dictionary partition." |
paul@44 | 168 | |
paul@44 | 169 | # Get the documents in order. |
paul@44 | 170 | |
paul@63 | 171 | self.docs.sort() |
paul@44 | 172 | |
paul@44 | 173 | field_dict_writer = self.get_field_writer() |
paul@63 | 174 | for docnum, fields in self.docs: |
paul@44 | 175 | field_dict_writer.write_fields(docnum, fields) |
paul@44 | 176 | field_dict_writer.close() |
paul@44 | 177 | |
paul@63 | 178 | self.docs = [] |
paul@44 | 179 | self.field_dict_partition += 1 |
paul@44 | 180 | |
paul@44 | 181 | def close(self): |
paul@76 | 182 | if self.terms or not get_term_partitions(self.pathname): |
paul@44 | 183 | self.flush_terms() |
paul@76 | 184 | if self.docs or not get_field_partitions(self.pathname): |
paul@44 | 185 | self.flush_fields() |
paul@44 | 186 | |
paul@44 | 187 | class IndexReader: |
paul@44 | 188 | |
paul@44 | 189 | "Accessing the term and field dictionaries." |
paul@44 | 190 | |
paul@44 | 191 | def __init__(self, pathname): |
paul@44 | 192 | self.dict_reader = get_term_reader(pathname, "merged") |
paul@44 | 193 | self.field_dict_reader = get_field_reader(pathname, "merged") |
paul@44 | 194 | |
paul@81 | 195 | # Sequential access. |
paul@81 | 196 | |
paul@81 | 197 | def read_term(self): |
paul@81 | 198 | return self.dict_reader.read_term() |
paul@81 | 199 | |
paul@81 | 200 | def go_to_term(self, term): |
paul@81 | 201 | return self.dict_reader._get_term_and_positions(*self.dict_reader.go_to_term(term)) |
paul@81 | 202 | |
paul@81 | 203 | # Query access. |
paul@81 | 204 | |
paul@72 | 205 | def get_terms(self): |
paul@72 | 206 | return self.dict_reader.get_terms() |
paul@72 | 207 | |
paul@44 | 208 | def find_terms(self, term): |
paul@44 | 209 | return self.dict_reader.find_terms(term) |
paul@44 | 210 | |
paul@44 | 211 | def find_positions(self, term): |
paul@44 | 212 | return self.dict_reader.find_positions(term) |
paul@44 | 213 | |
paul@70 | 214 | def find_common_positions(self, terms): |
paul@70 | 215 | return self.dict_reader.find_common_positions(terms) |
paul@60 | 216 | |
paul@44 | 217 | def get_frequency(self, term): |
paul@44 | 218 | return self.dict_reader.get_frequency(term) |
paul@44 | 219 | |
paul@44 | 220 | def get_document_frequency(self, term): |
paul@44 | 221 | return self.dict_reader.get_document_frequency(term) |
paul@44 | 222 | |
paul@44 | 223 | def get_fields(self, docnum): |
paul@44 | 224 | return self.field_dict_reader.get_fields(docnum) |
paul@44 | 225 | |
paul@63 | 226 | def get_document(self, docnum): |
paul@63 | 227 | return Document(docnum, self.get_fields(docnum)) |
paul@63 | 228 | |
paul@44 | 229 | def close(self): |
paul@44 | 230 | self.dict_reader.close() |
paul@44 | 231 | self.field_dict_reader.close() |
paul@44 | 232 | |
paul@44 | 233 | class Index: |
paul@44 | 234 | |
paul@44 | 235 | "An inverted index solution encapsulating the various components." |
paul@44 | 236 | |
paul@64 | 237 | def __init__(self, pathname, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, field_interval=FIELD_INTERVAL, |
paul@85 | 238 | flush_interval=FLUSH_INTERVAL, positions_flush_interval=POSITIONS_FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS): |
paul@64 | 239 | |
paul@44 | 240 | self.pathname = pathname |
paul@64 | 241 | self.interval = interval |
paul@64 | 242 | self.doc_interval = doc_interval |
paul@64 | 243 | self.field_interval = field_interval |
paul@64 | 244 | self.flush_interval = flush_interval |
paul@85 | 245 | self.positions_flush_interval = positions_flush_interval |
paul@73 | 246 | self.open_partitions = open_partitions |
paul@44 | 247 | self.reader = None |
paul@44 | 248 | self.writer = None |
paul@44 | 249 | |
paul@64 | 250 | def get_writer(self): |
paul@44 | 251 | |
paul@64 | 252 | "Return a writer." |
paul@44 | 253 | |
paul@59 | 254 | self._ensure_directory() |
paul@66 | 255 | self.writer = IndexWriter(self.pathname, self.interval, self.doc_interval, |
paul@85 | 256 | self.field_interval, self.flush_interval, self.positions_flush_interval) |
paul@59 | 257 | return self.writer |
paul@59 | 258 | |
paul@59 | 259 | def _ensure_directory(self): |
paul@44 | 260 | if not exists(self.pathname): |
paul@44 | 261 | mkdir(self.pathname) |
paul@44 | 262 | |
paul@44 | 263 | def get_reader(self, partition=0): |
paul@44 | 264 | |
paul@44 | 265 | "Return a reader for the index." |
paul@44 | 266 | |
paul@44 | 267 | # Ensure that only one partition exists. |
paul@44 | 268 | |
paul@44 | 269 | self.merge() |
paul@44 | 270 | return self._get_reader(partition) |
paul@44 | 271 | |
paul@44 | 272 | def _get_reader(self, partition): |
paul@44 | 273 | |
paul@44 | 274 | "Return a reader for the index." |
paul@44 | 275 | |
paul@44 | 276 | if not exists(self.pathname): |
paul@44 | 277 | raise OSError, "Index path %r does not exist." % self.pathname |
paul@44 | 278 | |
paul@44 | 279 | self.reader = IndexReader(self.pathname) |
paul@44 | 280 | return self.reader |
paul@44 | 281 | |
paul@58 | 282 | def get_term_partitions(self): |
paul@58 | 283 | |
paul@58 | 284 | "Return a set of term partition identifiers." |
paul@58 | 285 | |
paul@76 | 286 | return get_term_partitions(self.pathname) |
paul@58 | 287 | |
paul@58 | 288 | def get_field_partitions(self): |
paul@58 | 289 | |
paul@58 | 290 | "Return a set of field partition identifiers." |
paul@58 | 291 | |
paul@76 | 292 | return get_field_partitions(self.pathname) |
paul@58 | 293 | |
paul@44 | 294 | def merge(self): |
paul@44 | 295 | |
paul@44 | 296 | "Merge/optimise index partitions." |
paul@44 | 297 | |
paul@58 | 298 | self._merge_terms() |
paul@58 | 299 | self._merge_fields() |
paul@44 | 300 | |
paul@73 | 301 | def _merge_dictionaries(self, get_partitions, rename_files, remove_files, get_reader, get_writer, get_merger, intervals): |
paul@73 | 302 | |
paul@73 | 303 | "Merge term or field dictionaries." |
paul@44 | 304 | |
paul@73 | 305 | partitions = get_partitions() |
paul@73 | 306 | |
paul@73 | 307 | # Ensure the correct labelling of a single partition. |
paul@44 | 308 | |
paul@73 | 309 | if len(partitions) == 1: |
paul@73 | 310 | partition = list(partitions)[0] |
paul@73 | 311 | if partition != "merged": |
paul@73 | 312 | rename_files(self.pathname, partition, "merged") |
paul@73 | 313 | return |
paul@44 | 314 | |
paul@73 | 315 | # Merge the partitions. |
paul@73 | 316 | |
paul@73 | 317 | old_merged_counter = 0 |
paul@44 | 318 | |
paul@73 | 319 | while len(partitions) > 1: |
paul@44 | 320 | |
paul@44 | 321 | if "merged" in partitions: |
paul@73 | 322 | rename_files(self.pathname, "merged", "old-merged-%d" % old_merged_counter) |
paul@44 | 323 | partitions.remove("merged") |
paul@73 | 324 | partitions.add("old-merged-%d" % old_merged_counter) |
paul@73 | 325 | old_merged_counter += 1 |
paul@73 | 326 | |
paul@73 | 327 | # Process only a certain number at once, avoiding resource limits. |
paul@73 | 328 | |
paul@73 | 329 | active_partitions = list(islice(partitions, self.open_partitions)) |
paul@44 | 330 | |
paul@73 | 331 | readers = [] |
paul@73 | 332 | for partition in active_partitions: |
paul@73 | 333 | readers.append(get_reader(self.pathname, partition)) |
paul@73 | 334 | |
paul@73 | 335 | # Write directly to a dictionary. |
paul@73 | 336 | |
paul@73 | 337 | writer = get_writer(self.pathname, "merged", *intervals) |
paul@73 | 338 | merger = get_merger(writer, readers) |
paul@44 | 339 | merger.merge() |
paul@44 | 340 | merger.close() |
paul@44 | 341 | |
paul@44 | 342 | # Remove old files. |
paul@44 | 343 | |
paul@73 | 344 | for partition in active_partitions: |
paul@73 | 345 | remove_files(self.pathname, partition) |
paul@73 | 346 | |
paul@73 | 347 | # Acquire the partitions to check their number again. |
paul@73 | 348 | |
paul@73 | 349 | partitions = get_partitions() |
paul@44 | 350 | |
paul@73 | 351 | def _merge_terms(self): |
paul@73 | 352 | |
paul@73 | 353 | "Merge term dictionaries." |
paul@73 | 354 | |
paul@73 | 355 | self._merge_dictionaries(self.get_term_partitions, rename_term_files, |
paul@73 | 356 | remove_term_files, get_term_reader, get_term_writer, |
paul@73 | 357 | TermDictionaryMerger, [self.interval, self.doc_interval]) |
paul@44 | 358 | |
paul@64 | 359 | def _merge_fields(self): |
paul@44 | 360 | |
paul@64 | 361 | "Merge field dictionaries." |
paul@44 | 362 | |
paul@73 | 363 | self._merge_dictionaries(self.get_field_partitions, rename_field_files, |
paul@73 | 364 | remove_field_files, get_field_reader, get_field_writer, |
paul@73 | 365 | FieldDictionaryMerger, [self.field_interval]) |
paul@44 | 366 | |
paul@58 | 367 | def update(self, other_indexes): |
paul@58 | 368 | |
paul@58 | 369 | "Copy the content of the 'other_indexes' into this index and merge." |
paul@58 | 370 | |
paul@59 | 371 | self._ensure_directory() |
paul@59 | 372 | |
paul@58 | 373 | for i, index in enumerate(other_indexes): |
paul@58 | 374 | for partition in index.get_term_partitions(): |
paul@58 | 375 | copy_term_files(index.pathname, partition, self.pathname, "-added-%d" % i) |
paul@58 | 376 | for partition in index.get_field_partitions(): |
paul@58 | 377 | copy_field_files(index.pathname, partition, self.pathname, "-added-%d" % i) |
paul@58 | 378 | |
paul@58 | 379 | self.merge() |
paul@58 | 380 | |
paul@44 | 381 | def close(self): |
paul@44 | 382 | if self.reader is not None: |
paul@44 | 383 | self.reader.close() |
paul@44 | 384 | self.reader = None |
paul@44 | 385 | if self.writer is not None: |
paul@44 | 386 | self.writer.close() |
paul@44 | 387 | self.writer = None |
paul@44 | 388 | |
paul@44 | 389 | # vim: tabstop=4 expandtab shiftwidth=4 |