paulb@4 | 1 | #!/usr/bin/env python |
paulb@4 | 2 | |
paulb@4 | 3 | "A simple file indexer." |
paulb@4 | 4 | |
paulb@4 | 5 | import codecs |
paulb@4 | 6 | |
class Indexer:

    """
    Build a character-by-character index of the words found in a file.

    The index is a nested dictionary. Each key is a single character and each
    value is a (slot, words) tuple, where 'slot' is the dictionary for the
    next character and 'words' is the list of tokens spelled exactly by the
    characters leading to that entry.
    """

    def __init__(self, dict_location, encoding=None):

        """
        Initialise the indexer for the file at 'dict_location'. If 'encoding'
        is given, the file is decoded using it; otherwise the file is opened
        with the built-in open function. The index is built immediately.
        """

        self.dict_location = dict_location
        self.encoding = encoding

        # Initialisation.

        self.index = self.make_index()

    def get_index(self):

        "Return the index built when this object was initialised."

        return self.index

    def make_index(self):

        """
        Return a dictionary containing an index structure for the dict.

        The whole file is read and split on whitespace. Each resulting token
        is recorded once in the 'words' list of the entry reached by following
        its characters from the top of the index.
        """

        if self.encoding is None:
            f = open(self.dict_location)
        else:
            f = codecs.open(self.dict_location, encoding=self.encoding)

        # Close the file even if reading or decoding fails.

        try:
            s = f.read()
        finally:
            f.close()

        index = {}

        # Splitting on whitespace never yields an empty token, so 'words' is
        # always bound by the inner loop before it is used below.

        for token in s.split():
            slot = index
            for c in token:
                # The 'in' operator works on every Python version, whereas
                # dict.has_key is not available in Python 3.
                if c not in slot:
                    slot[c] = {}, []
                slot, words = slot[c]

            if token not in words:
                words.append(token)

        return index
paulb@4 | 44 | |
class Searcher:

    """
    Look up words in an index of nested dictionaries.

    Each key of the index is a single character and each value is a
    (slot, words) tuple: 'slot' is the dictionary for the next character and
    'words' is a list of words stored at that entry.
    """

    def __init__(self, index):

        "Initialise the searcher with the given 'index'."

        self.index = index

    def find(self, pattern):

        """
        Find words beginning with the given 'pattern'.

        Return a new list holding the words stored at the entry reached by
        'pattern', followed by all words stored beneath that entry. An empty
        list is returned if no entry matches 'pattern'; an empty 'pattern'
        yields every word in the index.
        """

        slot = self.index
        words = []

        for c in pattern:
            # The 'in' operator works on every Python version, whereas
            # dict.has_key is not available in Python 3.
            if c not in slot:
                return []
            slot, words = slot[c]

        # Copy the stored list so that callers cannot modify the index
        # through the result.

        results = list(words)
        results += self.get_all_words(slot)
        return results

    def get_all_words(self, slot):

        """
        Get all words under the given index 'slot'.

        Entries are visited in sorted key order, with the words of each entry
        appearing before the words found beneath it.
        """

        all_words = []

        # An explicit stack is used instead of recursion so that very long
        # words cannot exceed the interpreter's recursion limit. Entries are
        # pushed in reverse sorted order so that they are popped in sorted
        # order.

        pending = [slot[c] for c in sorted(slot, reverse=True)]

        while pending:
            this_slot, words = pending.pop()
            all_words.extend(words)
            pending.extend([this_slot[c] for c in sorted(this_slot, reverse=True)])

        return all_words
paulb@4 | 78 | |
def update(index1, index2):

    """
    Merge the contents of 'index2' into 'index1', modifying 'index1' in place.

    Both indexes are nested dictionaries mapping a character to a
    (slot, words) tuple. Where a character is present in both, words from
    'index2' that are missing from 'index1' are appended and the nested slots
    are merged in turn. Where a character is only present in 'index2', its
    entry is stored in 'index1' directly, not copied, so that part of the
    structure is shared between the two indexes afterwards.
    """

    # An explicit work list is used instead of recursion so that very long
    # words cannot exceed the interpreter's recursion limit.

    pending = [(index1, index2)]

    while pending:
        target, source = pending.pop()

        for key in source:
            # The 'in' operator works on every Python version, whereas
            # dict.has_key is not available in Python 3.
            if key not in target:
                target[key] = source[key]
            else:
                slot1, words1 = target[key]
                slot2, words2 = source[key]
                for word in words2:
                    if word not in words1:
                        words1.append(word)
                pending.append((slot1, slot2))
paulb@4 | 90 | |
paulb@4 | 91 | # vim: tabstop=4 expandtab shiftwidth=4 |