paulb@219 | 1 | #!/usr/bin/env python |
paulb@219 | 2 | |
paulb@219 | 3 | "A simple file indexer." |
paulb@219 | 4 | |
paulb@219 | 5 | import codecs |
paulb@219 | 6 | |
class Dict:

    """
    A prefix index over the whitespace-separated words found in a text file.

    The index is a nested mapping: each key is a single character and each
    value is a 2-tuple (sub_index, words), where 'sub_index' is another
    mapping of the same shape and 'words' is the list of words that end
    exactly at that position.
    """

    def __init__(self, dict_location, encoding=None):

        """
        Initialise the dict from the file at 'dict_location'.

        If 'encoding' is given, the file is decoded using that encoding;
        otherwise it is opened with the built-in open function. The index is
        built immediately and stored in the 'index' attribute.
        """

        self.dict_location = dict_location
        self.encoding = encoding

        # Initialisation.

        self.index = self.get_index()

    def get_index(self):

        "Return a dictionary containing an index structure for the dict."

        if self.encoding is None:
            f = open(self.dict_location)
        else:
            f = codecs.open(self.dict_location, encoding=self.encoding)

        # Make sure the file is closed even if reading or decoding fails.

        with f:
            s = f.read()

        index = {}

        for token in s.split():
            slot = index

            # Walk (and create where necessary) one level per character.
            # Tokens produced by split() are never empty, so 'words' is always
            # bound by the time the loop finishes.

            for c in token:
                if c not in slot:
                    slot[c] = {}, []
                slot, words = slot[c]

            # Record the word at the node of its final character, once only.

            if token not in words:
                words.append(token)

        return index

    def find(self, pattern):

        """
        Find words beginning with the given 'pattern'.

        Return a list holding the word equal to 'pattern' (if indexed)
        followed by all longer words sharing that prefix, ordered by sorted
        character at each level. An empty 'pattern' yields every indexed word;
        a 'pattern' matching nothing yields an empty list.
        """

        slot = self.index
        words = []

        for c in pattern:
            if c not in slot:
                return []
            slot, words = slot[c]

        # Copy 'words' so that callers cannot modify the index through the
        # returned list.

        results = list(words)
        results += self.get_all_words(slot)
        return results

    def get_all_words(self, slot):

        """
        Get all words under the given index 'slot'.

        The words are returned in pre-order: for each character in sorted
        order, the words ending at that character come first, followed by the
        words found beneath it.
        """

        all_words = []

        # An explicit stack is used instead of recursion so that very long
        # words cannot exhaust the interpreter's recursion limit. Entries are
        # pushed in reverse-sorted order so that the smallest character is
        # popped, and therefore visited, first.

        pending = [slot[c] for c in sorted(slot, reverse=True)]

        while pending:
            this_slot, words = pending.pop()
            all_words += words
            pending.extend([this_slot[c] for c in sorted(this_slot, reverse=True)])

        return all_words
paulb@219 | 71 | |
paulb@219 | 72 | # vim: tabstop=4 expandtab shiftwidth=4 |