paulb@4 | 1 | #!/usr/bin/env python |
paulb@4 | 2 | |
paulb@4 | 3 | "A simple file indexer." |
paulb@4 | 4 | |
paulb@4 | 5 | import codecs |
paulb@4 | 6 | |
class Parser:

    """Read a file of whitespace-separated words and emit the distinct ones.

    'dict_location' is the path of the file to read. 'encoding', when not
    None, names the codec used to decode the file (via codecs.open);
    otherwise the file is opened with the built-in open and its defaults.
    """

    def __init__(self, dict_location, encoding=None):
        self.dict_location = dict_location
        self.encoding = encoding

    def _get_file_content(self):

        "Return the whole content of the file at 'dict_location'."

        if self.encoding is None:
            f = open(self.dict_location)
        else:
            f = codecs.open(self.dict_location, encoding=self.encoding)

        # Use the file as a context manager so that it is closed even when
        # reading or decoding raises.
        with f:
            return f.read()

    def send_entries(self, channel):

        """Send word entries from the file.

        Each distinct whitespace-separated token is sent exactly once, in
        order of first appearance, as channel.send((token, dict_location)).
        """

        # A set gives constant-time duplicate detection; the order of the
        # sends is still governed by the order of tokens in the file.
        seen = set()
        for token in self._get_file_content().split():
            if token not in seen:
                seen.add(token)
                channel.send((token, self.dict_location))
paulb@4 | 33 | |
class Indexer:

    """Build a character trie of words.

    The index maps a character to a 2-tuple (children, words): 'children'
    is a nested index of the same shape for the next character, and 'words'
    is the list of tokens that end exactly at that node.
    """

    def __init__(self):
        self.index = {}

    def get_index(self):

        "Return the index dictionary (the live object, not a copy)."

        return self.index

    def add_entry(self, entry):

        """Add the given word 'entry' (token, dict_location) to the index.

        Only the token is stored; the location part of the entry is
        unpacked but not recorded. A token already present at its node is
        not added again. An empty token has no characters to index and is
        ignored.
        """

        token, dict_location = entry

        # With no characters there is no node to attach the word to.
        if not token:
            return

        slot = self.index
        words = None
        for c in token:
            # 'in' is used rather than dict.has_key, which Python 3 removed.
            if c not in slot:
                slot[c] = {}, []
            slot, words = slot[c]

        # 'words' now refers to the list held by the node of the token's
        # final character.
        if token not in words:
            words.append(token)
paulb@4 | 54 | |
class Searcher:

    """Look up words by prefix in a character-trie index.

    'index' maps a character to a 2-tuple (children, words), where
    'children' is a nested index of the same shape and 'words' is the list
    of words ending at that node.
    """

    def __init__(self, index):
        self.index = index

    def find(self, pattern):

        """Find words beginning with the given 'pattern'.

        Returns a new list: first the words ending exactly at the node
        reached by 'pattern', then all words below that node in sorted
        character order. Returns an empty list when no indexed word starts
        with 'pattern'. An empty 'pattern' yields every word in the index.
        """

        slot = self.index
        words = []

        for c in pattern:
            # 'in' is used rather than dict.has_key, which Python 3 removed.
            if c not in slot:
                return []
            slot, words = slot[c]

        # Copy so that callers cannot mutate the list stored in the index.
        results = list(words)
        results += self.get_all_words(slot)
        return results

    def get_all_words(self, slot):

        """Get all words under the given index 'slot'.

        Child characters are visited in sorted order; for each child, the
        words ending at it precede the words found deeper below it.
        """

        all_words = []
        # sorted() works on Python 2 key lists and Python 3 key views alike,
        # unlike calling .sort() on the result of keys().
        for c in sorted(slot):
            this_slot, words = slot[c]
            all_words += words
            all_words += self.get_all_words(this_slot)
        return all_words
paulb@4 | 88 | |
paulb@4 | 89 | # vim: tabstop=4 expandtab shiftwidth=4 |