paulb@4 | 1 | #!/usr/bin/env python |
paulb@4 | 2 | |
paulb@4 | 3 | "A simple file indexer." |
paulb@4 | 4 | |
paulb@4 | 5 | import codecs |
paulb@4 | 6 | |
class Parser:

    """
    Read a text file and send each distinct word found in it over a channel.

    'dict_location' is the path of the file to read.  'encoding', if given,
    is the text encoding used to decode the file via codecs.open; if None,
    the file is opened with the built-in open and its default settings.
    """

    def __init__(self, dict_location, encoding=None):
        self.dict_location = dict_location
        self.encoding = encoding

    def _get_file_content(self):

        "Return the entire content of the file at 'dict_location'."

        if self.encoding is None:
            f = open(self.dict_location)
        else:
            f = codecs.open(self.dict_location, encoding=self.encoding)

        # Make sure the file is closed even if reading or decoding fails.
        with f:
            return f.read()

    def send_entries(self, channel):

        """
        Send word entries from the file.

        The file content is split on whitespace and each token is reduced
        using _strip.  For every distinct result, in order of first
        appearance, a (word, dict_location) tuple is passed to
        'channel.send'.  A token with no alphanumeric characters reduces to
        the empty string, which is sent (at most once) like any other word.
        """

        # A set gives constant-time duplicate checks; the order of sending
        # is still the order in which words first appear in the file.
        seen = set()
        for token in self._get_file_content().split():
            word = self._strip(token)
            if word not in seen:
                channel.send((word, self.dict_location))
                seen.add(word)

    def _strip(self, token):

        """
        Return the first run of alphanumeric characters found in 'token'.

        Leading characters that are neither letters nor digits are skipped,
        and scanning stops at the first such character following the run.
        Thus "(hello)" gives "hello", whereas "don't" gives "don".  The empty
        string is returned if 'token' has no letters or digits.
        """

        characters = []
        for c in token:
            if c.isalpha() or c.isdigit():
                characters.append(c)
            elif characters:
                # The alphanumeric run has ended: ignore the rest.
                break
        return "".join(characters)
paulb@18 | 49 | |
class Indexer:

    """
    Build a character-by-character index of words.

    The index is a nested dictionary.  Each level maps a single character to
    a (slot, words) tuple, where 'slot' is the dictionary for the next
    character and 'words' maps each complete word ending at that character
    to the list of locations recorded for it.
    """

    def __init__(self):
        self.index = {}

    def get_index(self):

        "Return the index dictionary itself (not a copy)."

        return self.index

    def add_entry(self, entry):

        """
        Add the given word 'entry' (token, dict_location) to the index.

        Entries with an empty token are ignored.  Adding the same token more
        than once appends each location to that token's list; no duplicate
        checking is done on locations.
        """

        token, dict_location = entry

        if not token:
            return

        # Walk down the index one character at a time, creating levels as
        # needed.  Since the token is not empty, 'words' is always bound
        # after the loop, referring to the words ending at the last character.
        slot = self.index
        for c in token:
            if c not in slot:
                slot[c] = {}, {}
            slot, words = slot[c]

        words.setdefault(token, []).append(dict_location)
paulb@4 | 75 | |
class Searcher:

    """
    Search an index of nested dictionaries.

    Each level of 'index' is expected to map a single character to a
    (slot, words) tuple, where 'slot' is the next level and 'words' maps the
    words ending at that character to their lists of locations.
    """

    def __init__(self, index):
        self.index = index

    def find(self, pattern):

        """
        Find words beginning with the given 'pattern'.

        Return a dictionary mapping each matching word to its list of
        locations; the lists are those held by the index, not copies.  An
        empty dictionary is returned if nothing matches.  An empty 'pattern'
        matches every word in the index.
        """

        slot = self.index
        words = {}

        for c in pattern:
            if c not in slot:
                # Always return a dictionary, whether or not anything matched.
                return {}
            slot, words = slot[c]

        # Words ending exactly at the pattern, plus everything beneath it.
        results = dict(words)
        results.update(self.get_all_words(slot))
        return results

    def get_all_words(self, slot):

        """
        Get all words under the given index 'slot'.

        Return a dictionary mapping each word to its list of locations,
        visiting the characters at each level in sorted order.
        """

        all_words = {}
        for c in sorted(slot):
            this_slot, words = slot[c]
            all_words.update(words)
            all_words.update(self.get_all_words(this_slot))
        return all_words
paulb@4 | 109 | |
paulb@4 | 110 | # vim: tabstop=4 expandtab shiftwidth=4 |