#!/usr/bin/env python

"""A simple file indexer.

Parser splits a file into words and sends (word, location) entries to a
channel, Indexer stores such entries in a character trie, and Searcher
performs prefix lookups on that trie.
"""

import codecs


class Parser:
    """Read a file and emit its distinct words as index entries."""

    def __init__(self, dict_location, encoding=None):
        """Remember the file to parse.

        dict_location -- path of the file to read.
        encoding      -- optional text encoding; when None the file is
                         opened with the built-in open() defaults.
        """
        self.dict_location = dict_location
        self.encoding = encoding

    def _get_file_content(self):
        """Return the whole content of the file at 'dict_location'."""
        if self.encoding is None:
            f = open(self.dict_location)
        else:
            f = codecs.open(self.dict_location, encoding=self.encoding)
        # The context manager closes the file even if read() raises.
        with f:
            return f.read()

    def send_entries(self, channel):
        """Send word entries from the file.

        The content is split on whitespace and each token is passed
        through _strip(). Every distinct stripped token is sent once, in
        order of first appearance, as channel.send((token, dict_location)).
        A token that strips to the empty string is sent like any other.
        """
        seen = set()  # set membership keeps duplicate detection O(1)
        for token in self._get_file_content().split():
            token = self._strip(token)
            if token not in seen:
                seen.add(token)
                channel.send((token, self.dict_location))

    def _strip(self, token):
        """Return the first run of alphanumeric characters in 'token'.

        Leading non-alphanumeric characters are skipped, and scanning stops
        at the first non-alphanumeric character after the run has started,
        so "(hello)," gives "hello" and "don't" gives "don". A token with
        no alphanumeric character gives the empty string.
        """
        characters = []
        for c in token:
            if c.isalpha() or c.isdigit():
                characters.append(c)
            elif characters:
                # The run has started and just ended: drop the rest.
                break
        return "".join(characters)


class Indexer:
    """Build a character trie of words.

    Each trie level is a dictionary mapping a character to a pair
    (child_level, words), where 'words' maps each word ending at that
    node to the list of locations it was added with.
    """

    def __init__(self):
        """Start with an empty index."""
        self.index = {}

    def get_index(self):
        """Return the root dictionary of the trie."""
        return self.index

    def add_entry(self, entry):
        """Add the given word 'entry' (token, dict_location) to the index.

        Entries with an empty token are ignored. Adding the same token
        more than once appends each location to that token's list.
        """
        token, dict_location = entry

        if not token:
            return

        slot = self.index
        for c in token:
            if c not in slot:
                slot[c] = {}, {}
            slot, words = slot[c]

        # 'words' now belongs to the node reached by the last character.
        words.setdefault(token, []).append(dict_location)


class Searcher:
    """Look up words by prefix in a trie built by Indexer."""

    def __init__(self, index):
        """Use 'index', the root dictionary of the trie, for searches."""
        self.index = index

    def find(self, pattern):
        """Find words beginning with the given 'pattern'.

        Returns a dictionary mapping each matching word to its list of
        locations (the same list objects held by the index). An empty
        dictionary is returned when nothing matches; an empty 'pattern'
        matches every word.
        """
        slot = self.index
        words = {}

        for c in pattern:
            if c not in slot:
                # Same (dict) type as the successful case.
                return {}
            slot, words = slot[c]

        results = {}
        results.update(words)
        results.update(self.get_all_words(slot))
        return results

    def get_all_words(self, slot):
        """Get all words under the given index 'slot'.

        Walks the trie level 'slot' recursively, visiting characters in
        sorted order, and returns a dictionary mapping every word found
        to its list of locations.
        """
        all_words = {}
        for c in sorted(slot):
            this_slot, words = slot[c]
            all_words.update(words)
            all_words.update(self.get_all_words(this_slot))
        return all_words

# vim: tabstop=4 expandtab shiftwidth=4