paulb@4 | 1 | #!/usr/bin/env python |
paulb@4 | 2 | |
paulb@4 | 3 | "A simple file indexer." |
paulb@4 | 4 | |
paulb@4 | 5 | import codecs |
paulb@37 | 6 | import time |
paulb@4 | 7 | |
class Parser:

    "A parser reading files and sending the distinct words found in each."

    def __init__(self, filenames, encoding=None, delay=None):

        """
        Initialise the parser with the given 'filenames', an optional
        'encoding' used to decode each file (if None, files are opened using
        the plain open function), and an optional 'delay' in seconds to sleep
        for when sending entries.
        """

        self.filenames = filenames
        self.encoding = encoding
        self.delay = delay

    def _get_file_content(self, filename):

        "Return the entire content of the file having the given 'filename'."

        if self.encoding is None:
            f = open(filename)
        else:
            f = codecs.open(filename, encoding=self.encoding)

        # Make sure the file is closed even if reading or decoding fails.

        try:
            return f.read()
        finally:
            f.close()

    def send_entries(self, channel):

        """
        Send word entries from the files: for each file, each distinct
        stripped token is sent once over the given 'channel' as a
        (token, filename) tuple, in order of first appearance.
        """

        for filename in self.filenames:
            tokens = self._get_file_content(filename).split()

            # Words already sent for this file; a set keeps the membership
            # test cheap however many distinct words the file contains.

            seen = set()
            for token in tokens:
                token = self._strip(token)
                if token not in seen:
                    channel.send((token, filename))
                    seen.add(token)

            # Introduce a delay to simulate hard work.
            # NOTE(review): the listing this code was recovered from had lost
            # its indentation; the delay is assumed to apply once per file -
            # confirm against the original.

            if self.delay:
                time.sleep(self.delay)

    def _strip(self, token):

        """
        Return the first run of alphanumeric characters found in 'token'.
        Non-alphanumeric symbols before the run are skipped, and the first
        such symbol after the run ends it, discarding the rest of the token
        (so "it's" yields "it"). An empty string is returned if the token
        contains no alphanumeric characters.
        """

        characters = []
        in_alphanum = False
        for c in token:
            if c.isalpha() or c.isdigit():
                in_alphanum = True
                characters.append(c)
            elif in_alphanum:
                break
        return "".join(characters)
paulb@18 | 57 | |
class Indexer:

    """
    A builder of a character-by-character index of words.

    The index is a dictionary mapping a character to a (slot, words) tuple,
    where 'slot' is a dictionary of the same form for the following character
    and 'words' maps each word ending at that character to a list of the
    filenames recorded for it.
    """

    def __init__(self):
        self.index = {}

    def get_index(self):

        "Return the index dictionary built so far."

        return self.index

    def add_entry(self, entry):

        "Add the given word 'entry' (token, filename) to the index."

        token, filename = entry

        # Empty tokens have no place in the index.

        if not token:
            return

        # Descend one level per character, creating missing nodes on the way.
        # After the loop, 'words' belongs to the node of the final character.

        slot = self.index
        for c in token:
            if c not in slot:
                slot[c] = {}, {}
            slot, words = slot[c]

        words.setdefault(token, []).append(filename)
paulb@4 | 83 | |
class Searcher:

    """
    A searcher over an index dictionary in which each character maps to a
    (slot, words) tuple: 'slot' is the dictionary for the following character
    and 'words' is a dictionary of the words ending at that character.
    """

    def __init__(self, index):
        self.index = index

    def find(self, pattern):

        """
        Find words beginning with the given 'pattern', returning a dictionary
        mapping each such word to the value stored for it in the index. An
        empty dictionary is returned if no words match; an empty 'pattern'
        matches every word in the index.
        """

        slot = self.index
        words = {}

        # Follow the pattern down through the index, one character at a time.

        for c in pattern:
            if c not in slot:
                return {}
            slot, words = slot[c]

        # Combine the words ending exactly at the pattern with all longer
        # words found beneath it.

        results = {}
        results.update(words)
        results.update(self.get_all_words(slot))
        return results

    def get_all_words(self, slot):

        """
        Get all words under the given index 'slot', returning a dictionary
        combining the words of every node reachable from it.
        """

        all_words = {}

        # Visit the characters in sorted order, taking the words at each node
        # before those of the nodes beneath it.

        for c in sorted(slot):
            this_slot, words = slot[c]
            all_words.update(words)
            all_words.update(self.get_all_words(this_slot))
        return all_words
paulb@4 | 117 | |
paulb@4 | 118 | # vim: tabstop=4 expandtab shiftwidth=4 |