# HG changeset patch # User paulb # Date 1127842105 0 # Node ID cadfcdf9a910accc98e657b12d9390f98be2fbd8 # Parent 0432c66b661e33c11fce6536632289399c7d26ab [project @ 2005-09-27 17:28:25 by paulb] Changed the organisation of the Dict classes so that a Parser object sends word information back to the creating process via a channel, an Indexer object collects word information and builds an index, and a Searcher object provides a front-end to the index. diff -r 0432c66b661e -r cadfcdf9a910 tests/Dict.py --- a/tests/Dict.py Tue Sep 27 17:26:44 2005 +0000 +++ b/tests/Dict.py Tue Sep 27 17:28:25 2005 +0000 @@ -4,43 +4,53 @@ import codecs -class Indexer: +class Parser: def __init__(self, dict_location, encoding=None): self.dict_location = dict_location self.encoding = encoding - # Initialisation. - - self.index = self.make_index() - - def get_index(self): - return self.index - - def make_index(self): - - "Return a dictionary containing an index structure for the dict." - + def _get_file_content(self): if self.encoding is None: f = open(self.dict_location) else: f = codecs.open(self.dict_location, encoding=self.encoding) s = f.read() f.close() + return s - tokens = s.split() + def send_entries(self, channel): + + "Send word entries from the file." + + tokens = self._get_file_content().split() index = {} + words = [] for token in tokens: - slot = index - for c in token: - if not slot.has_key(c): - slot[c] = {}, [] - slot, words = slot[c] - if token not in words: + channel.send((token, self.dict_location)) words.append(token) - return index +class Indexer: + def __init__(self): + self.index = {} + + def get_index(self): + return self.index + + def add_entry(self, entry): + + "Add the given word 'entry' (token, dict_location) to the index." + + token, dict_location = entry + slot = self.index + for c in token: + if not slot.has_key(c): + slot[c] = {}, [] + slot, words = slot[c] + + if token not in words: + words.append(token) class Searcher: def __init__(self, index): @@ -76,16 +86,4 @@ all_words += self.get_all_words(this_slot) return all_words -def update(index1, index2): - for key in index2.keys(): - if not index1.has_key(key): - index1[key] = index2[key] - else: - slot1, words1 = index1[key] - slot2, words2 = index2[key] - for word in words2: - if not word in words1: - words1.append(word) - update(slot1, slot2) - # vim: tabstop=4 expandtab shiftwidth=4 diff -r 0432c66b661e -r cadfcdf9a910 tests/start_indexer.py --- a/tests/start_indexer.py Tue Sep 27 17:26:44 2005 +0000 +++ b/tests/start_indexer.py Tue Sep 27 17:28:25 2005 +0000 @@ -1,22 +1,22 @@ #!/usr/bin/env python from parallel import start, Exchange -from Dict import Indexer, Searcher, update +from Dict import Indexer, Searcher, Parser -def apply_indexer(channel, filename): - print "Indexing", filename - indexer = Indexer(filename) - channel.send(indexer.get_index()) +def apply_parser(channel, filename): + print "Parsing", filename + parser = Parser(filename) + parser.send_entries(channel) def get_searcher(filenames, window_size): - master_index = {} + master_index = Indexer() # Start indexing. for i in range(0, len(filenames), window_size): channels = [] for filename in filenames[i:i + window_size]: - channels.append(start(apply_indexer, filename)) + channels.append(start(apply_parser, filename)) # Start listening for responses. @@ -24,19 +24,18 @@ while exchange.active(): print "Waiting for %d channels..." % exchange.active() for channel in exchange.ready(): - index = channel.receive() - update(master_index, index) - exchange.remove(channel) - channel.close() + entry = channel.receive() + master_index.add_entry(entry) # Provide a search interface. - return Searcher(master_index) + return Searcher(master_index.get_index()) if __name__ == "__main__": import sys, os window_size, directory = int(sys.argv[1]), sys.argv[2] filenames = [os.path.join(directory, filename) for filename in os.listdir(directory)] + filenames = [filename for filename in filenames if os.path.isfile(filename)] searcher = get_searcher(filenames, window_size) while 1: