# HG changeset patch
# User paulb
# Date 1127771956 0
# Node ID c64efeb60986c16c790e355a37b096dee5cd74ff
# Parent d5bb6c75e31fdce6ceecc313859a2cf53464d190
[project @ 2005-09-26 21:59:16 by paulb]
Added a text indexing demonstration using a simple implementation of an indexer along with an example which uses message exchanges to coordinate communications between the parent and child processes.

diff -r d5bb6c75e31f -r c64efeb60986 tests/Dict.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/Dict.py Mon Sep 26 21:59:16 2005 +0000
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+"A simple file indexer."
+
+import codecs
+
+class Indexer:
+    def __init__(self, dict_location, encoding=None):
+        self.dict_location = dict_location
+        self.encoding = encoding
+
+        # Initialisation.
+
+        self.index = self.make_index()
+
+    def get_index(self):
+        return self.index
+
+    def make_index(self):
+
+        "Return a dictionary containing an index structure for the dict."
+
+        if self.encoding is None:
+            f = open(self.dict_location)
+        else:
+            f = codecs.open(self.dict_location, encoding=self.encoding)
+        s = f.read()
+        f.close()
+
+        tokens = s.split()
+        index = {}
+
+        for token in tokens:
+            slot = index
+            for c in token:
+                if not slot.has_key(c):
+                    slot[c] = {}, []
+                slot, words = slot[c]
+
+            if token not in words:
+                words.append(token)
+
+        return index
+
+class Searcher:
+    def __init__(self, index):
+        self.index = index
+
+    def find(self, pattern):
+
+        "Find words beginning with the given 'pattern'."
+
+        slot = self.index
+        words = []
+
+        for c in pattern:
+            if not slot.has_key(c):
+                return []
+            slot, words = slot[c]
+
+        results = []
+        results += words
+        results += self.get_all_words(slot)
+        return results
+
+    def get_all_words(self, slot):
+
+        "Get all words under the given index 'slot'."
+
+        all_words = []
+        keys = slot.keys()
+        keys.sort()
+        for c in keys:
+            this_slot, words = slot[c]
+            all_words += words
+            all_words += self.get_all_words(this_slot)
+        return all_words
+
+def update(index1, index2):
+    for key in index2.keys():
+        if not index1.has_key(key):
+            index1[key] = index2[key]
+        else:
+            slot1, words1 = index1[key]
+            slot2, words2 = index2[key]
+            for word in words2:
+                if not word in words1:
+                    words1.append(word)
+            update(slot1, slot2)
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r d5bb6c75e31f -r c64efeb60986 tests/start_indexer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/start_indexer.py Mon Sep 26 21:59:16 2005 +0000
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+from parallel import start, Exchange
+from Dict import Indexer, Searcher, update
+
+def apply_indexer(channel, filename):
+    print "Indexing", filename
+    indexer = Indexer(filename)
+    channel.send(indexer.get_index())
+
+def get_searcher(filenames):
+    master_index = {}
+
+    # Start indexing.
+
+    channels = []
+    for filename in filenames:
+        channels.append(start(apply_indexer, filename))
+
+    # Start listening for responses.
+
+    exchange = Exchange(channels)
+    while len(channels) > 0:
+        print "Waiting for %d channels..." % len(channels)
+        for channel in exchange.ready():
+            index = channel.receive()
+            update(master_index, index)
+            channels.remove(channel)
+
+    # Provide a search interface.
+
+    return Searcher(master_index)
+
+if __name__ == "__main__":
+    import sys
+    filenames = sys.argv[1:]
+    searcher = get_searcher(filenames)
+    while 1:
+        print "Pattern:",
+        pattern = raw_input()
+        print searcher.find(pattern)
+
+# vim: tabstop=4 expandtab shiftwidth=4