# HG changeset patch
# User paulb
# Date 1127842105 0
# Node ID cadfcdf9a910accc98e657b12d9390f98be2fbd8
# Parent  0432c66b661e33c11fce6536632289399c7d26ab
[project @ 2005-09-27 17:28:25 by paulb]
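Changed the organisation of the Dict classes so that a Parser object sends
each distinct word in a file back to the creating process via a channel as
a (token, dict_location) entry, an Indexer object collects these entries
one at a time and builds an index, and a Searcher object provides a
front-end to the index. The update function for merging indexes is no
longer needed and has been removed; start_indexer.py now starts parsers
and ignores directory entries that are not regular files.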

diff -r 0432c66b661e -r cadfcdf9a910 tests/Dict.py
--- a/tests/Dict.py	Tue Sep 27 17:26:44 2005 +0000
+++ b/tests/Dict.py	Tue Sep 27 17:28:25 2005 +0000
@@ -4,43 +4,53 @@
 
 import codecs
 
-class Indexer:
+class Parser:
     def __init__(self, dict_location, encoding=None):
         self.dict_location = dict_location
         self.encoding = encoding
 
-        # Initialisation.
-
-        self.index = self.make_index()
-
-    def get_index(self):
-        return self.index
-
-    def make_index(self):
-
-        "Return a dictionary containing an index structure for the dict."
-
+    def _get_file_content(self):
         if self.encoding is None:
             f = open(self.dict_location)
         else:
             f = codecs.open(self.dict_location, encoding=self.encoding)
         s = f.read()
         f.close()
+        return s
 
-        tokens = s.split()
+    def send_entries(self, channel):
+
+        "Send word entries from the file."
+
+        tokens = self._get_file_content().split()
         index = {}
 
+        words = []
         for token in tokens:
-            slot = index
-            for c in token:
-                if not slot.has_key(c):
-                    slot[c] = {}, []
-                slot, words = slot[c]
-
             if token not in words:
+                channel.send((token, self.dict_location))
                 words.append(token)
 
-        return index
+class Indexer:
+    def __init__(self):
+        self.index = {}
+
+    def get_index(self):
+        return self.index
+
+    def add_entry(self, entry):
+
+        "Add the given word 'entry' (token, dict_location) to the index."
+
+        token, dict_location = entry
+        slot = self.index
+        for c in token:
+            if not slot.has_key(c):
+                slot[c] = {}, []
+            slot, words = slot[c]
+
+        if token not in words:
+            words.append(token)
 
 class Searcher:
     def __init__(self, index):
@@ -76,16 +86,4 @@
             all_words += self.get_all_words(this_slot)
         return all_words
 
-def update(index1, index2):
-    for key in index2.keys():
-        if not index1.has_key(key):
-            index1[key] = index2[key]
-        else:
-            slot1, words1 = index1[key]
-            slot2, words2 = index2[key]
-            for word in words2:
-                if not word in words1:
-                    words1.append(word)
-            update(slot1, slot2)
-
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 0432c66b661e -r cadfcdf9a910 tests/start_indexer.py
--- a/tests/start_indexer.py	Tue Sep 27 17:26:44 2005 +0000
+++ b/tests/start_indexer.py	Tue Sep 27 17:28:25 2005 +0000
@@ -1,22 +1,22 @@
 #!/usr/bin/env python
 
 from parallel import start, Exchange
-from Dict import Indexer, Searcher, update
+from Dict import Indexer, Searcher, Parser
 
-def apply_indexer(channel, filename):
-    print "Indexing", filename
-    indexer = Indexer(filename)
-    channel.send(indexer.get_index())
+def apply_parser(channel, filename):
+    print "Parsing", filename
+    parser = Parser(filename)
+    parser.send_entries(channel)
 
 def get_searcher(filenames, window_size):
-    master_index = {}
+    master_index = Indexer()
 
     # Start indexing.
 
     for i in range(0, len(filenames), window_size):
         channels = []
         for filename in filenames[i:i + window_size]:
-            channels.append(start(apply_indexer, filename))
+            channels.append(start(apply_parser, filename))
 
         # Start listening for responses.
 
@@ -24,19 +24,18 @@
         while exchange.active():
             print "Waiting for %d channels..." % exchange.active()
             for channel in exchange.ready():
-                index = channel.receive()
-                update(master_index, index)
-                exchange.remove(channel)
-                channel.close()
+                entry = channel.receive()
+                master_index.add_entry(entry)
 
     # Provide a search interface.
 
-    return Searcher(master_index)
+    return Searcher(master_index.get_index())
 
 if __name__ == "__main__":
     import sys, os
     window_size, directory = int(sys.argv[1]), sys.argv[2]
     filenames = [os.path.join(directory, filename) for filename in os.listdir(directory)]
+    filenames = [filename for filename in filenames if os.path.isfile(filename)]
 
     searcher = get_searcher(filenames, window_size)
     while 1: