# HG changeset patch
# User paulb
# Date 1127924149 0
# Node ID 4af6ce77fa1426829e8d789d03d596636cf24038
# Parent  3a227f484f48045cc4df5e842d5449dd3e2012a6
[project @ 2005-09-28 16:15:49 by paulb]
Added token stripping to produce mostly genuine words.
Added a trace statement in the created process to signal the end of
processing.

diff -r 3a227f484f48 -r 4af6ce77fa14 tests/Dict.py
--- a/tests/Dict.py	Wed Sep 28 16:15:15 2005 +0000
+++ b/tests/Dict.py	Wed Sep 28 16:15:49 2005 +0000
@@ -27,10 +27,26 @@
 
         words = []
         for token in tokens:
+            token = self._strip(token)
             if token not in words:
                 channel.send((token, self.dict_location))
                 words.append(token)
 
+    def _strip(self, token):
+
+        "Return the first run of alphanumeric characters in the token, dropping any symbols before it and everything after it."
+
+        characters = []
+        in_alphanum = 0
+        for c in token:
+            if not c.isalpha() and not c.isdigit():
+                if in_alphanum:
+                    break
+            else:
+                in_alphanum = 1
+                characters.append(c)
+        return "".join(characters)
+
 class Indexer:
     def __init__(self):
         self.index = {}
diff -r 3a227f484f48 -r 4af6ce77fa14 tests/start_indexer.py
--- a/tests/start_indexer.py	Wed Sep 28 16:15:15 2005 +0000
+++ b/tests/start_indexer.py	Wed Sep 28 16:15:49 2005 +0000
@@ -7,6 +7,7 @@
     print "Parsing", filename
     parser = Parser(filename)
     parser.send_entries(channel)
+    print "Done", filename
 
 def get_searcher(filenames, window_size):
     master_index = Indexer()