# HG changeset patch # User paulb # Date 1127924149 0 # Node ID 4af6ce77fa1426829e8d789d03d596636cf24038 # Parent 3a227f484f48045cc4df5e842d5449dd3e2012a6 [project @ 2005-09-28 16:15:49 by paulb] Added token stripping to produce mostly genuine words. Added a trace statement in the created process to signal the end of processing. diff -r 3a227f484f48 -r 4af6ce77fa14 tests/Dict.py --- a/tests/Dict.py Wed Sep 28 16:15:15 2005 +0000 +++ b/tests/Dict.py Wed Sep 28 16:15:49 2005 +0000 @@ -27,10 +27,26 @@ words = [] for token in tokens: + token = self._strip(token) if token not in words: channel.send((token, self.dict_location)) words.append(token) + def _strip(self, token): + + "Return the token stripped of non-alphanumeric symbols at each end." + + characters = [] + in_alphanum = 0 + for c in token: + if not c.isalpha() and not c.isdigit(): + if in_alphanum: + break + else: + in_alphanum = 1 + characters.append(c) + return "".join(characters) + class Indexer: def __init__(self): self.index = {} diff -r 3a227f484f48 -r 4af6ce77fa14 tests/start_indexer.py --- a/tests/start_indexer.py Wed Sep 28 16:15:15 2005 +0000 +++ b/tests/start_indexer.py Wed Sep 28 16:15:49 2005 +0000 @@ -7,6 +7,7 @@ print "Parsing", filename parser = Parser(filename) parser.send_entries(channel) + print "Done", filename def get_searcher(filenames, window_size): master_index = Indexer()