1.1 --- a/tests/Dict.py Thu Sep 29 17:28:37 2005 +0000
1.2 +++ b/tests/Dict.py Thu Sep 29 17:30:53 2005 +0000
1.3 @@ -3,17 +3,19 @@
1.4 "A simple file indexer."
1.5
1.6 import codecs
1.7 +import time
1.8
1.9 class Parser:
1.10 - def __init__(self, dict_location, encoding=None):
1.11 - self.dict_location = dict_location
1.12 + def __init__(self, filenames, encoding=None, delay=None):
1.13 + self.filenames = filenames
1.14 self.encoding = encoding
1.15 + self.delay = delay
1.16
1.17 - def _get_file_content(self):
1.18 + def _get_file_content(self, filename):
1.19 if self.encoding is None:
1.20 - f = open(self.dict_location)
1.21 + f = open(filename)
1.22 else:
1.23 - f = codecs.open(self.dict_location, encoding=self.encoding)
1.24 + f = codecs.open(filename, encoding=self.encoding)
1.25 s = f.read()
1.26 f.close()
1.27 return s
1.28 @@ -22,15 +24,21 @@
1.29
1.30 "Send word entries from the file."
1.31
1.32 - tokens = self._get_file_content().split()
1.33 - index = {}
1.34 + for filename in self.filenames:
1.35 + tokens = self._get_file_content(filename).split()
1.36 + index = {}
1.37
1.38 - words = []
1.39 - for token in tokens:
1.40 - token = self._strip(token)
1.41 - if token not in words:
1.42 - channel.send((token, self.dict_location))
1.43 - words.append(token)
1.44 + words = []
1.45 + for token in tokens:
1.46 + token = self._strip(token)
1.47 + if token not in words:
1.48 + channel.send((token, filename))
1.49 + words.append(token)
1.50 +
1.51 + # Introduce a delay to simulate hard work.
1.52 +
1.53 + if self.delay:
1.54 + time.sleep(self.delay)
1.55
1.56 def _strip(self, token):
1.57
1.58 @@ -56,9 +64,9 @@
1.59
1.60 def add_entry(self, entry):
1.61
1.62 - "Add the given word 'entry' (token, dict_location) to the index."
1.63 + "Add the given word 'entry' (token, filename) to the index."
1.64
1.65 - token, dict_location = entry
1.66 + token, filename = entry
1.67
1.68 if not token:
1.69 return
1.70 @@ -71,7 +79,7 @@
1.71
1.72 if not words.has_key(token):
1.73 words[token] = []
1.74 - words[token].append(dict_location)
1.75 + words[token].append(filename)
1.76
1.77 class Searcher:
1.78 def __init__(self, index):
2.1 --- a/tests/start_indexer.py Thu Sep 29 17:28:37 2005 +0000
2.2 +++ b/tests/start_indexer.py Thu Sep 29 17:30:53 2005 +0000
2.3 @@ -1,30 +1,62 @@
2.4 #!/usr/bin/env python
2.5
2.6 +"A simple text indexing activity."
2.7 +
2.8 from parallel import start, Exchange
2.9 from Dict import Indexer, Searcher, Parser
2.10 import os
2.11
2.12 -def apply_parser(channel, filename):
2.13 - #print "Parsing", filename
2.14 - parser = Parser(filename)
2.15 +def apply_parser(channel, filenames, delay=None):
2.16 +
2.17 + """
2.18 + Apply the parser, returning results through the given 'channel', processing
2.19 + the files with the given 'filenames', and imposing an optional 'delay' in the
2.20 + parser to simulate heavy processing.
2.21 + """
2.22 +
2.23 + #print "Parsing", filenames
2.24 + parser = Parser(filenames, delay=delay)
2.25 parser.send_entries(channel)
2.26 - #print "Done", filename
2.27 + #print "Done", filenames
2.28 +
2.29 +def fill_window(filenames, i, window_size, exchange, slice_size, delay=None):
2.30
2.31 -def fill_window(filenames, i, window_size, exchange):
2.32 - limit = min(len(filenames), i + window_size - len(exchange.active()))
2.33 - while i < limit:
2.34 - channel = start(apply_parser, filenames[i])
2.35 + """
2.36 + Fill a "window" of channels using the given 'filenames', starting from index
2.37 + 'i' in that list, choosing a number of channels limited to 'window_size' and
2.38 + adding those channels to the given 'exchange'. Each channel will be
2.39 + employed by a parser which will be given a number of filenames limited to
2.40 + 'slice_size', and which will be requested to wait periodically if specified
2.41 + by the optional 'delay'.
2.42 + """
2.43 +
2.44 + number = 0
2.45 + limit = len(filenames)
2.46 + active = len(exchange.active())
2.47 + while i < limit and number < window_size - active:
2.48 + j = i + slice_size
2.49 + channel = start(apply_parser, filenames[i:j], delay)
2.50 exchange.add(channel)
2.51 - i = i + 1
2.52 + i = j
2.53 + number += 1
2.54 return i
2.55
2.56 -def get_searcher(filenames, window_size):
2.57 +def get_searcher(filenames, window_size, slice_size, delay=None):
2.58 +
2.59 + """
2.60 + Get a searcher object, providing searching on the contents of the given
2.61 + 'filenames', employing a 'window_size' and 'slice_size' as parameters to
2.62 + respectively control the number of monitored channels in the exchange, and
2.63 + the number of filenames given to each created process. The optional 'delay'
2.64 + is used to simulate heavy processing in each created process.
2.65 + """
2.66 +
2.67 master_index = Indexer()
2.68
2.69 # Start indexing by filling a window with channels.
2.70
2.71 exchange = Exchange()
2.72 - i = fill_window(filenames, 0, window_size, exchange)
2.73 + i = fill_window(filenames, 0, window_size, exchange, slice_size, delay)
2.74
2.75 # Start listening for responses.
2.76
2.77 @@ -34,7 +66,7 @@
2.78 entry = channel.receive()
2.79 master_index.add_entry(entry)
2.80
2.81 - i = fill_window(filenames, i, window_size, exchange)
2.82 + i = fill_window(filenames, i, window_size, exchange, slice_size, delay)
2.83
2.84 # Provide a search interface.
2.85
2.86 @@ -46,7 +78,32 @@
2.87
2.88 # Get the parameters.
2.89
2.90 - window_size, directory = int(sys.argv[1]), sys.argv[2]
2.91 + try:
2.92 + directory = sys.argv[1]
2.93 + except IndexError:
2.94 + print "Please specify a directory where text files reside."
2.95 + print "To investigate other performance factors, you can also specify"
2.96 + print "a window size (eg. 1, 5, 10, ...) indicating the number of"
2.97 + print "channels/processes being used, a slice size (eg. 5, 10, ...)"
2.98 + print "indicating the number of filenames given to each created process"
2.99 + print "and a time delay to simulate heavy processing in each created"
2.100 + print "process (eg. 0.5, 1, ...)."
2.101 + sys.exit(1)
2.102 +
2.103 + if len(sys.argv) > 2:
2.104 + window_size = int(sys.argv[2])
2.105 + else:
2.106 + window_size = 4
2.107 +
2.108 + if len(sys.argv) > 3:
2.109 + slice_size = int(sys.argv[3])
2.110 + else:
2.111 + slice_size = 5
2.112 +
2.113 + if len(sys.argv) > 4:
2.114 + delay = float(sys.argv[4])
2.115 + else:
2.116 + delay = None
2.117
2.118 # Build a list of filenames.
2.119
2.120 @@ -55,7 +112,7 @@
2.121
2.122 # Get a searcher using an index built in parallel.
2.123
2.124 - searcher = get_searcher(filenames, window_size)
2.125 + searcher = get_searcher(filenames, window_size, slice_size, delay)
2.126
2.127 # Present a user interface.
2.128