paulb@4 | 1 | #!/usr/bin/env python |
paulb@4 | 2 | |
paulb@37 | 3 | "A simple text indexing activity." |
paulb@37 | 4 | |
paulb@40 | 5 | from pprocess import start, Exchange |
paulb@10 | 6 | from Dict import Indexer, Searcher, Parser |
paulb@34 | 7 | import os |
paulb@4 | 8 | |
def apply_parser(channel, filenames, delay=None):

    """
    Parse the files named in 'filenames' and send the resulting entries back
    through the given 'channel'. The optional 'delay' is passed on to the
    parser, where it is used to simulate heavy processing.
    """

    Parser(filenames, delay=delay).send_entries(channel)
paulb@37 | 21 | |
def fill_window(filenames, i, window_size, exchange, slice_size, delay=None):

    """
    Fill a "window" of channels using the given 'filenames', starting from index
    'i' in that list, choosing a number of channels limited to 'window_size' and
    adding those channels to the given 'exchange'. Each channel will be
    employed by a parser which will be given a number of filenames limited to
    'slice_size', and which will be requested to wait periodically if specified
    by the optional 'delay'.

    Return the index from which the next call should continue. This may exceed
    the length of 'filenames' when the final slice is only partly filled.

    Raise ValueError if 'slice_size' is less than 1, since such a value would
    never advance through 'filenames' and callers repeatedly refilling the
    window would create channels without end.
    """

    if slice_size < 1:
        raise ValueError("slice_size must be at least 1, not %r" % (slice_size,))

    limit = len(filenames)

    # Only create as many channels as there are free places in the window,
    # taking into account the channels still active in the exchange.

    available = window_size - len(exchange.active())

    number = 0
    while i < limit and number < available:
        j = i + slice_size
        channel = start(apply_parser, filenames[i:j], delay)
        exchange.add(channel)
        i = j
        number += 1
    return i
paulb@34 | 43 | |
def get_searcher(filenames, window_size, slice_size, delay=None):

    """
    Return a searcher object for the contents of the given 'filenames'. The
    index behind the searcher is built from entries received over channels
    held in an exchange: 'window_size' limits the number of channels monitored
    at any one time, and 'slice_size' limits the number of filenames given to
    each created process. The optional 'delay' is used to simulate heavy
    processing in each created process.
    """

    indexer = Indexer()
    exchange = Exchange()

    # Open the first window of channels, remembering how far through the
    # filenames the allocation has progressed.

    position = fill_window(filenames, 0, window_size, exchange, slice_size, delay)

    # Collect entries for as long as channels remain active, topping up the
    # window after each round of receiving.

    while exchange.active():
        for channel in exchange.ready():
            indexer.add_entry(channel.receive())

        position = fill_window(filenames, position, window_size, exchange, slice_size, delay)

    # Wrap the completed index in a search interface.

    return Searcher(indexer.get_index())
paulb@4 | 75 | |
paulb@4 | 76 | if __name__ == "__main__": |
paulb@8 | 77 | import sys, os |
paulb@12 | 78 | |
paulb@12 | 79 | # Get the parameters. |
paulb@12 | 80 | |
paulb@37 | 81 | try: |
paulb@37 | 82 | directory = sys.argv[1] |
paulb@37 | 83 | except IndexError: |
paulb@37 | 84 | print "Please specify a directory where text files reside." |
paulb@37 | 85 | print "To investigate other performance factors, you can also specify" |
paulb@37 | 86 | print "a window size (eg. 1, 5, 10, ...) indicating the number of" |
paulb@37 | 87 | print "channels/processes being used, a slice size (eg. 5, 10, ...)" |
paulb@37 | 88 | print "indicating the number of filenames given to each created process" |
paulb@37 | 89 | print "and a time delay to simulate heavy processing in each created" |
paulb@37 | 90 | print "process (eg. 0.5, 1, ...)." |
paulb@37 | 91 | sys.exit(1) |
paulb@37 | 92 | |
paulb@37 | 93 | if len(sys.argv) > 2: |
paulb@37 | 94 | window_size = int(sys.argv[2]) |
paulb@37 | 95 | else: |
paulb@37 | 96 | window_size = 4 |
paulb@37 | 97 | |
paulb@37 | 98 | if len(sys.argv) > 3: |
paulb@37 | 99 | slice_size = int(sys.argv[3]) |
paulb@37 | 100 | else: |
paulb@37 | 101 | slice_size = 5 |
paulb@37 | 102 | |
paulb@37 | 103 | if len(sys.argv) > 4: |
paulb@37 | 104 | delay = float(sys.argv[4]) |
paulb@37 | 105 | else: |
paulb@37 | 106 | delay = None |
paulb@12 | 107 | |
paulb@12 | 108 | # Build a list of filenames. |
paulb@12 | 109 | |
paulb@8 | 110 | filenames = [os.path.join(directory, filename) for filename in os.listdir(directory)] |
paulb@10 | 111 | filenames = [filename for filename in filenames if os.path.isfile(filename)] |
paulb@8 | 112 | |
paulb@12 | 113 | # Get a searcher using an index built in parallel. |
paulb@12 | 114 | |
paulb@37 | 115 | searcher = get_searcher(filenames, window_size, slice_size, delay) |
paulb@13 | 116 | |
paulb@12 | 117 | # Present a user interface. |
paulb@12 | 118 | |
paulb@32 | 119 | if "--noprompt" not in sys.argv: |
paulb@32 | 120 | while 1: |
paulb@32 | 121 | print "Pattern:", |
paulb@32 | 122 | pattern = raw_input() |
paulb@32 | 123 | print searcher.find(pattern) |
paulb@4 | 124 | |
paulb@4 | 125 | # vim: tabstop=4 expandtab shiftwidth=4 |