pprocess (annotate tests/Dict.py in 427c42d0e3c8)

pprocess

Annotated tests/Dict.py

161:427c42d0e3c8

2009-06-10

Paul Boddie

Added new example.

paulb@4	1	#!/usr/bin/env python
paulb@4	2
paulb@4	3	"A simple file indexer."
paulb@4	4
paulb@4	5	import codecs
paulb@37	6	import time
paulb@4	7
class Parser:

    "A parser which splits files into words and sends them over a channel."

    def __init__(self, filenames, encoding=None, delay=None):

        """
        Initialise the parser with the given 'filenames'. If 'encoding' is
        given, each file is opened using codecs.open with that encoding;
        otherwise the built-in open is used. If 'delay' is given (and is
        true), it is passed to time.sleep while sending entries.
        """

        self.filenames = filenames
        self.encoding = encoding
        self.delay = delay

    def _get_file_content(self, filename):

        "Return the entire content of the file having the given 'filename'."

        if self.encoding is None:
            f = open(filename)
        else:
            f = codecs.open(filename, encoding=self.encoding)

        # Make sure that the file is closed even if reading fails.

        try:
            return f.read()
        finally:
            f.close()

    def send_entries(self, channel):

        """
        Send word entries from the files. For each file, every distinct
        stripped token is sent once over 'channel' as a (token, filename)
        tuple, in order of first appearance. Tokens which are empty after
        stripping are sent like any other token.
        """

        for filename in self.filenames:
            tokens = self._get_file_content(filename).split()

            # Remember the words already sent for this file in a set so that
            # each membership test does not scan all previous words.

            seen = set()
            for token in tokens:
                token = self._strip(token)
                if token not in seen:
                    channel.send((token, filename))
                    seen.add(token)

            # Introduce a delay to simulate hard work.
            # NOTE(review): the indentation of this listing was lost; the delay
            # is assumed to apply once per file - confirm against the original.

            if self.delay:
                time.sleep(self.delay)

    def _strip(self, token):

        """
        Return the first run of alphanumeric characters found in 'token'.
        Leading non-alphanumeric symbols are skipped and the result ends at
        the first non-alphanumeric symbol following the run, meaning that
        "don't" yields "don". An empty string is returned if 'token' contains
        no alphanumeric characters.
        """

        characters = []
        in_alphanum = 0
        for c in token:
            if c.isalpha() or c.isdigit():
                in_alphanum = 1
                characters.append(c)
            elif in_alphanum:

                # The run has ended: ignore the rest of the token.

                break
        return "".join(characters)
paulb@18	43	def _strip(self, token):
paulb@18	44
paulb@18	45	"Return the token stripped of non-alphanumeric symbols at each end."
paulb@18	46
paulb@18	47	characters = []
paulb@18	48	in_alphanum = 0
paulb@18	49	for c in token:
paulb@18	50	if not c.isalpha() and not c.isdigit():
paulb@18	51	if in_alphanum:
paulb@18	52	break
paulb@18	53	else:
paulb@18	54	in_alphanum = 1
paulb@18	55	characters.append(c)
paulb@18	56	return "".join(characters)
paulb@18	57
class Indexer:

    """
    An index of words arranged character by character. Each level of the
    index is a dictionary mapping a character to a (slot, words) tuple where
    'slot' is the dictionary for the next level and 'words' maps each word
    ending at that character to the list of filenames recorded for it.
    """

    def __init__(self):
        self.index = {}

    def get_index(self):

        "Return the top-level dictionary of the index."

        return self.index

    def add_entry(self, entry):

        """
        Add the given word 'entry' (token, filename) to the index. Entries
        having an empty token are ignored. The filename is appended each time
        an entry is added, so repeated entries produce repeated filenames.
        """

        token, filename = entry

        if not token:
            return

        # Descend one level per character, creating missing levels on the way.
        # The token is not empty here, so 'words' is always bound by the loop.

        slot = self.index
        for c in token:
            if c not in slot:
                slot[c] = {}, {}
            slot, words = slot[c]

        words.setdefault(token, []).append(filename)
paulb@4	83
class Searcher:

    """
    A searcher over an index whose levels are dictionaries mapping a
    character to a (slot, words) tuple, 'slot' being the next level and
    'words' a dictionary of the words ending at that character.
    """

    def __init__(self, index):
        self.index = index

    def find(self, pattern):

        """
        Find words beginning with the given 'pattern'. A dictionary combining
        the 'words' dictionaries of all matching positions in the index is
        returned. An empty 'pattern' matches every word in the index.

        Where no word begins with 'pattern', an empty list (not a dictionary)
        is returned; this is retained for compatibility with existing callers.
        """

        slot = self.index
        words = {}

        for c in pattern:
            if c not in slot:
                return []
            slot, words = slot[c]

        # Combine the words ending exactly at the pattern with all longer ones.

        results = {}
        results.update(words)
        results.update(self.get_all_words(slot))
        return results

    def get_all_words(self, slot):

        """
        Get all words under the given index 'slot', returning a dictionary
        which combines the 'words' dictionaries found at every level beneath
        it. Characters are visited in sorted order.
        """

        all_words = {}
        for c in sorted(slot):
            this_slot, words = slot[c]
            all_words.update(words)
            all_words.update(self.get_all_words(this_slot))
        return all_words
paulb@4	117
paulb@4	118	# vim: tabstop=4 expandtab shiftwidth=4