pprocess (annotate tests/Dict.py in 4af6ce77fa14)

pprocess

Annotated tests/Dict.py

18:4af6ce77fa14

2005-09-28

paulb

[project @ 2005-09-28 16:15:49 by paulb] Added token stripping to produce mostly genuine words. Added a trace statement in the created process to signal the end of processing.

paulb@4	1	#!/usr/bin/env python
paulb@4	2
paulb@4	3	"A simple file indexer."
paulb@4	4
paulb@4	5	import codecs
paulb@4	6
class Parser:

    "Read a file, split it into tokens and send each distinct word to a channel."

    def __init__(self, dict_location, encoding=None):

        """
        Initialise the parser with the 'dict_location' of the file to be read
        and an optional 'encoding'. Where 'encoding' is None, the file is
        opened with the built-in open function; otherwise codecs.open is used
        with the given encoding.
        """

        self.dict_location = dict_location
        self.encoding = encoding

    def _get_file_content(self):

        "Return the entire content of the file at the configured location."

        if self.encoding is None:
            f = open(self.dict_location)
        else:
            f = codecs.open(self.dict_location, encoding=self.encoding)

        # Make sure that the file is closed even if reading fails (for
        # example, because of a decoding error).

        try:
            return f.read()
        finally:
            f.close()

    def send_entries(self, channel):

        """
        Send word entries from the file.

        The file content is split on whitespace and each token is passed
        through _strip. For each distinct result, in order of first appearance,
        a (token, dict_location) tuple is given to the 'channel' using its send
        method. A token which strips to the empty string is sent like any
        other (at most once).
        """

        # Remember the tokens already sent in a set so that the duplicate test
        # does not rescan a growing list for every token.

        seen = set()
        for token in self._get_file_content().split():
            token = self._strip(token)
            if token not in seen:
                seen.add(token)
                channel.send((token, self.dict_location))

    def _strip(self, token):

        """
        Return the first run of alphanumeric characters found in 'token'.

        Leading characters which are neither letters nor digits are skipped,
        and the result ends at the first such character following the run, so
        that "(word)," yields "word" but "don't" yields "don". An empty string
        is returned where 'token' has no letters or digits at all.
        """

        characters = []
        for c in token:
            if c.isalpha() or c.isdigit():
                characters.append(c)
            elif characters:
                # The run has ended: ignore the rest of the token.
                break
        return "".join(characters)
paulb@18	49
class Indexer:

    """
    Build a character-by-character index of words.

    The index is a dictionary mapping a character to a (slot, words) tuple,
    where 'slot' is a dictionary of the same form for the next character, and
    where 'words' maps each word ending at that character to a list of the
    locations recorded for it.
    """

    def __init__(self):

        "Initialise the indexer with an empty index."

        self.index = {}

    def get_index(self):

        "Return the index dictionary itself (not a copy)."

        return self.index

    def add_entry(self, entry):

        """
        Add the given word 'entry' (token, dict_location) to the index.

        Entries with an empty token are ignored. Adding the same token more
        than once appends each location to the list held for that token.
        """

        token, dict_location = entry

        if not token:
            return

        # Descend through the index one character at a time, creating the
        # missing levels; 'words' ends up as the word dictionary belonging to
        # the final character of the token.

        slot = self.index
        for c in token:
            if c not in slot:
                slot[c] = {}, {}
            slot, words = slot[c]

        words.setdefault(token, []).append(dict_location)
paulb@4	75
class Searcher:

    """
    Search an index made of nested dictionaries, each mapping a character to a
    (slot, words) tuple where 'slot' is the dictionary for the next character
    and 'words' is a dictionary keyed by the words ending at that character.
    """

    def __init__(self, index):

        "Initialise the searcher with the given 'index'."

        self.index = index

    def find(self, pattern):

        """
        Find words beginning with the given 'pattern'.

        Return a new dictionary mapping each matching word to the value held
        for it in the index. An empty dictionary is returned where nothing
        matches; an empty 'pattern' matches every word in the index.
        """

        slot = self.index
        words = {}

        for c in pattern:
            if c not in slot:
                # Return the same type as for a successful search.
                return {}
            slot, words = slot[c]

        # Combine the words ending exactly at the pattern with all those found
        # further down the index.

        results = {}
        results.update(words)
        results.update(self.get_all_words(slot))
        return results

    def get_all_words(self, slot):

        """
        Get all words under the given index 'slot'.

        Return a new dictionary combining the word dictionaries found at every
        level beneath 'slot', visiting the characters of each level in sorted
        order.
        """

        all_words = {}
        for c in sorted(slot):
            this_slot, words = slot[c]
            all_words.update(words)
            all_words.update(self.get_all_words(this_slot))
        return all_words
paulb@4	109
paulb@4	110	# vim: tabstop=4 expandtab shiftwidth=4