iixr (annotate iixr/phrases.py in c4da9505f73e)

iixr

Annotated iixr/phrases.py

85:c4da9505f73e

2011-01-25

Paul Boddie

Added a threshold or interval which causes the term dictionary to be flushed when a certain number of document positions have been recorded. Updated the copyright information.

paul@60	1	#!/usr/bin/env python
paul@60	2
paul@60	3	"""
paul@60	4	Phrase iterators providing navigation over common positions for a number of
paul@60	5	different terms.
paul@60	6
paul@82	7	Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
paul@60	8
paul@60	9	This program is free software; you can redistribute it and/or modify it under
paul@60	10	the terms of the GNU General Public License as published by the Free Software
paul@60	11	Foundation; either version 3 of the License, or (at your option) any later
paul@60	12	version.
paul@60	13
paul@60	14	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@60	15	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@60	16	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@60	17
paul@60	18	You should have received a copy of the GNU General Public License along
paul@60	19	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@60	20	"""
paul@60	21
paul@60	22	from itermerge import itermerge
paul@60	23	from bisect import insort_right
paul@60	24
class CommonIterator(itermerge):

    """
    Iterate over the documents in which every one of a number of terms occurs.
    The search is driven by the sequence reporting the smallest length, with
    the remaining sequences being consulted for each document it provides.
    """

    def _add_seq(self, sequence, i):

        """
        Register the given 'sequence', originally supplied at position 'i'.
        Entries are kept ordered by sequence length so that the shortest
        sequence is always found first in the collection.
        """

        entry = (len(sequence), i, iter(sequence))
        insort_right(self.iters, entry)

    def next(self):

        """
        Return a tuple of the form (document identifier, position lists) where
        the position lists appear in the order in which the terms were
        originally supplied.

        StopIteration is raised when no sequences are held or when the driving
        sequence has no more documents to offer.
        """

        if not self.iters:
            raise StopIteration

        while True:

            # Take the next document from the first (shortest) sequence.

            _length, index, driver = self.iters[0]
            doc, positions = driver.next()
            found = [(index, positions)]

            # Ask every other sequence for its positions in that document.
            # A result of None is treated as the term being absent.

            for _length, index, other in self.iters[1:]:
                positions = other.from_document(doc)

                # Absent term: abandon this document and try the next one.

                if positions is None:
                    break

                found.append((index, positions))

            # All terms were present: restore the original term ordering.

            else:
                found.sort()
                return doc, [positions for (index, positions) in found]

    def close(self):

        "Close any underlying iterators supporting this and forget them all."

        for _length, _index, it in self.iters:
            if hasattr(it, "close"):
                it.close()

        self.iters = []
paul@61	79
class PhraseIterator(CommonIterator):

    """
    Iteration over phrases: documents containing all terms are obtained from
    the common iterator, and a filter is then applied to the term positions
    found in each such document.
    """

    def __init__(self, sequences, filter=None):

        """
        Initialise the iterator with the given 'sequences'. The optional
        'filter' is a callable accepting a list of position lists and
        returning an object with a 'next' method; PhraseFilter is used where
        no filter (or a false value) is given.
        """

        CommonIterator.__init__(self, sequences)

        # The document currently being examined along with the filter
        # instance producing phrases for it.

        self.current_doc = None
        self.current_positions = None

        if filter:
            self.filter = filter
        else:
            self.filter = PhraseFilter

    def next(self):

        """
        Return a tuple of the form (document identifier, term positions) for
        the next phrase found.

        StopIteration is raised when the underlying common iterator has no
        more documents.
        """

        while True:

            # Without a document in progress, obtain the next candidate.

            if self.current_doc is None:
                self.current_doc, all_positions = CommonIterator.next(self)

                # Constructing the filter may signal the end of iteration
                # straight away, in which case the document is skipped.

                try:
                    self.current_positions = self.filter(all_positions)
                except StopIteration:
                    self.current_doc = None
                    continue

            # Provide the next phrase, or move on when the document has no
            # more phrases to offer.

            try:
                phrase = self.current_positions.next()
            except StopIteration:
                self.current_doc = None
            else:
                return self.current_doc, phrase
paul@62	115
class PhraseFilter(itermerge):

    """
    Filter phrase suggestions according to position information. Each
    sequence supplies the positions of one term; a phrase is a run of
    positions, one per term, in which the terms appear in their original order
    at consecutive positions.
    """

    def _add_seq(self, sequence, i):

        """
        Store the details of the given 'sequence' at position 'i'.

        StopIteration is raised if the sequence is empty.
        """

        it = iter(sequence)
        self._add_next(it.next, i)

    def _add_next(self, next, i):

        """
        Store the current value for an iterator, alongside the means of
        getting the next value - the 'next' method - together with the
        iterator's position 'i'.

        StopIteration is raised if the iterator has no more values.
        """

        # Allow StopIteration to be raised: nothing is stored in that case.

        insort_right(self.iters, (next(), i, next))

    def next(self):

        """
        Return a list of positions, one for each term, constituting the next
        phrase.

        StopIteration is raised when no more phrases can be found. Once this
        has happened, all subsequent calls also raise StopIteration.
        """

        if not self.iters:
            raise StopIteration

        while 1:

            # The entries are ordered by current position, so the first entry
            # is the earliest outstanding position of any term.

            current, first_token, advance = self.iters[0]
            values = [current]
            last = current
            last_token = first_token

            # Find a sequence of positions providing a phrase.

            for current, current_token, _next in self.iters[1:]:
                if not self.is_phrase_position(last, last_token, current, current_token):
                    break
                values.append(current)
                last = current
                last_token = current_token
            else:
                del self.iters[0]

                # Handle future end of iteration: the phrase found is still
                # returned, with exhaustion being reported on the next call.

                try:
                    self._add_next(advance, first_token)
                except StopIteration:
                    self.iters = []

                return values

            # No phrase starts here: advance the earliest entry and retry.

            del self.iters[0]

            # Should the entry's iterator be exhausted, discard the remaining
            # entries so that later calls continue to raise StopIteration
            # instead of resuming with fewer terms than were supplied.

            try:
                self._add_next(advance, first_token)
            except StopIteration:
                self.iters = []
                raise

    def is_phrase_position(self, last, last_token, current, current_token):

        """
        Return whether the 'current' position, belonging to the term with
        index 'current_token', immediately follows the 'last' position,
        belonging to the term with index 'last_token'. The current term must
        have a greater index than the last term.
        """

        if current_token <= last_token:
            return False

        # NOTE: For position sequences, assume that the first value is the token
        # NOTE: index/position.

        if isinstance(last, (list, tuple)):
            return current[0] - last[0] == 1
        else:
            return current - last == 1
paul@62	183
paul@60	184	# vim: tabstop=4 expandtab shiftwidth=4