1 #!/usr/bin/env python 2 3 """ 4 Phrase iterators providing navigation over common positions for a number of 5 different terms. 6 7 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk> 8 9 This program is free software; you can redistribute it and/or modify it under 10 the terms of the GNU General Public License as published by the Free Software 11 Foundation; either version 3 of the License, or (at your option) any later 12 version. 13 14 This program is distributed in the hope that it will be useful, but WITHOUT ANY 15 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 16 PARTICULAR PURPOSE. See the GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License along 19 with this program. If not, see <http://www.gnu.org/licenses/>. 20 """ 21 22 from itermerge import itermerge 23 from bisect import insort_right 24 25 class CommonIterator(itermerge): 26 27 """ 28 Iteration over many terms, driving the search for term co-occurrences using 29 the least frequent term. 30 """ 31 32 def _add_seq(self, sequence, i): 33 34 "Store the details of the given 'sequence' at position 'i'." 35 36 insort_right(self.iters, (len(sequence), i, iter(sequence))) 37 38 def next(self): 39 40 """ 41 Return a tuple containing a document identifier and a list of position 42 lists for all terms. 43 """ 44 45 if self.iters: 46 while 1: 47 48 # Get the next document for the lowest frequency term. 49 50 freq, i, it = self.iters[0] 51 doc, positions = it.next() 52 values = [(i, positions)] 53 54 # Attempt to find the other terms in this document. 55 56 for freq, i, it in self.iters[1:]: 57 positions = it.from_document(doc) 58 59 # Insert position details if appropriate. 60 61 if positions is not None: 62 values.append((i, positions)) 63 64 # Otherwise, reject this document. 65 66 else: 67 break 68 else: 69 values.sort() 70 return doc, [positions for (i, positions) in values] 71 else: 72 raise StopIteration 73 74 def close(self): 75 for freq, i, it in self.iters: 76 if hasattr(it, "close"): 77 it.close() 78 self.iters = [] 79 80 class PhraseIterator(CommonIterator): 81 82 "Phrase iteration using the phrase filter." 83 84 def __init__(self, sequences, filter=None): 85 CommonIterator.__init__(self, sequences) 86 self.current_doc = None 87 self.current_positions = None 88 self.filter = filter or PhraseFilter 89 90 def next(self): 91 92 """ 93 Return a tuple containing a document identifier and a list of term 94 positions. 95 """ 96 97 while 1: 98 if self.current_doc is None: 99 self.current_doc, all_positions = CommonIterator.next(self) 100 101 # Handle incomplete phrases. 102 103 try: 104 self.current_positions = self.filter(all_positions) 105 except StopIteration: 106 self.current_doc = None 107 continue 108 109 # Return new phrases. 110 111 try: 112 return self.current_doc, self.current_positions.next() 113 except StopIteration: 114 self.current_doc = None 115 116 class PhraseFilter(itermerge): 117 118 "Filter phrase suggestions according to position information." 119 120 def _add_seq(self, sequence, i): 121 122 "Store the details of the given 'sequence' at position 'i'." 123 124 it = iter(sequence) 125 self._add_next(it.next, i) 126 127 def _add_next(self, next, i): 128 129 """ 130 Store the current value for an iterator, alongside the means of 131 getting the next value - the 'next' method - together with the 132 iterator's position 'i'. 133 """ 134 135 # Allow StopIteration to be raised. 136 137 insort_right(self.iters, (next(), i, next)) 138 139 def next(self): 140 if self.iters: 141 while 1: 142 current, first_token, next = self.iters[0] 143 values = [current] 144 last = current 145 last_token = first_token 146 147 # Find a sequence of positions providing a phrase. 148 149 for current, current_token, _next in self.iters[1:]: 150 if not self.is_phrase_position(last, last_token, current, current_token): 151 break 152 values.append(current) 153 last = current 154 last_token = current_token 155 else: 156 del self.iters[0] 157 158 # Handle future end of iteration. 159 160 try: 161 self._add_next(next, first_token) 162 except StopIteration: 163 self.iters = [] 164 165 return values 166 167 del self.iters[0] 168 self._add_next(next, first_token) 169 else: 170 raise StopIteration 171 172 def is_phrase_position(self, last, last_token, current, current_token): 173 if current_token <= last_token: 174 return 0 175 176 # NOTE: For position sequences, assume that the first value is the token 177 # NOTE: index/position. 178 179 if isinstance(last, (list, tuple)): 180 return current[0] - last[0] == 1 181 else: 182 return current - last == 1 183 184 # vim: tabstop=4 expandtab shiftwidth=4