1 #!/usr/bin/env python 2 3 """ 4 Phrase iterators providing navigation over common positions for a number of 5 different terms. 6 7 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk> 8 9 This program is free software; you can redistribute it and/or modify it under 10 the terms of the GNU General Public License as published by the Free Software 11 Foundation; either version 3 of the License, or (at your option) any later 12 version. 13 14 This program is distributed in the hope that it will be useful, but WITHOUT ANY 15 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 16 PARTICULAR PURPOSE. See the GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License along 19 with this program. If not, see <http://www.gnu.org/licenses/>. 20 """ 21 22 from itermerge import itermerge 23 from bisect import insort_right 24 25 class CommonIterator(itermerge): 26 27 """ 28 Iteration over many terms, driving the search for term co-occurrences using 29 the least frequent term. 30 """ 31 32 def _add_seq(self, sequence, i): 33 34 "Store the details of the given 'sequence' at position 'i'." 35 36 insort_right(self.iters, (len(sequence), i, iter(sequence))) 37 38 def next(self): 39 40 """ 41 Return a tuple containing a document identifier and a list of position 42 lists for all terms. 43 """ 44 45 if self.iters: 46 while 1: 47 48 # Get the next document for the lowest frequency term. 49 50 freq, i, it = self.iters[0] 51 doc, positions = it.next() 52 values = [(i, positions)] 53 54 # Attempt to find the other terms in this document. 55 56 for freq, i, it in self.iters[1:]: 57 positions = it.from_document(doc) 58 59 # Insert position details if appropriate. 60 61 if positions is not None: 62 insort_right(values, (i, positions)) 63 64 # Otherwise, reject this document. 65 66 else: 67 break 68 else: 69 return doc, [positions for (i, positions) in values] 70 else: 71 raise StopIteration 72 73 def close(self): 74 for freq, i, it in self.iters: 75 if hasattr(it, "close"): 76 it.close() 77 self.iters = [] 78 79 class PhraseIterator(CommonIterator): 80 81 "Phrase iteration using the phrase filter." 82 83 def __init__(self, sequences): 84 CommonIterator.__init__(self, sequences) 85 self.current_doc = None 86 self.current_positions = None 87 88 def next(self): 89 90 """ 91 Return a tuple containing a document identifier and a list of term 92 positions. 93 """ 94 95 while 1: 96 if self.current_doc is None: 97 self.current_doc, all_positions = CommonIterator.next(self) 98 99 # Handle incomplete phrases. 100 101 try: 102 self.current_positions = PhraseFilter(all_positions) 103 except StopIteration: 104 self.current_doc = None 105 continue 106 107 # Return new phrases. 108 109 try: 110 return self.current_doc, self.current_positions.next() 111 except StopIteration: 112 self.current_doc = None 113 114 class PhraseFilter(itermerge): 115 116 "Filter phrase suggestions according to position information." 117 118 def _add_seq(self, sequence, i): 119 120 "Store the details of the given 'sequence' at position 'i'." 121 122 it = iter(sequence) 123 self._add_next(it.next, i) 124 125 def _add_next(self, next, i): 126 127 """ 128 Store the current value for an iterator, alongside the means of 129 getting the next value - the 'next' method - together with the 130 iterator's position 'i'. 131 """ 132 133 # Allow StopIteration to be raised. 134 135 insort_right(self.iters, (next(), i, next)) 136 137 def next(self): 138 if self.iters: 139 while 1: 140 current, first_token, next = self.iters[0] 141 values = [current] 142 last = current 143 last_token = first_token 144 145 # Find a sequence of positions providing a phrase. 146 147 for current, current_token, _next in self.iters[1:]: 148 if not self.is_phrase_position(last, last_token, current, current_token): 149 break 150 values.append(current) 151 last = current 152 last_token = current_token 153 else: 154 del self.iters[0] 155 156 # Handle future end of iteration. 157 158 try: 159 self._add_next(next, first_token) 160 except StopIteration: 161 self.iters = [] 162 163 return values 164 165 del self.iters[0] 166 self._add_next(next, first_token) 167 else: 168 raise StopIteration 169 170 def is_phrase_position(self, last, last_token, current, current_token): 171 last_position, last_preceding = last 172 position, preceding = current 173 return position - last_position <= 1 and current_token > last_token 174 175 # vim: tabstop=4 expandtab shiftwidth=4