iixr (file iixr/phrases.py at c4da9505f73e)

     1 #!/usr/bin/env python     2      3 """     4 Phrase iterators providing navigation over common positions for a number of     5 different terms.     6      7 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>     8      9 This program is free software; you can redistribute it and/or modify it under    10 the terms of the GNU General Public License as published by the Free Software    11 Foundation; either version 3 of the License, or (at your option) any later    12 version.    13     14 This program is distributed in the hope that it will be useful, but WITHOUT ANY    15 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    16 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public License along    19 with this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from itermerge import itermerge    23 from bisect import insort_right    24     25 class CommonIterator(itermerge):    26     27     """    28     Iteration over many terms, driving the search for term co-occurrences using    29     the least frequent term.    30     """    31     32     def _add_seq(self, sequence, i):    33     34         "Store the details of the given 'sequence' at position 'i'."    35     36         insort_right(self.iters, (len(sequence), i, iter(sequence)))    37     38     def next(self):    39     40         """    41         Return a tuple containing a document identifier and a list of position    42         lists for all terms.    43         """    44     45         if self.iters:    46             while 1:    47     48                 # Get the next document for the lowest frequency term.    49     50                 freq, i, it = self.iters[0]    51                 doc, positions = it.next()    52                 values = [(i, positions)]    53     54                 # Attempt to find the other terms in this document.    55     56                 for freq, i, it in self.iters[1:]:    57                     positions = it.from_document(doc)    58     59                     # Insert position details if appropriate.    60     61                     if positions is not None:    62                         values.append((i, positions))    63     64                     # Otherwise, reject this document.    65     66                     else:    67                         break    68                 else:    69                     values.sort()    70                     return doc, [positions for (i, positions) in values]    71         else:    72             raise StopIteration    73     74     def close(self):    75         for freq, i, it in self.iters:    76             if hasattr(it, "close"):    77                 it.close()    78         self.iters = []    79     80 class PhraseIterator(CommonIterator):    81     82     "Phrase iteration using the phrase filter."    83     84     def __init__(self, sequences, filter=None):    85         CommonIterator.__init__(self, sequences)    86         self.current_doc = None    87         self.current_positions = None    88         self.filter = filter or PhraseFilter    89     90     def next(self):    91     92         """    93         Return a tuple containing a document identifier and a list of term    94         positions.    95         """    96     97         while 1:    98             if self.current_doc is None:    99                 self.current_doc, all_positions = CommonIterator.next(self)   100    101                 # Handle incomplete phrases.   102    103                 try:   104                     self.current_positions = self.filter(all_positions)   105                 except StopIteration:   106                     self.current_doc = None   107                     continue   108    109             # Return new phrases.   110    111             try:   112                 return self.current_doc, self.current_positions.next()   113             except StopIteration:   114                 self.current_doc = None   115    116 class PhraseFilter(itermerge):   117    118     "Filter phrase suggestions according to position information."   119    120     def _add_seq(self, sequence, i):   121    122         "Store the details of the given 'sequence' at position 'i'."   123    124         it = iter(sequence)   125         self._add_next(it.next, i)   126    127     def _add_next(self, next, i):   128    129         """   130         Store the current value for an iterator, alongside the means of   131         getting the next value - the 'next' method - together with the   132         iterator's position 'i'.   133         """   134    135         # Allow StopIteration to be raised.   136    137         insort_right(self.iters, (next(), i, next))   138    139     def next(self):   140         if self.iters:   141             while 1:   142                 current, first_token, next = self.iters[0]   143                 values = [current]   144                 last = current   145                 last_token = first_token   146    147                 # Find a sequence of positions providing a phrase.   148    149                 for current, current_token, _next in self.iters[1:]:   150                     if not self.is_phrase_position(last, last_token, current, current_token):   151                         break   152                     values.append(current)   153                     last = current   154                     last_token = current_token   155                 else:   156                     del self.iters[0]   157    158                     # Handle future end of iteration.   159    160                     try:   161                         self._add_next(next, first_token)   162                     except StopIteration:   163                         self.iters = []   164    165                     return values   166    167                 del self.iters[0]   168                 self._add_next(next, first_token)   169         else:   170             raise StopIteration   171    172     def is_phrase_position(self, last, last_token, current, current_token):   173         if current_token <= last_token:   174             return 0   175    176         # NOTE: For position sequences, assume that the first value is the token   177         # NOTE: index/position.   178    179         if isinstance(last, (list, tuple)):   180             return current[0] - last[0] == 1   181         else:   182             return current - last == 1   183    184 # vim: tabstop=4 expandtab shiftwidth=4