iixr (file iixr/phrases.py at 00995a70f535)

     1 #!/usr/bin/env python     2      3 """     4 Phrase iterators providing navigation over common positions for a number of     5 different terms.     6      7 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>     8      9 This program is free software; you can redistribute it and/or modify it under    10 the terms of the GNU General Public License as published by the Free Software    11 Foundation; either version 3 of the License, or (at your option) any later    12 version.    13     14 This program is distributed in the hope that it will be useful, but WITHOUT ANY    15 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    16 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public License along    19 with this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from itermerge import itermerge    23 from bisect import insort_right    24     25 class CommonIterator(itermerge):    26     27     """    28     Iteration over many terms, driving the search for term co-occurrences using    29     the least frequent term.    30     """    31     32     def _add_seq(self, sequence, i):    33     34         "Store the details of the given 'sequence' at position 'i'."    35     36         insort_right(self.iters, (len(sequence), i, iter(sequence)))    37     38     def next(self):    39     40         """    41         Return a tuple containing a document identifier and a list of position    42         lists for all terms.    43         """    44     45         if self.iters:    46             while 1:    47     48                 # Get the next document for the lowest frequency term.    49     50                 freq, i, it = self.iters[0]    51                 doc, positions = it.next()    52                 values = [(i, positions)]    53     54                 # Attempt to find the other terms in this document.    55     56                 for freq, i, it in self.iters[1:]:    57                     positions = it.from_document(doc)    58     59                     # Insert position details if appropriate.    60     61                     if positions is not None:    62                         insort_right(values, (i, positions))    63     64                     # Otherwise, reject this document.    65     66                     else:    67                         break    68                 else:    69                     return doc, [positions for (i, positions) in values]    70         else:    71             raise StopIteration    72     73     def close(self):    74         for freq, i, it in self.iters:    75             if hasattr(it, "close"):    76                 it.close()    77         self.iters = []    78     79 class PhraseIterator(CommonIterator):    80     81     "Phrase iteration using the phrase filter."    82     83     def __init__(self, sequences):    84         CommonIterator.__init__(self, sequences)    85         self.current_doc = None    86         self.current_positions = None    87     88     def next(self):    89     90         """    91         Return a tuple containing a document identifier and a list of term    92         positions.    93         """    94     95         while 1:    96             if self.current_doc is None:    97                 self.current_doc, all_positions = CommonIterator.next(self)    98     99                 # Handle incomplete phrases.   100    101                 try:   102                     self.current_positions = PhraseFilter(all_positions)   103                 except StopIteration:   104                     self.current_doc = None   105                     continue   106    107             # Return new phrases.   108    109             try:   110                 return self.current_doc, self.current_positions.next()   111             except StopIteration:   112                 self.current_doc = None   113    114 class PhraseFilter(itermerge):   115    116     "Filter phrase suggestions according to position information."   117    118     def _add_seq(self, sequence, i):   119    120         "Store the details of the given 'sequence' at position 'i'."   121    122         it = iter(sequence)   123         self._add_next(it.next, i)   124    125     def _add_next(self, next, i):   126    127         """   128         Store the current value for an iterator, alongside the means of   129         getting the next value - the 'next' method - together with the   130         iterator's position 'i'.   131         """   132    133         # Allow StopIteration to be raised.   134    135         insort_right(self.iters, (next(), i, next))   136    137     def next(self):   138         if self.iters:   139             while 1:   140                 current, first_token, next = self.iters[0]   141                 values = [current]   142                 last = current   143                 last_token = first_token   144    145                 # Find a sequence of positions providing a phrase.   146    147                 for current, current_token, _next in self.iters[1:]:   148                     if not self.is_phrase_position(last, last_token, current, current_token):   149                         break   150                     values.append(current)   151                     last = current   152                     last_token = current_token   153                 else:   154                     del self.iters[0]   155    156                     # Handle future end of iteration.   157    158                     try:   159                         self._add_next(next, first_token)   160                     except StopIteration:   161                         self.iters = []   162    163                     return values   164    165                 del self.iters[0]   166                 self._add_next(next, first_token)   167         else:   168             raise StopIteration   169    170     def is_phrase_position(self, last, last_token, current, current_token):   171         last_position, last_preceding = last   172         position, preceding = current   173         return position - last_position <= 1 and current_token > last_token   174    175 # vim: tabstop=4 expandtab shiftwidth=4