# HG changeset patch # User Paul Boddie # Date 1263152861 -3600 # Node ID 1077b05c9b76fce32f5d8f6a474061c0898c9418 # Parent 9d836f8a4075e76cf39e4995fdaabe66918cee2b Introduced position dictionary, file and index iterators which capture the relevant result data in caches for particular terms, wrapping the underlying shared file readers. Added section output to the test program in order to make troubleshooting easier. Added a seek method to the File class. diff -r 9d836f8a4075 -r 1077b05c9b76 docs/COPYING.txt --- a/docs/COPYING.txt Fri Jan 08 00:44:59 2010 +0100 +++ b/docs/COPYING.txt Sun Jan 10 20:47:41 2010 +0100 @@ -1,7 +1,7 @@ Licence Agreement for iixr -------------------------- -Copyright (C) 2009 Paul Boddie +Copyright (C) 2009, 2010 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software diff -r 9d836f8a4075 -r 1077b05c9b76 iixr/fields.py --- a/iixr/fields.py Fri Jan 08 00:44:59 2010 +0100 +++ b/iixr/fields.py Sun Jan 10 20:47:41 2010 +0100 @@ -3,7 +3,7 @@ """ Specific classes for storing document information. -Copyright (C) 2009 Paul Boddie +Copyright (C) 2009, 2010 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -96,7 +96,7 @@ later documents. """ - self.f.seek(offset) + self.seek(offset) bad_docnum, fields = self.read_fields() self.last_docnum = docnum return docnum, fields diff -r 9d836f8a4075 -r 1077b05c9b76 iixr/files.py --- a/iixr/files.py Fri Jan 08 00:44:59 2010 +0100 +++ b/iixr/files.py Sun Jan 10 20:47:41 2010 +0100 @@ -3,7 +3,7 @@ """ Generic file access. -Copyright (C) 2009 Paul Boddie +Copyright (C) 2009, 2010 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,6 +38,10 @@ pass + def seek(self, offset): + self.f.seek(offset) + self.reset() + def rewind(self): self.f.seek(0) self.reset() diff -r 9d836f8a4075 -r 1077b05c9b76 iixr/filesystem.py --- a/iixr/filesystem.py Fri Jan 08 00:44:59 2010 +0100 +++ b/iixr/filesystem.py Sun Jan 10 20:47:41 2010 +0100 @@ -3,7 +3,7 @@ """ File access. -Copyright (C) 2009 Paul Boddie +Copyright (C) 2009, 2010 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software diff -r 9d836f8a4075 -r 1077b05c9b76 iixr/positions.py --- a/iixr/positions.py Fri Jan 08 00:44:59 2010 +0100 +++ b/iixr/positions.py Sun Jan 10 20:47:41 2010 +0100 @@ -3,7 +3,7 @@ """ Specific classes for storing position information. -Copyright (C) 2009 Paul Boddie +Copyright (C) 2009, 2010 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -61,6 +61,39 @@ self.last_docnum = docnum +class PositionReader(FileReader): + + "Reading position information within term-specific regions of a file." + + def reset(self): + self.last_docnum = 0 + + def read_positions(self): + + "Read positions, returning a document number and a list of positions." + + # Read the document number delta and add it to the last number. + + self.last_docnum += self.read_number() + + # Read the number of positions. + + npositions = self.read_number() + + # Read the position deltas, adding each previous position to get the + # appropriate collection of absolute positions. + + i = 0 + last = 0 + positions = [] + + while i < npositions: + last += self.read_number() + positions.append(last) + i += 1 + + return self.last_docnum, positions + class PositionIndexWriter(FileWriter): "Writing position index information to files." @@ -92,109 +125,13 @@ self.last_pos_offset = pos_offset self.last_docnum = docnum -# Iterators for position-related files. - -class IteratorBase: - - def __init__(self, count): - self.replenish(count) - - def replenish(self, count): - self.count = count - self.read_documents = 0 - - def __len__(self): - return self.count - - def sort(self): - pass # Stored document positions are already sorted. - - def __iter__(self): - return self - -class PositionReader(FileReader, IteratorBase): - - "Iterating over document positions." - - def __init__(self, f): - FileReader.__init__(self, f) - IteratorBase.__init__(self, 0) # no iteration initially permitted - self.reset() - - def reset(self): - self.last_docnum = 0 - - def seek(self, offset, count): - - """ - Seek to 'offset' in the file, limiting the number of documents available - for reading to 'count'. - """ - - self.f.seek(offset) - self.replenish(count) - self.reset() +class PositionIndexReader(FileReader): - def read_positions(self): - - "Read positions, returning a document number and a list of positions." - - # Read the document number delta and add it to the last number. - - self.last_docnum += self.read_number() - - # Read the number of positions. - - npositions = self.read_number() - - # Read the position deltas, adding each previous position to get the - # appropriate collection of absolute positions. - - i = 0 - last = 0 - positions = [] - - while i < npositions: - last += self.read_number() - positions.append(last) - i += 1 - - return self.last_docnum, positions - - def next(self): - - "Read positions for a single document." - - if self.read_documents < self.count: - self.read_documents += 1 - return self.read_positions() - else: - raise StopIteration - -class PositionIndexReader(FileReader, IteratorBase): - - "Iterating over document positions." - - def __init__(self, f): - FileReader.__init__(self, f) - IteratorBase.__init__(self, 0) # no iteration initially permitted - self.reset() + "Reading position index information within term-specific regions of a file." def reset(self): self.last_docnum = 0 self.last_pos_offset = 0 - self.section_count = 0 - - def seek(self, offset, doc_frequency): - - """ - Seek to 'offset' in the file, limiting the number of documents available - for reading to 'doc_frequency'. - """ - - self.f.seek(offset) - self.replenish(doc_frequency) - self.reset() def read_positions(self): @@ -217,16 +154,112 @@ return self.last_docnum, self.last_pos_offset, count +# Iterators for position-related files. + +class IteratorBase: + + "Support for iterating over results." + + def __init__(self, reader): + + "Initialise the iterator using the given 'reader'." + + self.reader = reader + self.replenish(0) # no iteration initially permitted + + def replenish(self, count): + + "Replenish the iterator with 'count' results." + + self.count = count + self.read_documents = 0 + + def __len__(self): + + "Return the total number of results." + + return self.count + + def sort(self): + pass # Stored document positions are already sorted. + + def __iter__(self): + return self + +class PositionIterator(IteratorBase): + + "Iterating over document positions." + + def replenish(self, count): + IteratorBase.replenish(self, count) + + # Fill a cache of positions. + + self.cache = [] + n = 0 + + while n < self.count: + self.cache.append(self.reader.read_positions()) + n += 1 + + def seek(self, offset, count): + + """ + Seek to 'offset' in the file, limiting the number of documents available + for reading to 'count'. + """ + + self.reader.seek(offset) + self.replenish(count) + def next(self): "Read positions for a single document." - self.read_documents += self.section_count if self.read_documents < self.count: - docnum, pos_offset, self.section_count = t = self.read_positions() + positions = self.cache[self.read_documents] + self.read_documents += 1 + return positions + else: + raise StopIteration + +class PositionIndexIterator(IteratorBase): + + "Iterating over document positions." + + def replenish(self, count): + IteratorBase.replenish(self, count) + + # Fill a cache of offsets. + + self.cache = [] + self.current = 0 + n = 0 + + while n < self.count: + docnum, pos_offset, section_count = t = self.reader.read_positions() + self.cache.append(t) + n += section_count + + def seek(self, offset, doc_frequency): + + """ + Seek to 'offset' in the file, limiting the number of documents available + for reading to 'doc_frequency'. + """ + + self.reader.seek(offset) + self.replenish(doc_frequency) + + def next(self): + + "Read positions for a single document." + + if self.current < len(self.cache): + docnum, pos_offset, self.section_count = t = self.cache[self.current] + self.current += 1 return t else: - #assert self.read_documents == self.count # not upheld by from_document raise StopIteration class PositionDictionaryWriter: @@ -311,11 +344,31 @@ class PositionDictionaryReader: - "Iteration over position dictionary entries." + "Access to position dictionary entries through iterators." def __init__(self, position_reader, position_index_reader): self.position_reader = position_reader self.position_index_reader = position_index_reader + + def read_term_positions(self, offset, doc_frequency): + iterator = PositionDictionaryIterator( + PositionIterator(self.position_reader), + PositionIndexIterator(self.position_index_reader) + ) + iterator.seek(offset, doc_frequency) + return iterator + + def close(self): + self.position_reader.close() + self.position_index_reader.close() + +class PositionDictionaryIterator: + + "Iteration over position dictionary entries." + + def __init__(self, position_iterator, position_index_iterator): + self.position_iterator = position_iterator + self.position_index_iterator = position_index_iterator self.reset() def reset(self): @@ -339,9 +392,9 @@ # Seek to the appropriate index entry. - self.position_index_reader.seek(offset, doc_frequency) + self.position_index_iterator.seek(offset, doc_frequency) - # Initialise the current index entry and current position file reader. + # Initialise the current index entry and current position file iterator. self._next_section() self._init_section() @@ -349,7 +402,7 @@ # Sequence methods. def __len__(self): - return len(self.position_index_reader) + return len(self.position_index_iterator) def sort(self): pass @@ -380,23 +433,18 @@ # Either return the next record. try: - return self.position_reader.next() + return self.position_iterator.next() # Or, where a section is finished, get the next section and try again. except StopIteration: - # Where a section follows, update the index reader, but keep - # reading using the same file reader (since the data should just - # follow on from the last section). + # Although, where a single iterator is in use, the file reader + # would be positioned appropriately, this is not guaranteed in a + # multiple iterator situation. self._next_section() - self.position_reader.replenish(self.section_count) - - # Reset the state of the reader to make sure that document - # numbers are correct. - - self.position_reader.reset() + self._init_section() def from_document(self, docnum): @@ -415,7 +463,7 @@ try: if self.next_docnum is None: - self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_reader.next() + self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next() # Read until the next entry is after the desired document number, # or until the end of the results. @@ -423,7 +471,7 @@ while self.next_docnum <= docnum: self._next_read_section() if self.docnum < docnum: - self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_reader.next() + self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next() else: break @@ -436,7 +484,7 @@ try: while 1: - found_docnum, found_positions = self.position_reader.next() + found_docnum, found_positions = self.position_iterator.next() # Return the desired document positions or None (retaining the # positions for the document immediately after). @@ -457,7 +505,7 @@ "Attempt to get the next section in the index." if self.next_docnum is None: - self.docnum, self.pos_offset, self.section_count = self.position_index_reader.next() + self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next() else: self._next_read_section() @@ -473,14 +521,10 @@ def _init_section(self): - "Initialise the reader for the section in the position file." + "Initialise the iterator for the section in the position file." # Seek to the position entry. - self.position_reader.seek(self.pos_offset, self.section_count) - - def close(self): - self.position_reader.close() - self.position_index_reader.close() + self.position_iterator.seek(self.pos_offset, self.section_count) # vim: tabstop=4 expandtab shiftwidth=4 diff -r 9d836f8a4075 -r 1077b05c9b76 iixr/terms.py --- a/iixr/terms.py Fri Jan 08 00:44:59 2010 +0100 +++ b/iixr/terms.py Sun Jan 10 20:47:41 2010 +0100 @@ -3,7 +3,7 @@ """ Specific classes for storing term information. -Copyright (C) 2009 Paul Boddie +Copyright (C) 2009, 2010 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -106,7 +106,7 @@ permits the scanning for later terms from the specified term. """ - self.f.seek(info_offset) + self.seek(info_offset) self.last_term = term self.last_offset = offset @@ -301,8 +301,7 @@ documents equal to the given 'doc_frequency'. """ - self.position_dict_reader.seek(offset, doc_frequency) - return self.position_dict_reader + return self.position_dict_reader.read_term_positions(offset, doc_frequency) # Iterator convenience methods. @@ -329,9 +328,7 @@ """ term, offset, frequency, doc_frequency = self.info_reader.read_term() - - self.position_dict_reader.seek(offset, doc_frequency) - return term, frequency, doc_frequency, self.position_dict_reader + return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency) # Query methods. diff -r 9d836f8a4075 -r 1077b05c9b76 test.py --- a/test.py Fri Jan 08 00:44:59 2010 +0100 +++ b/test.py Sun Jan 10 20:47:41 2010 +0100 @@ -26,7 +26,7 @@ if "clean" in sys.argv: sys.exit(0) -# Test basic data types. +print "- Test basic data types." numbers = [12345678, 0, 1, 127, 128, 255, 256] @@ -43,7 +43,7 @@ print number == n, number, n r.close() -# Test positions. +print "- Test positions." all_doc_positions = [ [ @@ -77,7 +77,7 @@ r.reset() r.close() -# Test position index files. +print "- Test position index files." indexed_positions = [ [ @@ -105,7 +105,7 @@ offsets.append((offset, doc_frequency)) w.close() -r = PositionIndexReader(open("testPI", "rb")) +r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb"))) offsets.reverse() indexed_positions.reverse() for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions): @@ -114,9 +114,9 @@ print docnum == dn, docnum, dn print pos_offset == po, pos_offset, po print count == c, count, c -r.close() +r.reader.close() -# Test position dictionaries. +print "- Test position dictionaries." f = open("testP", "wb") w = PositionWriter(f) @@ -135,12 +135,12 @@ offsets.reverse() all_doc_positions.reverse() for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions): - rd.seek(offset, doc_frequency) - dp = list(rd) + it = rd.read_term_positions(offset, doc_frequency) + dp = list(it) print doc_positions == dp, doc_positions, dp rd.close() -# Test fields. +print "- Test fields." doc_fields = [ (123, ["testing", "fields", "stored", "compressed"]), @@ -166,7 +166,7 @@ print list(enumerate(fields)) == df, list(enumerate(fields)), df r.close() -# Test field index files. +print "- Test field index files." indexed_docs = [ (123, 100000987), @@ -188,7 +188,7 @@ print offset == o, offset, o r.close() -# Test field dictionaries. +print "- Test field dictionaries." f = open("testF", "wb") w = FieldWriter(f) @@ -213,7 +213,7 @@ df = rd.get_fields(docnum) print df is None, df -# (Test sequential access.) +print "- (Test sequential access.)" rd.rewind() for docnum, fields in doc_fields: @@ -222,7 +222,7 @@ print list(enumerate(fields)) == df, list(enumerate(fields)), df rd.close() -# Test terms. +print "- Test terms." terms = [ # term offset frequency doc_frequency @@ -250,7 +250,7 @@ print doc_frequency == df, doc_frequency, df r.close() -# Test terms in index files. +print "- Test terms in index files." indexed_terms = [ # term offset frequency doc_frequency info_offset @@ -279,7 +279,7 @@ print info_offset == i, info_offset, i r.close() -# Test dictionaries with only term data. +print "- Test dictionaries with only term data." f = open("test", "wb") w = TermWriter(f) @@ -314,7 +314,7 @@ t = rd._find_term(term) print t is None, t -# (Test term prefix searching.) +print "- (Test term prefix searching.)" print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"] print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"] @@ -322,7 +322,7 @@ print rd.find_terms("d") == [], rd.find_terms("d"), [] rd.close() -# Test dictionaries with term and position data. +print "- Test dictionaries with term and position data." terms_with_positions = [ ("aardvark", [(1, [2, 45, 96]), (20, [13])]), @@ -374,14 +374,14 @@ dp = rd.find_positions(term) print dp == [], dp -# (Test iterators.) +print "- (Test iterators.)" for term, docnum, positions in position_dict_tests: dp = rd.find_positions(term) pos = dp.from_document(docnum) print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos -# (Test sequential access.) +print "- (Test sequential access.)" rd.rewind() for term, doc_positions in terms_with_positions: @@ -391,7 +391,7 @@ print doc_positions == dp, doc_positions, dp rd.close() -# Test high-level index operations (including merging). +print "- Test high-level index operations (including merging)." docs = [ (1, "The cat sat on the mat"), @@ -434,7 +434,7 @@ rd = index.get_reader() -# (Test searching.) +print "- (Test searching.)" for term, frequency, doc_positions in doc_tests: dp = list(rd.find_positions(term)) @@ -442,20 +442,20 @@ fr = rd.get_frequency(term) print frequency == fr, frequency, fr -# (Test fields.) +print "- (Test fields.)" for docnum, text in docs: df = dict(rd.get_fields(docnum)) print df[123] == text, text, df[123] -# (Test navigation.) +print "- (Test navigation.)" for term, docnum, positions in position_tests: dp = rd.find_positions(term) pos = dp.from_document(docnum) print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos -# (Test phrases.) +print "- (Test phrases.)" for terms, results in phrase_tests: res = list(rd.find_common_positions(terms)) @@ -463,7 +463,7 @@ index.close() -# Test index updates. +print "- Test index updates." index = Index("test_index") index2 = Index("test_index2", 3, 2, 3, 6) @@ -500,7 +500,7 @@ print frequency == fr, frequency, fr index2.close() -# (Test update of an empty index.) +print "- (Test update of an empty index.)" index = Index("test_index") index3 = Index("test_index3")