# HG changeset patch # User Paul Boddie # Date 1252448970 -7200 # Node ID c1b98b5509c0d0e9f3e5ce01518df28049cd1fbb # Parent 667ed14c1807e557148025e8a0baf998d3c4800c Changed the merging classes to take advantage of document-oriented data storage. Changed the term flushing method to access only the terms when preparing sorted data for use in the writing operation. diff -r 667ed14c1807 -r c1b98b5509c0 iixr.py --- a/iixr.py Tue Sep 08 20:29:47 2009 +0200 +++ b/iixr.py Wed Sep 09 00:29:30 2009 +0200 @@ -1311,9 +1311,11 @@ other_term, other_doc_positions, other_partition = entries[i] # For such entries, merge the positions. + # Since document positions should only appear in a single + # partition, a simple update should be sufficient. if other_term == term: - doc_positions = self.merge_positions(doc_positions, other_doc_positions) + doc_positions.update(other_doc_positions) to_update.append(other_partition) i += 1 else: @@ -1334,23 +1336,6 @@ except EOFError: pass - def merge_positions(self, doc_positions, other_doc_positions): - - """ - Merge 'doc_positions' with 'other_doc_positions' so that common document - records contain positions from both collections. - """ - - doc_position_dict = dict(doc_positions) - - for docnum, positions in other_doc_positions: - if doc_position_dict.has_key(docnum): - doc_position_dict[docnum] += positions - else: - doc_position_dict[docnum] = positions - - return doc_position_dict.items() - class FieldDictionaryMerger(Merger): "Merge field files." @@ -1375,28 +1360,11 @@ pass # While entries are available, write them out in order, merging where - # appropriate. + # appropriate. Since fields from one document should only appear in a + # single partition, only one partition will be updated at a time. while entries: docnum, fields, partition = entries[0] - to_update = [partition] - - nentries = len(entries) - i = 1 - - # Find other entries for the term. - - while i < nentries: - other_docnum, other_fields, other_partition = entries[i] - - # For such entries, merge the positions. - - if other_docnum == docnum: - fields += other_fields - to_update.append(other_partition) - i += 1 - else: - break # Write the combined term details. @@ -1404,14 +1372,13 @@ # Update the entries from the affected readers. - del entries[:i] - - for partition in to_update: - try: - docnum, fields = self.readers[partition].read_fields() - insort_right(entries, (docnum, fields, partition)) - except EOFError: - pass + del entries[0] + + try: + docnum, fields = self.readers[partition].read_fields() + insort_right(entries, (docnum, fields, partition)) + except EOFError: + pass # Utility functions. @@ -1604,13 +1571,14 @@ # Get the terms in order. - terms = self.terms.items() + all_terms = self.terms + terms = all_terms.keys() terms.sort() dict_writer = self.get_term_writer() - for term, doc_positions in terms: - doc_positions = doc_positions.items() + for term in terms: + doc_positions = all_terms[term].items() dict_writer.write_term_positions(term, doc_positions) dict_writer.close()