# HG changeset patch
# User Paul Boddie
# Date 1252432176 -7200
# Node ID 147f0472ed01bdd217280ee200e88c9c183f4d8d
# Parent  89404463dda25a94f893e62d18c4657d8cac488a
Introduced a Document class which is instantiated in order to present data to the IndexWriter in a more efficient way through the add_document method, thus rendering the commit_document method obsolete.
Reduced the data flushing threshold to a more reasonable size.
Fixed maximum offset definitions in cases where datasets are empty.

diff -r 89404463dda2 -r 147f0472ed01 iixr.py
--- a/iixr.py	Tue Sep 08 00:13:23 2009 +0200
+++ b/iixr.py	Tue Sep 08 19:49:36 2009 +0200
@@ -37,7 +37,7 @@
 TERM_INTERVAL = 100
 DOCUMENT_INTERVAL = 100
 FIELD_INTERVAL = 100
-FLUSH_INTERVAL = 100000
+FLUSH_INTERVAL = 10000
 
 TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
 FIELD_FILENAMES = "fields", "fields_index"
@@ -877,7 +877,10 @@
 
         # Large numbers for ordering purposes.
 
-        self.max_offset = self.terms[-1][1] + 1
+        if self.terms:
+            self.max_offset = self.terms[-1][1] + 1
+        else:
+            self.max_offset = None
 
     def _find_closest_entry(self, term):
 
@@ -1204,7 +1207,10 @@
 
         # Large numbers for ordering purposes.
 
-        self.max_offset = self.docs[-1][1]
+        if self.docs:
+            self.max_offset = self.docs[-1][1]
+        else:
+            self.max_offset = None
 
     def rewind(self):
         self.field_reader.rewind()
@@ -1509,6 +1515,39 @@
 
 # High-level classes.
 
+class Document:
+
+    "A container of document information."
+
+    def __init__(self, docnum):
+        self.docnum = docnum
+        self.fields = []
+        self.terms = {}
+
+    def add_position(self, term, position):
+
+        """
+        Add a position entry for the given 'term', indicating the given
+        'position'.
+        """
+
+        self.terms.setdefault(term, []).append(position)
+
+    def add_field(self, identifier, value):
+
+        "Add a field having the given 'identifier' and 'value'."
+
+        self.fields.append((identifier, unicode(value))) # convert to string
+
+    def set_fields(self, docnum, fields):
+
+        """
+        Add for the document with the given 'docnum' the given 'fields': a list
+        of tuples each containing an integer identifier and a string value.
+        """
+
+        self.fields = fields
+
 class IndexWriter:
 
     """
@@ -1529,51 +1568,17 @@
 
         self.doc_counter = 0
 
-    def add_position(self, term, docnum, position):
-
-        """
-        Add a position entry for the given 'term' in the document with the given
-        'docnum', indicating the given 'position'.
-        """
-
-        if not self.terms.has_key(term):
-            doc_positions = self.terms[term] = {}
-        else:
-            doc_positions = self.terms[term]
-
-        if not doc_positions.has_key(docnum):
-            doc = doc_positions[docnum] = []
-        else:
-            doc = doc_positions[docnum]
-
-        doc.append(position)
-
-    def add_field(self, docnum, identifier, value):
+    def add_document(self, doc):
 
         """
-        Add for the document with the given 'docnum' a field having the given
-        'identifier' and 'value'.
+        Add the given document 'doc', updating the document counter and flushing
+        terms and fields if appropriate.
         """
 
-        if not self.docs.has_key(docnum):
-            doc_fields = self.docs[docnum] = []
-        else:
-            doc_fields = self.docs[docnum]
-
-        doc_fields.append((identifier, unicode(value))) # convert to string
-
-    def set_fields(self, docnum, fields):
-
-        """
-        Add for the document with the given 'docnum' the given 'fields': a list
-        of tuples each containing an integer identifier and a string value.
-        """
-
-        self.docs[docnum] = fields
-
-    def commit_document(self):
-
-        "Update the document counter, flushing terms and fields if appropriate."
+        for term, positions in doc.terms.items():
+            self.terms.setdefault(term, {})[doc.docnum] = positions
+
+        self.docs[doc.docnum] = doc.fields
 
         self.doc_counter += 1
         if self.flush_interval and self.doc_counter >= self.flush_interval:
diff -r 89404463dda2 -r 147f0472ed01 test.py
--- a/test.py	Tue Sep 08 00:13:23 2009 +0200
+++ b/test.py	Tue Sep 08 19:49:36 2009 +0200
@@ -417,9 +417,11 @@
 index = iixr.Index("test_index")
 wi = index.get_writer(3, 2, 6)
 for docnum, text in docs:
+    doc = iixr.Document(docnum)
     for position, term in enumerate(text.split()):
-        wi.add_position(term, docnum, position)
-    wi.add_field(docnum, 123, text)
+        doc.add_position(term, position)
+    doc.add_field(123, text)
+    wi.add_document(doc)
 wi.close()
 
 rd = index.get_reader()