# HG changeset patch # User Paul Boddie # Date 1251504929 -7200 # Node ID fe7ed6b96612e2f0fd90cb13690e9b317f0383fe # Parent 5e4746613f833ecb14c54547660ac704f11fae3e Added field dictionary and field index readers and writers. Renamed various internal methods. Added document number deltas to field collections in order to support scanning for documents. diff -r 5e4746613f83 -r fe7ed6b96612 iixr.py --- a/iixr.py Fri Aug 28 01:15:17 2009 +0200 +++ b/iixr.py Sat Aug 29 02:15:29 2009 +0200 @@ -149,7 +149,7 @@ return unicode(s, "utf-8") -# Specific classes. +# Specific classes for storing term and position information. class PositionWriter(FileWriter): @@ -336,7 +336,10 @@ def go_to_term(self, term, offset, info_offset): - "Seek past the entry for 'term' having 'offset' to 'info_offset'." + """ + Seek past the entry for 'term' having 'offset' to 'info_offset'. This + permits the scanning for later terms from the specified term. + """ self.f.seek(info_offset) self.last_term = term @@ -399,7 +402,7 @@ self.interval = interval self.entry = 0 - def write_term(self, term, offset): + def _write_term(self, term, offset): """ Write the given 'term' and its position file 'offset' to the term @@ -421,7 +424,7 @@ """ offset = self.position_writer.write_all_positions(doc_positions) - self.write_term(term, offset) + self._write_term(term, offset) def close(self): self.info_writer.close() @@ -449,7 +452,7 @@ self.max_offset = self.terms[-1][1] self.max_info_offset = self.terms[-1][2] - def find_term(self, term): + def _find_term(self, term): "Find the position file offset of 'term' from the term dictionary." @@ -489,7 +492,7 @@ "Return the documents and positions at which the given 'term' is found." - offset = self.find_term(term) + offset = self._find_term(term) if offset is None: return None else: @@ -500,19 +503,28 @@ self.index_reader.close() self.position_reader.close() +# Specific classes for storing document information. 
+ class FieldWriter(FileWriter): "Writing field data to files." - def write_fields(self, fields): + def reset(self): + self.last_docnum = 0 + + def write_fields(self, docnum, fields): """ - Write the given list of 'fields' (strings representing field values). - Return the offset at which the fields are stored. + Write for the given 'docnum', a list of 'fields' (strings representing + field values). Return the offset at which the fields are stored. """ offset = self.f.tell() + # Write the document number delta. + + self.write_number(docnum - self.last_docnum) + # Write the number of fields. self.write_number(len(fields)) @@ -522,15 +534,26 @@ for field in fields: self.write_string(field, 0) # compress + self.last_docnum = docnum return offset class FieldReader(FileReader): "Reading field data from files." + def reset(self): + self.last_docnum = 0 + def read_fields(self): - "Read fields from the file, returning the field values in a list." + """ + Read fields from the file, returning a tuple containing the document + number and a list of field values. + """ + + # Read the document number. + + self.last_docnum += self.read_number() # Read the number of fields. @@ -545,14 +568,139 @@ fields.append(self.read_string(0)) # decompress i += 1 - return fields + return self.last_docnum, fields + + def read_document_fields(self, docnum, offset): - def read_doc_fields(self, offset): - - "Read all fields at the given 'offset." + """ + Read fields for 'docnum' at the given 'offset'. This permits the + retrieval of details for the specified document, as well as scanning for + later documents. + """ self.f.seek(offset) - return self.read_fields() + bad_docnum, fields = self.read_fields() + self.last_docnum = docnum + return docnum, fields + +class FieldIndexWriter(FileWriter): + + "Writing field index details to files." 
+ + def reset(self): + self.last_docnum = 0 + + def write_document(self, docnum, offset): + + """ + Write for the given 'docnum', the 'offset' at which the fields for the + document are stored in the fields file. + """ + + # Write the document number delta and offset. + + self.write_number(docnum - self.last_docnum) + self.write_number(offset) + + self.last_docnum = docnum + +class FieldIndexReader(FileReader): + + "Reading field index details from files." + + def reset(self): + self.last_docnum = 0 + + def read_document(self): + + "Read a document number and field file offset." + + # Read the document number delta and offset. + + self.last_docnum += self.read_number() + offset = self.read_number() + + return self.last_docnum, offset + +class FieldDictionaryWriter: + + "Writing field dictionary details." + + def __init__(self, field_writer, field_index_writer, interval): + self.field_writer = field_writer + self.field_index_writer = field_index_writer + self.interval = interval + self.entry = 0 + + def write_fields(self, docnum, fields): + + "Write details of the document with the given 'docnum' and 'fields'." + + offset = self.field_writer.write_fields(docnum, fields) + + if self.entry % self.interval == 0: + self.field_index_writer.write_document(docnum, offset) + + self.entry += 1 + + def close(self): + self.field_writer.close() + self.field_index_writer.close() + +class FieldDictionaryReader: + + "Reading field dictionary details." + + def __init__(self, field_reader, field_index_reader): + self.field_reader = field_reader + self.field_index_reader = field_index_reader + + self.docs = [] + try: + while 1: + self.docs.append(self.field_index_reader.read_document()) + except EOFError: + pass + + # Large numbers for ordering purposes. + + self.max_offset = self.docs[-1][1] + + def read_fields(self, docnum): + + "Read the fields of the document with the given 'docnum'." 
+ + i = bisect_right(self.docs, (docnum, self.max_offset)) - 1 + + # Get the entry position providing the document or one preceding it. + + if i == -1: + return None + + found_docnum, offset = self.docs[i] + + # Read from the fields file. + + found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset) + + # Scan for the document, if necessary. + + try: + while docnum > found_docnum: + found_docnum, fields = self.field_reader.read_fields() + except EOFError: + pass + + # If the document is found, return the fields. + + if docnum == found_docnum: + return fields + else: + return None + + def close(self): + self.field_reader.close() + self.field_index_reader.close() # High-level classes. diff -r 5e4746613f83 -r fe7ed6b96612 test.py --- a/test.py Fri Aug 28 01:15:17 2009 +0200 +++ b/test.py Sat Aug 29 02:15:29 2009 +0200 @@ -2,6 +2,8 @@ import iixr +# Test basic data types. + numbers = [12345678, 0, 1, 127, 128, 255, 256] f = open("test", "wb") @@ -17,6 +19,8 @@ print number == n, number, n r.close() +# Test positions. + all_doc_positions = [ [ (123, [1, 3, 5, 15, 25]), @@ -64,30 +68,82 @@ print doc_positions == dp, doc_positions, dp r.close() +# Test fields. 
+ doc_fields = [ - ["testing", "fields", "stored", "compressed"], - ["fields", "for a second", "document"] + (123, ["testing", "fields", "stored", "compressed"]), + (456, ["fields", "for a second", "document"]), + (789, ["field value"]), + (1234, []), + (2345, ["abc", "def"]), + (3456, ["apple", "banana", "cherry"]), + (4567, ["drue", "eple"]) ] f = open("testF", "wb") w = iixr.FieldWriter(f) -offsets = [] -for fields in doc_fields: - offsets.append(w.write_fields(fields)) +for docnum, fields in doc_fields: + w.write_fields(docnum, fields) w.close() f = open("testF", "rb") r = iixr.FieldReader(f) -for fields in doc_fields: - df = r.read_fields() - print fields == df, fields, df -offsets.reverse() -doc_fields.reverse() -for offset, fields in zip(offsets, doc_fields): - df = r.read_doc_fields(offset) +for docnum, fields in doc_fields: + dn, df = r.read_fields() + print docnum == dn, docnum, dn print fields == df, fields, df r.close() +# Test field index files. + +indexed_docs = [ + (123, 100000987), + (456, 100004321), + (789, 100008765) + ] + +f = open("testFI", "wb") +w = iixr.FieldIndexWriter(f) +for docnum, offset in indexed_docs: + w.write_document(docnum, offset) +w.close() + +f = open("testFI", "rb") +r = iixr.FieldIndexReader(f) +for docnum, offset in indexed_docs: + dn, o = r.read_document() + print docnum == dn, docnum, dn + print offset == o, offset, o +r.close() + +# Test field dictionaries. 
+ +f = open("testF", "wb") +w = iixr.FieldWriter(f) +f2 = open("testFI", "wb") +w2 = iixr.FieldIndexWriter(f2) +wd = iixr.FieldDictionaryWriter(w, w2, 3) +for docnum, fields in doc_fields: + wd.write_fields(docnum, fields) +wd.close() + +f = open("testF", "rb") +r = iixr.FieldReader(f) +f2 = open("testFI", "rb") +r2 = iixr.FieldIndexReader(f2) +rd = iixr.FieldDictionaryReader(r, r2) +doc_fields_reversed = doc_fields[:] +doc_fields_reversed.reverse() +for docnum, fields in doc_fields_reversed: + df = rd.read_fields(docnum) + print fields == df, fields, df +for docnum in (13579, 246810): + df = rd.read_fields(docnum) + print df is None, df +rd.close() + +# Test terms. + terms = [ ("aardvark", 100000123), ("anteater", 100000456), @@ -111,6 +167,8 @@ print offset == o, offset, o r.close() +# Test terms in index files. + indexed_terms = [ ("aardvark", 100000123, 200000321), ("anteater", 100000456, 200000654), @@ -135,6 +193,8 @@ print info_offset == i, info_offset, i r.close() +# Test dictionaries with only term data. + f = open("test", "wb") w = iixr.TermWriter(f) f2 = open("testI", "wb") @@ -143,7 +203,7 @@ w3 = iixr.PositionWriter(f3) wd = iixr.TermDictionaryWriter(w, w2, w3, 3) for term, offset in terms: - wd.write_term(term, offset) + wd._write_term(term, offset) wd.close() f = open("test", "rb") @@ -156,13 +216,15 @@ terms_reversed = terms[:] terms_reversed.reverse() for term, offset in terms_reversed: - o = rd.find_term(term) + o = rd._find_term(term) print offset == o, offset, o for term in ("dog", "dingo"): - o = rd.find_term(term) + o = rd._find_term(term) print o is None, o rd.close() +# Test dictionaries with term and position data. + terms_with_positions = [ ("aardvark", [(1, [2, 45, 96]), (20, [13])]), ("anteater", [(1, [43, 44])]), @@ -200,6 +262,8 @@ print dp is None, dp rd.close() +# Test high-level index operations. + docs = [ (1, "The cat sat on the mat"), (2, "Every good boy deserves football"),