# HG changeset patch
# User Paul Boddie
# Date 1251233055 -7200
# Node ID 67fdbd4c2a90064d0f6a8f939957ee29589c0547
# Parent  e014559eec92a39c0d958647a3237ebb57cfdcee
Added a term information reader and writer.

diff -r e014559eec92 -r 67fdbd4c2a90 iixr.py
--- a/iixr.py	Tue Aug 25 22:10:37 2009 +0200
+++ b/iixr.py	Tue Aug 25 22:44:15 2009 +0200
@@ -18,6 +18,8 @@
 with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 
+from os.path import commonprefix # to find common string prefixes
+
 # Foundation classes.
 
 class File:
@@ -73,6 +75,27 @@
         record = "".join(bytes)
         self.f.write(record)
 
+    def write_unsigned_byte(self, number):
+
+        "Write 'number' (0..255) to the file as one byte, or raise ValueError."
+
+        if not (0 <= number <= 255):
+            raise ValueError("Number %r is out of range." % number)
+
+        self.f.write(chr(number))
+
+    def write_string(self, s):
+
+        "Write 's' (at most 255 long) to the file, preceded by a length byte."
+
+        length = len(s)
+
+        if not (0 <= length <= 255):
+            raise ValueError("String %r is too long." % s)
+
+        self.write_unsigned_byte(length)
+        self.f.write(s)
+
 class FileReader(File):
 
     "Reading basic data types from files."
@@ -99,6 +122,19 @@
 
         return number
 
+    def read_unsigned_byte(self):
+
+        "Read a number from the file, consuming a single byte."
+
+        return ord(self.f.read(1))
+
+    def read_string(self):
+
+        "Read a string written as a length byte followed by that many bytes."
+
+        length = self.read_unsigned_byte()
+        return self.f.read(length)
+
 # Specific classes.
 
 class PositionWriter(FileWriter):
@@ -215,4 +251,66 @@
 
         return doc_positions
 
+class TermWriter(FileWriter):
+
+    "Writing term information to files."
+
+    def reset(self):
+        self.last_term = ""
+        self.last_offset = 0
+
+    def write_term(self, term, offset):
+
+        """
+        Write the given 'term' and its position file 'offset' to the term
+        information file, raising ValueError for terms longer than 255.
+        """
+
+        # Too long terms are not currently supported.
+
+        if len(term) > 255:
+            raise ValueError("Term %r is too long." % term)
+
+        # Write the length of the prefix shared with the last term, then the rest.
+
+        common = len(commonprefix([self.last_term, term]))
+        suffix = term[common:]
+
+        self.write_unsigned_byte(common)
+        self.write_string(suffix)
+
+        # Write the offset delta relative to the previously written offset.
+
+        self.write_number(offset - self.last_offset)
+
+        self.last_term = term
+        self.last_offset = offset
+
+class TermReader(FileReader):
+
+    "Reading term information from files."
+
+    def reset(self):
+        self.last_term = ""
+        self.last_offset = 0
+
+    def read_term(self):
+
+        """
+        Read a term and its position file offset from the term information file.
+        """
+
+        # Read the prefix length and term suffix, rebuilding the full term.
+
+        common = self.read_unsigned_byte()
+        suffix = self.read_string()
+
+        self.last_term = self.last_term[:common] + suffix
+
+        # Read the offset delta and add it to the previous offset.
+
+        self.last_offset += self.read_number()
+
+        return self.last_term, self.last_offset
+
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r e014559eec92 -r 67fdbd4c2a90 test.py
--- a/test.py	Tue Aug 25 22:10:37 2009 +0200
+++ b/test.py	Tue Aug 25 22:44:15 2009 +0200
@@ -64,4 +64,27 @@
     print doc_positions == dp, doc_positions, dp
 r.close()
 
+terms = [
+    ("aardvark", 100000123),
+    ("anteater", 100000456),
+    ("badger", 100000789),
+    ("bull", 1000001234),
+    ("bulldog", 1000002345),
+    ("cat", 1000003456)
+    ]
+
+f = open("test", "wb")
+w = iixr.TermWriter(f)
+for term, offset in terms:
+    w.write_term(term, offset)
+w.close()
+
+f = open("test", "rb") # binary mode, matching the "wb" used when writing
+r = iixr.TermReader(f)
+for term, offset in terms:
+    t, o = r.read_term()
+    print term == t, term, t
+    print offset == o, offset, o
+r.close()
+
 # vim: tabstop=4 expandtab shiftwidth=4