# HG changeset patch # User Paul Boddie # Date 1251399168 -7200 # Node ID 42cc066da2fd477abde431f05d3aa71b73c4be99 # Parent 736ff8a2e02297fa984d788263e5eaaf4f5cf644 Added Unicode conversion, position sorting. Added an Index class which manages files within a directory. Added measures to close methods in order to tolerate repeated invocations. diff -r 736ff8a2e022 -r 42cc066da2fd iixr.py --- a/iixr.py Thu Aug 27 00:02:50 2009 +0200 +++ b/iixr.py Thu Aug 27 20:52:48 2009 +0200 @@ -18,9 +18,15 @@ with this program. If not, see <http://www.gnu.org/licenses/>. """ +from os import mkdir # to determine whether to create indexes +from os.path import exists, join from os.path import commonprefix # to find common string prefixes from bisect import bisect_right # to find terms in the dictionary index +# Constants. + +INTERVAL = 100 + # Foundation classes. class File: @@ -35,7 +41,9 @@ pass def close(self): - self.f.close() + if self.f is not None: + self.f.close() + self.f = None class FileWriter(File): @@ -74,6 +82,11 @@ "Write 's' to the file, recording its length." + # Convert Unicode objects to strings. + + if isinstance(s, unicode): + s = s.encode("utf-8") + length = len(s) if not (0 <= length <= 255): @@ -115,7 +128,10 @@ "Read a string from the file." length = self.read_number() - return self.f.read(length) + + # Convert strings to Unicode objects. + + return unicode(self.f.read(length), "utf-8") # Specific classes. @@ -141,6 +157,10 @@ self.write_number(len(positions)) + # Make sure that the positions are sorted. + + positions.sort() + # Write the position deltas. last = 0 @@ -492,6 +512,8 @@ doc.append(position) def close(self): + if self.dict_writer is None: + return # Get the terms in order. @@ -504,5 +526,63 @@ self.dict_writer.write_term_positions(term, doc_positions) self.dict_writer.close() + self.dict_writer = None + +class Index: + + "An inverted index solution encapsulating the various components." 
+ + def __init__(self, pathname): + self.pathname = pathname + self.reader = None + self.writer = None + + def get_writer(self, interval=INTERVAL): + + "Return a writer, optionally using the given indexing 'interval'." + + if not exists(self.pathname): + mkdir(self.pathname) + + tdf = open(join(self.pathname, "terms"), "wb") + info_writer = TermWriter(tdf) + + tdif = open(join(self.pathname, "index"), "wb") + index_writer = TermIndexWriter(tdif) + + tpf = open(join(self.pathname, "positions"), "wb") + positions_writer = PositionWriter(tpf) + + dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval) + + self.writer = IndexWriter(dict_writer) + return self.writer + + def get_reader(self): + + "Return a reader for the index." + + if not exists(self.pathname): + raise OSError, "Index path %r does not exist." % self.pathname + + tdf = open(join(self.pathname, "terms"), "rb") + info_reader = TermReader(tdf) + + tdif = open(join(self.pathname, "index"), "rb") + index_reader = TermIndexReader(tdif) + + tpf = open(join(self.pathname, "positions"), "rb") + positions_reader = PositionReader(tpf) + + self.reader = TermDictionaryReader(info_reader, index_reader, positions_reader) + return self.reader + + def close(self): + if self.reader is not None: + self.reader.close() + self.reader = None + if self.writer is not None: + self.writer.close() + self.writer = None # vim: tabstop=4 expandtab shiftwidth=4 diff -r 736ff8a2e022 -r 42cc066da2fd test.py --- a/test.py Thu Aug 27 00:02:50 2009 +0200 +++ b/test.py Thu Aug 27 20:52:48 2009 +0200 @@ -192,29 +192,17 @@ ("sea", [(36, [2, 6])]) ] -f = open("test", "wb") -w = iixr.TermWriter(f) -f2 = open("testI", "wb") -w2 = iixr.TermIndexWriter(f2) -f3 = open("testP", "wb") -w3 = iixr.PositionWriter(f3) -wd = iixr.TermDictionaryWriter(w, w2, w3, 3) -wi = iixr.IndexWriter(wd) +index = iixr.Index("test_index") +wi = index.get_writer(3) for docnum, text in docs: for position, term in enumerate(text.split()): 
wi.add_position(term, docnum, position) wi.close() -f = open("test", "rb") -r = iixr.TermReader(f) -f2 = open("testI", "rb") -r2 = iixr.TermIndexReader(f2) -f3 = open("testP", "rb") -r3 = iixr.PositionReader(f3) -rd = iixr.TermDictionaryReader(r, r2, r3) +rd = index.get_reader() for term, doc_positions in doc_tests: dp = rd.find_positions(term) print doc_positions == dp, doc_positions, dp -rd.close() +index.close() # vim: tabstop=4 expandtab shiftwidth=4