# HG changeset patch # User Paul Boddie # Date 1250711368 -7200 # Node ID 9b4a09a84e669dcbed54320c00fc77f3d7c33b9d The fundamental aspects of an inverted indexer. diff -r 000000000000 -r 9b4a09a84e66 iixr.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/iixr.py Wed Aug 19 21:49:28 2009 +0200 @@ -0,0 +1,204 @@ +#!/usr/bin/env python + +""" +A simple (and sane) text indexing library. +""" + +# Foundation classes. + +class File: + + "A basic file abstraction." + + def __init__(self, f): + self.f = f + self.reset() + + def reset(self): + pass + + def close(self): + self.f.close() + +class FileWriter(File): + + "Writing basic data types to files." + + def write_number(self, number): + + "Write 'number' to the file using a variable length encoding." + + # Negative numbers are not supported. + + if number < 0: + raise ValueError, "Number %r is negative." % number + + # Special case: one byte containing zero. + + elif number == 0: + self.f.write(chr(1) + chr(0)) + return + + # Write the number from least to most significant digits. + + nbytes = 0 + bytes = [] + + while number != 0: + lsd = number & 255 + bytes.append(chr(lsd)) + number = number >> 8 + nbytes += 1 + + # Too large numbers are not supported. + + if nbytes > 255: + raise ValueError, "Number %r is too large." % number + + bytes.insert(0, chr(nbytes)) + record = "".join(bytes) + self.f.write(record) + +class FileReader(File): + + "Reading basic data types from files." + + def read_number(self): + + "Read a number from the file." + + nbytes = ord(self.f.read(1)) + + # Read each byte, adding it to the number. + + bytes = self.f.read(nbytes) + + i = 0 + shift = 0 + number = 0 + + while i < nbytes: + csd = ord(bytes[i]) + number += (csd << shift) + shift += 8 + i += 1 + + return number + +# Specific classes. + +class PositionWriter(FileWriter): + + "Writing position information to files." + + def reset(self): + self.last_docnum = 0 + + def write_positions(self, docnum, positions): + + "Write for the document 'docnum' the given 'positions'." + + if docnum < self.last_docnum: + raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) + + # Write the document number delta. + + self.write_number(docnum - self.last_docnum) + + # Write the number of positions. + + self.write_number(len(positions)) + + # Write the position deltas. + + last = 0 + for position in positions: + pos = position - last + self.write_number(pos) + last = position + + self.last_docnum = docnum + + def write_all_positions(self, doc_positions): + + """ + Write all 'doc_positions' - a collection of tuples of the form (document + number, position list) - to the file, returning the offset at which they + were stored. + """ + + # Reset the writer and record the current file offset. + + self.reset() + offset = self.f.tell() + + # Write the number of documents. + + self.write_number(len(doc_positions)) + + # Write the positions. + + for docnum, positions in doc_positions: + self.write_positions(docnum, positions) + + return offset + +class PositionReader(FileReader): + + "Reading position information from files." + + def reset(self): + self.last_docnum = 0 + + def read_positions(self): + + "Read positions, returning a document number and a list of positions." + + # Read the document number delta and add it to the last number. + + self.last_docnum += self.read_number() + + # Read the number of positions. + + npositions = self.read_number() + + # Read the position deltas, adding each previous position to get the + # appropriate collection of absolute positions. + + i = 0 + last = 0 + positions = [] + + while i < npositions: + last += self.read_number() + positions.append(last) + i += 1 + + return self.last_docnum, positions + + def read_all_positions(self, offset): + + """ + Read all positions from 'offset', seeking to that position in the file + before reading. + """ + + self.reset() + self.f.seek(offset) + + # Read the number of documents. + + ndocuments = self.read_number() + + # Read all records. + + i = 0 + doc_positions = [] + + while i < ndocuments: + doc_positions.append(self.read_positions()) + i += 1 + + return doc_positions + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r 000000000000 -r 9b4a09a84e66 test.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test.py Wed Aug 19 21:49:28 2009 +0200 @@ -0,0 +1,67 @@ +#!/usr/bin/env python + +import iixr + +numbers = [12345678, 0, 1] + +f = open("test", "wb") +w = iixr.FileWriter(f) +for number in numbers: + w.write_number(number) +w.close() + +f = open("test", "r") +r = iixr.FileReader(f) +for number in numbers: + n = r.read_number() + print number == n, number, n +r.close() + +all_doc_positions = [ + [ + (123, [1, 3, 5, 15, 25]), + (124, [0, 100]) + ], + [ + (78, [9]), + (196, [10, 11]) + ] + ] + +f = open("test", "wb") +w = iixr.PositionWriter(f) +for doc_positions in all_doc_positions: + for docnum, positions in doc_positions: + w.write_positions(docnum, positions) + w.reset() +w.close() + +f = open("test", "r") +r = iixr.PositionReader(f) +for doc_positions in all_doc_positions: + for docnum, positions in doc_positions: + d, p = r.read_positions() + print docnum == d, docnum, d + print positions == p, positions, p + r.reset() +r.close() + +f = open("test", "wb") +w = iixr.PositionWriter(f) +offsets = [] +for doc_positions in all_doc_positions: + offsets.append( + w.write_all_positions(doc_positions) + ) +w.close() + +f = open("test", "r") +r = iixr.PositionReader(f) +offsets.reverse() +all_doc_positions.reverse() +for offset, doc_positions in zip(offsets, all_doc_positions): + dp = r.read_all_positions(offset) + print doc_positions == dp, doc_positions, dp +r.close() + +# vim: tabstop=4 expandtab shiftwidth=4