# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1250711368 -7200
# Node ID 9b4a09a84e669dcbed54320c00fc77f3d7c33b9d

The fundamental aspects of an inverted indexer.

diff -r 000000000000 -r 9b4a09a84e66 iixr.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iixr.py	Wed Aug 19 21:49:28 2009 +0200
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+
+"""
+A simple (and sane) text indexing library.
+"""
+
+# Foundation classes.
+
+class File:
+
+    "A basic file abstraction."
+
+    def __init__(self, f):
+        self.f = f
+        self.reset()
+
+    def reset(self):
+        pass
+
+    def close(self):
+        self.f.close()
+
+class FileWriter(File):
+
+    "Writing basic data types to files."
+
+    def write_number(self, number):
+
+        "Write 'number' to the file using a variable length encoding."
+
+        # Negative numbers are not supported.
+
+        if number < 0:
+            raise ValueError, "Number %r is negative." % number
+
+        # Special case: one byte containing zero.
+
+        elif number == 0:
+            self.f.write(chr(1) + chr(0))
+            return
+
+        # Write the number from least to most significant digits.
+
+        nbytes = 0
+        bytes = []
+
+        while number != 0:
+            lsd = number & 255
+            bytes.append(chr(lsd))
+            number = number >> 8
+            nbytes += 1
+
+        # Too large numbers are not supported.
+
+        if nbytes > 255:
+            raise ValueError, "Number %r is too large." % number
+
+        bytes.insert(0, chr(nbytes))
+        record = "".join(bytes)
+        self.f.write(record)
+
+class FileReader(File):
+
+    "Reading basic data types from files."
+
+    def read_number(self):
+
+        "Read a number from the file."
+
+        nbytes = ord(self.f.read(1))
+
+        # Read each byte, adding it to the number.
+
+        bytes = self.f.read(nbytes)
+
+        i = 0
+        shift = 0
+        number = 0
+
+        while i < nbytes:
+            csd = ord(bytes[i])
+            number += (csd << shift)
+            shift += 8
+            i += 1
+
+        return number
+
+# Specific classes.
+
+class PositionWriter(FileWriter):
+
+    "Writing position information to files."
+
+    def reset(self):
+        self.last_docnum = 0
+
+    def write_positions(self, docnum, positions):
+
+        "Write for the document 'docnum' the given 'positions'."
+
+        if docnum < self.last_docnum:
+            raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
+
+        # Write the document number delta.
+
+        self.write_number(docnum - self.last_docnum)
+
+        # Write the number of positions.
+
+        self.write_number(len(positions))
+
+        # Write the position deltas.
+
+        last = 0
+        for position in positions:
+            pos = position - last
+            self.write_number(pos)
+            last = position
+
+        self.last_docnum = docnum
+
+    def write_all_positions(self, doc_positions):
+
+        """
+        Write all 'doc_positions' - a collection of tuples of the form (document
+        number, position list) - to the file, returning the offset at which they
+        were stored.
+        """
+
+        # Reset the writer and record the current file offset.
+
+        self.reset()
+        offset = self.f.tell()
+
+        # Write the number of documents.
+
+        self.write_number(len(doc_positions))
+
+        # Write the positions.
+
+        for docnum, positions in doc_positions:
+            self.write_positions(docnum, positions)
+
+        return offset
+
+class PositionReader(FileReader):
+
+    "Reading position information from files."
+
+    def reset(self):
+        self.last_docnum = 0
+
+    def read_positions(self):
+
+        "Read positions, returning a document number and a list of positions."
+
+        # Read the document number delta and add it to the last number.
+
+        self.last_docnum += self.read_number()
+
+        # Read the number of positions.
+
+        npositions = self.read_number()
+
+        # Read the position deltas, adding each previous position to get the
+        # appropriate collection of absolute positions.
+
+        i = 0
+        last = 0
+        positions = []
+
+        while i < npositions:
+            last += self.read_number()
+            positions.append(last)
+            i += 1
+
+        return self.last_docnum, positions
+
+    def read_all_positions(self, offset):
+
+        """
+        Read all positions from 'offset', seeking to that position in the file
+        before reading.
+        """
+
+        self.reset()
+        self.f.seek(offset)
+
+        # Read the number of documents.
+
+        ndocuments = self.read_number()
+
+        # Read all records.
+
+        i = 0
+        doc_positions = []
+
+        while i < ndocuments:
+            doc_positions.append(self.read_positions())
+            i += 1
+
+        return doc_positions
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r 000000000000 -r 9b4a09a84e66 test.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test.py	Wed Aug 19 21:49:28 2009 +0200
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+import iixr
+
+numbers = [12345678, 0, 1]
+
+f = open("test", "wb")
+w = iixr.FileWriter(f)
+for number in numbers:
+    w.write_number(number)
+w.close()
+
+f = open("test", "r")
+r = iixr.FileReader(f)
+for number in numbers:
+    n = r.read_number()
+    print number == n, number, n
+r.close()
+
+all_doc_positions = [
+    [
+        (123, [1, 3, 5, 15, 25]),
+        (124, [0, 100])
+    ],
+    [
+        (78, [9]),
+        (196, [10, 11])
+    ]
+    ]
+
+f = open("test", "wb")
+w = iixr.PositionWriter(f)
+for doc_positions in all_doc_positions:
+    for docnum, positions in doc_positions:
+        w.write_positions(docnum, positions)
+    w.reset()
+w.close()
+
+f = open("test", "r")
+r = iixr.PositionReader(f)
+for doc_positions in all_doc_positions:
+    for docnum, positions in doc_positions:
+        d, p = r.read_positions()
+        print docnum == d, docnum, d
+        print positions == p, positions, p
+    r.reset()
+r.close()
+
+f = open("test", "wb")
+w = iixr.PositionWriter(f)
+offsets = []
+for doc_positions in all_doc_positions:
+    offsets.append(
+        w.write_all_positions(doc_positions)
+        )
+w.close()
+
+f = open("test", "r")
+r = iixr.PositionReader(f)
+offsets.reverse()
+all_doc_positions.reverse()
+for offset, doc_positions in zip(offsets, all_doc_positions):
+    dp = r.read_all_positions(offset)
+    print doc_positions == dp, doc_positions, dp
+r.close()
+
+# vim: tabstop=4 expandtab shiftwidth=4