# HG changeset patch # User Paul Boddie # Date 1251414917 -7200 # Node ID 5e4746613f833ecb14c54547660ac704f11fae3e # Parent 42cc066da2fd477abde431f05d3aa71b73c4be99 Added field reading and writing, although compression should be enabled only where space is saved, and offsets to fields should be stored in an appropriate index file. In addition, document numbers should also be stored to permit scanning of the fields file in a manner similar to that done with the term dictionary. diff -r 42cc066da2fd -r 5e4746613f83 iixr.py --- a/iixr.py Thu Aug 27 20:52:48 2009 +0200 +++ b/iixr.py Fri Aug 28 01:15:17 2009 +0200 @@ -22,6 +22,7 @@ from os.path import exists, join from os.path import commonprefix # to find common string prefixes from bisect import bisect_right # to find terms in the dictionary index +import bz2 # for field compression # Constants. @@ -78,20 +79,26 @@ record = "".join(bytes) self.f.write(record) - def write_string(self, s): + def write_string(self, s, compress=0): - "Write 's' to the file, recording its length." + """ + Write 's' to the file, recording its length and compressing the string + if 'compress' is set to a true value. + """ # Convert Unicode objects to strings. if isinstance(s, unicode): s = s.encode("utf-8") - length = len(s) + # Compress the string if requested. - if not (0 <= length <= 255): - raise ValueError, "String %r is too long." % s + if compress: + s = bz2.compress(s) + # Write the length of the data before the data itself. + + length = len(s) self.write_number(length) self.f.write(s) @@ -123,15 +130,24 @@ return number - def read_string(self): + def read_string(self, decompress=0): - "Read a string from the file." + """ + Read a string from the file, decompressing the stored data if + 'decompress' is set to a true value. + """ length = self.read_number() + s = self.f.read(length) + + # Decompress the data if requested. + + if decompress: + s = bz2.decompress(s) # Convert strings to Unicode objects. 
- return unicode(self.f.read(length), "utf-8") + return unicode(s, "utf-8") # Specific classes. @@ -484,6 +500,62 @@ self.index_reader.close() self.position_reader.close() +class FieldWriter(FileWriter): + + "Writing field data to files." + + def write_fields(self, fields): + + """ + Write the given list of 'fields' (strings representing field values). + Return the offset at which the fields are stored. + """ + + offset = self.f.tell() + + # Write the number of fields. + + self.write_number(len(fields)) + + # Write the fields themselves. + + for field in fields: + self.write_string(field, 0) # compress flag (currently disabled) + + return offset + +class FieldReader(FileReader): + + "Reading field data from files." + + def read_fields(self): + + "Read fields from the file, returning the field values in a list." + + # Read the number of fields. + + nfields = self.read_number() + + # Collect the fields. + + fields = [] + i = 0 + + while i < nfields: + fields.append(self.read_string(0)) # decompress flag (currently disabled) + i += 1 + + return fields + + def read_doc_fields(self, offset): + + "Read all fields at the given 'offset'." + + self.f.seek(offset) + return self.read_fields() + +# High-level classes. + class IndexWriter: "Building term information and writing it to the term dictionary." 
diff -r 42cc066da2fd -r 5e4746613f83 test.py --- a/test.py Thu Aug 27 20:52:48 2009 +0200 +++ b/test.py Fri Aug 28 01:15:17 2009 +0200 @@ -64,6 +64,30 @@ print doc_positions == dp, doc_positions, dp r.close() +doc_fields = [ + ["testing", "fields", "stored", "compressed"], + ["fields", "for a second", "document"] + ] + +f = open("testF", "wb") +w = iixr.FieldWriter(f) +offsets = [] +for fields in doc_fields: + offsets.append(w.write_fields(fields)) +w.close() + +f = open("testF", "rb") +r = iixr.FieldReader(f) +for fields in doc_fields: + df = r.read_fields() + print fields == df, fields, df +offsets.reverse() +doc_fields.reverse() +for offset, fields in zip(offsets, doc_fields): + df = r.read_doc_fields(offset) + print fields == df, fields, df +r.close() + terms = [ ("aardvark", 100000123), ("anteater", 100000456),