iixr

Changeset

90:fc0e9882717b
2011-02-08 Paul Boddie [raw | files | shortlog | changelog | graph]
Moved the record handling into reset methods in order to have records encompass entire "pages" of stored data, rather than individual entries. Changed the term dictionary index to refer to the start of each "page" of term dictionary entries rather than the second entry. This is done so that the entire "page" or record can be loaded when such a "page" is requested, although it effectively prohibits direct traversal of the term dictionary without reference to the term dictionary index. Introduced a test for array exhaustion when reading variable-length integers from a particular starting position.
iixr/data.py (file) iixr/fields.py (file) iixr/files.py (file) iixr/positions.py (file) iixr/terms.py (file)
     1.1 --- a/iixr/data.py	Mon Feb 07 02:05:38 2011 +0100
     1.2 +++ b/iixr/data.py	Tue Feb 08 00:08:27 2011 +0100
     1.3 @@ -150,8 +150,11 @@
     1.4      number.
     1.5      """
     1.6  
     1.7 +    length = len(bytes)
     1.8 +    if start == length:
     1.9 +        raise EOFError
    1.10 +
    1.11      number = 0
    1.12 -    length = len(bytes)
    1.13      digit = 0
    1.14      while start < length:
    1.15          x = bytes[start]
     2.1 --- a/iixr/fields.py	Mon Feb 07 02:05:38 2011 +0100
     2.2 +++ b/iixr/fields.py	Tue Feb 08 00:08:27 2011 +0100
     2.3 @@ -29,6 +29,7 @@
     2.4      "Writing field data to files."
     2.5  
     2.6      def reset(self):
     2.7 +        self.end_record()
     2.8          self.last_docnum = None
     2.9          self.subtractor = None
    2.10  
    2.11 @@ -47,8 +48,6 @@
    2.12              self.subtractor = get_subtractor(docnum)
    2.13              docnum_seq = docnum
    2.14  
    2.15 -        self.begin_record()
    2.16 -
    2.17          # Write the document number.
    2.18  
    2.19          self.write_sequence_value(docnum_seq)
    2.20 @@ -63,8 +62,6 @@
    2.21              self.write_number(i)
    2.22              self.write_string(field, 1) # compress
    2.23  
    2.24 -        self.end_record()
    2.25 -
    2.26          self.last_docnum = docnum
    2.27  
    2.28  class FieldReader(FileReader):
    2.29 @@ -74,6 +71,7 @@
    2.30      def reset(self):
    2.31          self.last_docnum = None
    2.32          self.adder = None
    2.33 +        self.begin_record()
    2.34  
    2.35      def read_fields(self):
    2.36  
    2.37 @@ -82,8 +80,6 @@
    2.38          number and a list of field (identifier, value) pairs.
    2.39          """
    2.40  
    2.41 -        self.begin_record()
    2.42 -
    2.43          # Read the document number.
    2.44  
    2.45          docnum = self.read_sequence_value()
    2.46 @@ -109,8 +105,6 @@
    2.47              fields.append((identifier, value))
    2.48              i += 1
    2.49  
    2.50 -        self.end_record()
    2.51 -
    2.52          return self.last_docnum, fields
    2.53  
    2.54      def read_document_fields(self, docnum, offset):
    2.55 @@ -131,6 +125,7 @@
    2.56      "Writing field index details to files."
    2.57  
    2.58      def reset(self):
    2.59 +        self.end_record()
    2.60          self.last_docnum = None
    2.61          self.subtractor = None
    2.62          self.last_offset = 0
    2.63 @@ -150,8 +145,6 @@
    2.64              self.subtractor = get_subtractor(docnum)
    2.65              docnum_seq = docnum
    2.66  
    2.67 -        self.begin_record()
    2.68 -
    2.69          # Write the document number.
    2.70  
    2.71          self.write_sequence_value(docnum_seq)
    2.72 @@ -159,7 +152,6 @@
    2.73          # Write the offset delta.
    2.74  
    2.75          self.write_number(offset - self.last_offset)
    2.76 -        self.end_record()
    2.77  
    2.78          self.last_docnum = docnum
    2.79          self.last_offset = offset
    2.80 @@ -172,13 +164,12 @@
    2.81          self.last_docnum = None
    2.82          self.adder = None
    2.83          self.last_offset = 0
    2.84 +        self.begin_record()
    2.85  
    2.86      def read_document(self):
    2.87  
    2.88          "Read a document number and field file offset."
    2.89  
    2.90 -        self.begin_record()
    2.91 -
    2.92          # Read the document number.
    2.93  
    2.94          docnum = self.read_sequence_value()
    2.95 @@ -192,7 +183,6 @@
    2.96          # Read the offset.
    2.97  
    2.98          self.last_offset += self.read_number()
    2.99 -        self.end_record()
   2.100  
   2.101          return self.last_docnum, self.last_offset
   2.102  
   2.103 @@ -211,6 +201,7 @@
   2.104          "Write details of the document with the given 'docnum' and 'fields'."
   2.105  
   2.106          if self.entry % self.interval == 0:
   2.107 +            self.field_writer.reset()
   2.108              offset = self.field_writer.tell()
   2.109              self.field_writer.write_fields(docnum, fields)
   2.110              self.field_index_writer.write_document(docnum, offset)
   2.111 @@ -230,6 +221,7 @@
   2.112      def __init__(self, field_reader, field_index_reader):
   2.113          self.field_reader = field_reader
   2.114          self.field_index_reader = field_index_reader
   2.115 +        self.entry = 0
   2.116  
   2.117          self.cache = {}
   2.118          self.docs = []
   2.119 @@ -267,7 +259,17 @@
   2.120  
   2.121          "Return the next document number and fields."
   2.122  
   2.123 -        return self.field_reader.read_fields()
   2.124 +        try:
   2.125 +            return self.field_reader.read_fields()
   2.126 +        except EOFError:
   2.127 +            self.entry += 1
   2.128 +            try:
   2.129 +                found_docnum, offset = self.docs[self.entry]
   2.130 +            except IndexError:
   2.131 +                raise EOFError
   2.132 +            else:
   2.133 +                self.field_reader.reset()
   2.134 +                return self.field_reader.read_fields()
   2.135  
   2.136      # Random access methods.
   2.137  
     3.1 --- a/iixr/files.py	Mon Feb 07 02:05:38 2011 +0100
     3.2 +++ b/iixr/files.py	Tue Feb 08 00:08:27 2011 +0100
     3.3 @@ -58,15 +58,17 @@
     3.4      "Writing basic data types to files."
     3.5  
     3.6      def tell(self):
     3.7 +        # NOTE: Will not be accurate within the current record.
     3.8          return self.f.tell() + len(self.data)
     3.9  
    3.10      def begin_record(self):
    3.11          pass
    3.12  
    3.13      def end_record(self):
    3.14 -        vint_to_array(len(self.record), self.data)
    3.15 -        self.data += self.record
    3.16 -        self.record = array('B')
    3.17 +        if self.record:
    3.18 +            vint_to_array(len(self.record), self.data)
    3.19 +            self.data += self.record
    3.20 +            self.record = array('B')
    3.21  
    3.22      def write_number(self, number):
    3.23  
    3.24 @@ -132,6 +134,7 @@
    3.25  
    3.26      def flush(self):
    3.27          if self.f is not None:
    3.28 +            self.end_record()
    3.29              self.data.tofile(self.f)
    3.30              self.data = array('B')
    3.31  
    3.32 @@ -144,12 +147,16 @@
    3.33      "Reading basic data types from files."
    3.34  
    3.35      def begin_record(self):
    3.36 -        size = self.read_number_from_file()
    3.37 -        self.record.fromfile(self.f, size)
    3.38 +        self.record = array('B')
    3.39          self.start = 0
    3.40 +        try:
    3.41 +            size = self.read_number_from_file()
    3.42 +            self.record.fromfile(self.f, size)
    3.43 +        except EOFError:
    3.44 +            pass
    3.45  
    3.46      def end_record(self):
    3.47 -        self.record = array('B')
    3.48 +        pass
    3.49  
    3.50      def read_number_from_file(self):
    3.51  
     4.1 --- a/iixr/positions.py	Mon Feb 07 02:05:38 2011 +0100
     4.2 +++ b/iixr/positions.py	Tue Feb 08 00:08:27 2011 +0100
     4.3 @@ -26,6 +26,7 @@
     4.4      "Writing position information to files."
     4.5  
     4.6      def reset(self):
     4.7 +        self.end_record()
     4.8          self.last_docnum = None
     4.9          self.subtractor = None
    4.10  
    4.11 @@ -56,10 +57,8 @@
    4.12              self.subtractor = get_subtractor(docnum)
    4.13              docnum_seq = docnum
    4.14  
    4.15 -        self.begin_record()
    4.16          self.write_sequence_value(docnum_seq)
    4.17          self.write_monotonic_sequence(positions)
    4.18 -        self.end_record()
    4.19  
    4.20          self.last_docnum = docnum
    4.21  
    4.22 @@ -70,6 +69,7 @@
    4.23      def reset(self):
    4.24          self.last_docnum = None
    4.25          self.adder = None
    4.26 +        self.begin_record()
    4.27  
    4.28      def read_positions(self):
    4.29  
    4.30 @@ -77,8 +77,6 @@
    4.31          Read positions, returning a document number and a list of positions.
    4.32          """
    4.33  
    4.34 -        self.begin_record()
    4.35 -
    4.36          # Read the document number.
    4.37  
    4.38          docnum = self.read_sequence_value()
    4.39 @@ -95,7 +93,6 @@
    4.40              self.last_docnum = docnum
    4.41  
    4.42          positions = self.read_monotonic_sequence()
    4.43 -        self.end_record()
    4.44  
    4.45          return self.last_docnum, positions
    4.46  
    4.47 @@ -104,6 +101,7 @@
    4.48      "Writing position index information to files."
    4.49  
    4.50      def reset(self):
    4.51 +        self.end_record()
    4.52          self.last_docnum = None
    4.53          self.subtractor = None
    4.54          self.last_pos_offset = 0
    4.55 @@ -123,11 +121,9 @@
    4.56              self.subtractor = get_subtractor(docnum)
    4.57              docnum_seq = docnum
    4.58  
    4.59 -        self.begin_record()
    4.60          self.write_sequence_value(docnum_seq)
    4.61          self.write_number(pos_offset - self.last_pos_offset)
    4.62          self.write_number(count)
    4.63 -        self.end_record()
    4.64  
    4.65          self.last_docnum = docnum
    4.66          self.last_pos_offset = pos_offset
    4.67 @@ -140,6 +136,7 @@
    4.68          self.last_docnum = None
    4.69          self.adder = None
    4.70          self.last_pos_offset = 0
    4.71 +        self.begin_record()
    4.72  
    4.73      def read_positions(self):
    4.74  
    4.75 @@ -148,8 +145,6 @@
    4.76          file, and the number of documents in a section of that file.
    4.77          """
    4.78  
    4.79 -        self.begin_record()
    4.80 -
    4.81          # Read the document number.
    4.82  
    4.83          docnum = self.read_sequence_value()
    4.84 @@ -167,7 +162,6 @@
    4.85          # Read the document count.
    4.86  
    4.87          count = self.read_number()
    4.88 -        self.end_record()
    4.89  
    4.90          return self.last_docnum, self.last_pos_offset, count
    4.91  
    4.92 @@ -339,14 +333,14 @@
    4.93  
    4.94                      self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
    4.95  
    4.96 -                    first_offset = self.position_writer.tell()
    4.97 -                    first_docnum = None
    4.98 -
    4.99                      # Reset the position writer so that position readers accessing
   4.100                      # a section start with the correct document number.
   4.101  
   4.102                      self.position_writer.reset()
   4.103  
   4.104 +                    first_offset = self.position_writer.tell()
   4.105 +                    first_docnum = None
   4.106 +
   4.107              # Finish writing an index entry for the remaining documents.
   4.108  
   4.109              else:
     5.1 --- a/iixr/terms.py	Mon Feb 07 02:05:38 2011 +0100
     5.2 +++ b/iixr/terms.py	Tue Feb 08 00:08:27 2011 +0100
     5.3 @@ -29,6 +29,7 @@
     5.4      "Writing term information to files."
     5.5  
     5.6      def reset(self):
     5.7 +        self.end_record()
     5.8          self.last_term = ""
     5.9          self.last_offset = 0
    5.10  
    5.11 @@ -40,14 +41,6 @@
    5.12          term information file.
    5.13          """
    5.14  
    5.15 -        self.begin_record()
    5.16 -        self._write_term(term, offset, frequency, doc_frequency)
    5.17 -        self.end_record()
    5.18 -
    5.19 -    def _write_term(self, term, offset, frequency, doc_frequency):
    5.20 -
    5.21 -        "Performs the term writing for 'write_term'."
    5.22 -
    5.23          if term <= self.last_term:
    5.24              raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
    5.25  
    5.26 @@ -79,6 +72,7 @@
    5.27      def reset(self):
    5.28          self.last_term = ""
    5.29          self.last_offset = 0
    5.30 +        self.begin_record()
    5.31  
    5.32      def read_term(self):
    5.33  
    5.34 @@ -87,16 +81,6 @@
    5.35          frequency from the term information file.
    5.36          """
    5.37  
    5.38 -        self.begin_record()
    5.39 -        try:
    5.40 -            return self._read_term()
    5.41 -        finally:
    5.42 -            self.end_record()
    5.43 -
    5.44 -    def _read_term(self):
    5.45 -
    5.46 -        "Performs the term reading for 'read_term'."
    5.47 -
    5.48          # Read the prefix length and term suffix.
    5.49  
    5.50          common = self.read_number()
    5.51 @@ -145,13 +129,11 @@
    5.52          'info_offset' in the term information file.
    5.53          """
    5.54  
    5.55 -        self.begin_record()
    5.56 -        TermWriter._write_term(self, term, offset, frequency, doc_frequency)
    5.57 +        TermWriter.write_term(self, term, offset, frequency, doc_frequency)
    5.58  
    5.59          # Write the information file offset delta.
    5.60  
    5.61          self.write_number(info_offset - self.last_info_offset)
    5.62 -        self.end_record()
    5.63  
    5.64          self.last_info_offset = info_offset
    5.65  
    5.66 @@ -171,13 +153,11 @@
    5.67          index file.
    5.68          """
    5.69  
    5.70 -        self.begin_record()
    5.71 -        term, offset, frequency, doc_frequency = TermReader._read_term(self)
    5.72 +        term, offset, frequency, doc_frequency = TermReader.read_term(self)
    5.73  
    5.74          # Read the offset delta.
    5.75  
    5.76          self.last_info_offset += self.read_number()
    5.77 -        self.end_record()
    5.78  
    5.79          return term, offset, frequency, doc_frequency, self.last_info_offset
    5.80  
    5.81 @@ -197,16 +177,16 @@
    5.82          """
    5.83          Write the given 'term', its position file 'offset', its 'frequency' and
    5.84          its 'doc_frequency' (number of documents in which it appears) to the
    5.85 -        term information file. Return the offset after the term information was
    5.86 +        term information file. Return the offset before the term information was
    5.87          written to the file.
    5.88          """
    5.89  
    5.90 -        self.info_writer.write_term(term, offset, frequency, doc_frequency)
    5.91 -
    5.92          if self.entry % self.interval == 0:
    5.93 +            self.info_writer.reset()
    5.94              info_offset = self.info_writer.tell()
    5.95              self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
    5.96  
    5.97 +        self.info_writer.write_term(term, offset, frequency, doc_frequency)
    5.98          self.entry += 1
    5.99  
   5.100      def write_term_positions(self, term, doc_positions):
   5.101 @@ -236,6 +216,7 @@
   5.102          self.info_reader = info_reader
   5.103          self.index_reader = index_reader
   5.104          self.position_dict_reader = position_dict_reader
   5.105 +        self.entry = 0
   5.106  
   5.107          self.terms = []
   5.108          try:
   5.109 @@ -269,8 +250,10 @@
   5.110          # as the closest.
   5.111  
   5.112          if i == -1:
   5.113 +            self.entry = 0
   5.114              return self.terms[0]
   5.115          else:
   5.116 +            self.entry = i
   5.117              return self.terms[i]
   5.118  
   5.119      def _find_closest_term(self, term):
   5.120 @@ -297,10 +280,11 @@
   5.121          # and scan for the desired term.
   5.122  
   5.123          else:
   5.124 -            self.info_reader.go_to_term(found_term, offset, info_offset)
   5.125 +            # Reset the term and offset for the new page.
   5.126 +            self.info_reader.go_to_term("", 0, info_offset)
   5.127              try:
   5.128                  while term > found_term:
   5.129 -                    found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
   5.130 +                    found_term, offset, frequency, doc_frequency = self._read_term()
   5.131              except EOFError:
   5.132                  pass
   5.133  
   5.134 @@ -355,6 +339,7 @@
   5.135      # Sequential access methods.
   5.136  
   5.137      def rewind(self):
   5.138 +        self.entry = 0
   5.139          self.info_reader.rewind()
   5.140  
   5.141      def read_term(self):
   5.142 @@ -364,8 +349,28 @@
   5.143          documents and positions at which the term is found.
   5.144          """
   5.145  
   5.146 -        term, offset, frequency, doc_frequency = self.info_reader.read_term()
   5.147 -        return self._get_term_and_positions(term, offset, frequency, doc_frequency)
   5.148 +        return self._get_term_and_positions(*self._read_term())
   5.149 +
   5.150 +    def _read_term(self):
   5.151 +
   5.152 +        try:
   5.153 +            term, offset, frequency, doc_frequency = self.info_reader.read_term()
   5.154 +        except EOFError:
   5.155 +            self.entry += 1
   5.156 +            try:
   5.157 +                term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry]
   5.158 +            except IndexError:
   5.159 +                raise EOFError
   5.160 +            else:
   5.161 +                # Reset the term and offset for the new page.
   5.162 +
   5.163 +                self.info_reader.go_to_term("", 0, info_offset)
   5.164 +
   5.165 +                # Skip the term in the information file.
   5.166 +
   5.167 +                self.info_reader.read_term()
   5.168 +
   5.169 +        return term, offset, frequency, doc_frequency
   5.170  
   5.171      def go_to_term(self, term):
   5.172  
   5.173 @@ -380,7 +385,14 @@
   5.174          # Position the reader, if necessary.
   5.175  
   5.176          if info_offset is not None:
   5.177 -            self.info_reader.go_to_term(found_term, offset, info_offset)
   5.178 +
   5.179 +            # Reset the term and offset for the new page.
   5.180 +
   5.181 +            self.info_reader.go_to_term("", 0, info_offset)
   5.182 +
   5.183 +            # Skip the term in the information file.
   5.184 +
   5.185 +            self.info_reader.read_term()
   5.186  
   5.187          return found_term, offset, frequency, doc_frequency
   5.188  
   5.189 @@ -407,7 +419,7 @@
   5.190  
   5.191              while found_term.startswith(term):
   5.192                  terms.append(found_term)
   5.193 -                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
   5.194 +                found_term, offset, frequency, doc_frequency = self._read_term()
   5.195  
   5.196          except EOFError:
   5.197              pass