iixr (file iixr.py at 0ba1bf2fa563)

     1 #!/usr/bin/env python     2      3 """     4 A simple (and sane) text indexing library.     5      6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from os import listdir, mkdir    # index and partition discovery    22 from os.path import exists, join    23 from os.path import commonprefix # to find common string prefixes    24 from bisect import bisect_right  # to find terms in the dictionary index    25 from bisect import insort_right  # to maintain a sorted list of data for merging    26 import bz2, zlib                 # for field compression    27     28 # Constants.    29     30 INTERVAL = 100    31 FLUSH_INTERVAL = 1000000    32     33 compressors = [("b", bz2.compress), ("z", zlib.compress)]    34 decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}    35     36 # Foundation classes.    37     38 class File:    39     40     "A basic file abstraction."    41     42     def __init__(self, f):    43         self.f = f    44         self.reset()    45     46     def reset(self):    47     48         "To be used to reset the state of the reader or writer between records."    49     50         pass    51     52     def rewind(self):    53         self.f.seek(0)    54     55     def close(self):    56         if self.f is not None:    57             self.f.close()    58             self.f = None    59     60 class FileWriter(File):    61     62     "Writing basic data types to files."    63     64     def write_number(self, number):    65     66         "Write 'number' to the file using a variable length encoding."    67     68         # Negative numbers are not supported.    69     70         if number < 0:    71             raise ValueError, "Number %r is negative." % number    72     73         # Special case: one byte containing zero.    74     75         elif number == 0:    76             self.f.write(chr(0))    77             return    78     79         # Write the number from least to most significant digits.    80     81         bytes = []    82     83         while number != 0:    84             lsd = number & 127    85             number = number >> 7    86             if number != 0:    87                 lsd |= 128    88             bytes.append(chr(lsd))    89     90         record = "".join(bytes)    91         self.f.write(record)    92     93     def write_string(self, s, compress=0):    94     95         """    96         Write 's' to the file, recording its length and compressing the string    97         if 'compress' is set to a true value.    98         """    99    100         # Convert Unicode objects to strings.   101    102         if isinstance(s, unicode):   103             s = s.encode("utf-8")   104    105         # Compress the string if requested.   106    107         if compress:   108             for flag, fn in compressors:   109                 cs = fn(s)   110    111                 # Take the first string shorter than the original.   112    113                 if len(cs) < len(s):   114                     s = cs   115                     break   116             else:   117                 flag = "-"   118    119             # Record whether compression was used.   120    121             self.f.write(flag)   122    123         # Write the length of the data before the data itself.   124    125         length = len(s)   126         self.write_number(length)   127         self.f.write(s)   128    129 class FileReader(File):   130    131     "Reading basic data types from files."   132    133     def read_number(self):   134    135         "Read a number from the file."   136    137         # Read each byte, adding it to the number.   138    139         shift = 0   140         number = 0   141         more = 1   142    143         while more:   144             byte = self.f.read(1)   145             if not byte:   146                 raise EOFError   147    148             csd = ord(byte)   149             more = csd & 128 != 0   150             if more:   151                 csd &= 127   152             number += (csd << shift)   153             shift += 7   154    155         return number   156    157     def read_string(self, decompress=0):   158    159         """   160         Read a string from the file, decompressing the stored data if   161         'decompress' is set to a true value.   162         """   163    164         # Decompress the data if requested.   165    166         if decompress:   167             flag = self.f.read(1)   168         else:   169             flag = "-"   170    171         length = self.read_number()   172         s = self.f.read(length)   173    174         # Perform decompression if applicable.   175    176         if flag != "-":   177             fn = decompressors[flag]   178             s = fn(s)   179    180         # Convert strings to Unicode objects.   181    182         return unicode(s, "utf-8")   183    184 # Specific classes for storing term and position information.   185    186 class PositionWriter(FileWriter):   187    188     "Writing position information to files."   189    190     def reset(self):   191         self.last_docnum = 0   192    193     def write_positions(self, docnum, positions):   194    195         "Write for the document 'docnum' the given 'positions'."   196    197         if docnum < self.last_docnum:   198             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)   199    200         # Write the document number delta.   201    202         self.write_number(docnum - self.last_docnum)   203    204         # Write the number of positions.   205    206         self.write_number(len(positions))   207    208         # Make sure that the positions are sorted.   209    210         positions.sort()   211    212         # Write the position deltas.   213    214         last = 0   215         for position in positions:   216             pos = position - last   217             self.write_number(pos)   218             last = position   219    220         self.last_docnum = docnum   221    222     def write_term_positions(self, doc_positions):   223    224         """   225         Write all 'doc_positions' - a collection of tuples of the form (document   226         number, position list) - to the file, returning a tuple containing the   227         offset at which they were stored together with the frequency (number of   228         positions) for the term involved.   229         """   230    231         # Reset the writer and record the current file offset.   232    233         self.reset()   234         offset = self.f.tell()   235    236         # Write the number of documents.   237    238         self.write_number(len(doc_positions))   239    240         # Write the positions.   241    242         frequency = 0   243    244         for docnum, positions in doc_positions:   245             self.write_positions(docnum, positions)   246             frequency += len(positions)   247    248         return offset, frequency   249    250 class PositionReader(FileReader):   251    252     "Reading position information from files."   253    254     def reset(self):   255         self.last_docnum = 0   256    257     def read_positions(self):   258    259         "Read positions, returning a document number and a list of positions."   260    261         # Read the document number delta and add it to the last number.   262    263         self.last_docnum += self.read_number()   264    265         # Read the number of positions.   266    267         npositions = self.read_number()   268    269         # Read the position deltas, adding each previous position to get the   270         # appropriate collection of absolute positions.   271    272         i = 0   273         last = 0   274         positions = []   275    276         while i < npositions:   277             last += self.read_number()   278             positions.append(last)   279             i += 1   280    281         return self.last_docnum, positions   282    283     def read_term_positions(self, offset):   284    285         """   286         Read all positions from 'offset', seeking to that position in the file   287         before reading.   288         """   289    290         self.reset()   291         self.f.seek(offset)   292    293         # Read the number of documents.   294    295         ndocuments = self.read_number()   296    297         # Read all records.   298    299         i = 0   300         doc_positions = []   301    302         while i < ndocuments:   303             doc_positions.append(self.read_positions())   304             i += 1   305    306         return doc_positions   307    308 class TermWriter(FileWriter):   309    310     "Writing term information to files."   311    312     def reset(self):   313         self.last_term = ""   314         self.last_offset = 0   315    316     def write_term(self, term, offset, frequency):   317    318         """   319         Write the given 'term', its position file 'offset', and its 'frequency'   320         to the term information file. Return the offset after the term   321         information was written to the file.   322         """   323    324         # Too long terms are not currently supported.   325    326         if len(term) > 255:   327             raise ValueError, "Term %r is too long." % term   328    329         # Write the prefix length and term suffix.   330    331         common = len(commonprefix([self.last_term, term]))   332         suffix = term[common:]   333    334         self.write_number(common)   335         self.write_string(suffix)   336    337         # Write the offset delta.   338    339         self.write_number(offset - self.last_offset)   340    341         # Write the frequency.   342    343         self.write_number(frequency)   344    345         self.last_term = term   346         self.last_offset = offset   347    348         return self.f.tell()   349    350 class TermReader(FileReader):   351    352     "Reading term information from files."   353    354     def reset(self):   355         self.last_term = ""   356         self.last_offset = 0   357    358     def rewind(self):   359         self.reset()   360         FileReader.rewind(self)   361    362     def read_term(self):   363    364         """   365         Read a term, its position file offset, and its frequency from the term   366         information file.   367         """   368    369         # Read the prefix length and term suffix.   370    371         common = self.read_number()   372         suffix = self.read_string()   373    374         self.last_term = self.last_term[:common] + suffix   375    376         # Read the offset delta.   377    378         self.last_offset += self.read_number()   379    380         # Read the frequency.   381    382         frequency = self.read_number()   383    384         return self.last_term, self.last_offset, frequency   385    386     def go_to_term(self, term, offset, info_offset):   387    388         """   389         Seek past the entry for 'term' having 'offset' to 'info_offset'. This   390         permits the scanning for later terms from the specified term.   391         """   392    393         self.f.seek(info_offset)   394         self.last_term = term   395         self.last_offset = offset   396    397 class TermIndexWriter(TermWriter):   398    399     "Writing term dictionary index details to files."   400    401     def reset(self):   402         TermWriter.reset(self)   403         self.last_info_offset = 0   404    405     def write_term(self, term, offset, frequency, info_offset):   406    407         """   408         Write the given 'term', its position file 'offset', and its 'frequency'   409         to the term dictionary index file, along with the 'info_offset' in the   410         term information file.   411         """   412    413         TermWriter.write_term(self, term, offset, frequency)   414    415         # Write the information file offset delta.   416    417         self.write_number(info_offset - self.last_info_offset)   418         self.last_info_offset = info_offset   419    420 class TermIndexReader(TermReader):   421    422     "Reading term dictionary index details from files."   423    424     def reset(self):   425         TermReader.reset(self)   426         self.last_info_offset = 0   427    428     def read_term(self):   429    430         """   431         Read a term, its position file offset, its frequency, and its term   432         information file offset from the term dictionary index file.   433         """   434    435         term, offset, frequency = TermReader.read_term(self)   436    437         # Read the offset delta.   438    439         self.last_info_offset += self.read_number()   440    441         return term, offset, frequency, self.last_info_offset   442    443 class TermDictionaryWriter:   444    445     "Writing term dictionaries."   446    447     def __init__(self, info_writer, index_writer, position_writer, interval):   448         self.info_writer = info_writer   449         self.index_writer = index_writer   450         self.position_writer = position_writer   451         self.interval = interval   452         self.entry = 0   453    454     def _write_term(self, term, offset, frequency):   455    456         """   457         Write the given 'term', its position file 'offset', and its 'frequency'   458         to the term information file and optionally to the index, making a   459         dictionary entry.   460         """   461    462         info_offset = self.info_writer.write_term(term, offset, frequency)   463    464         if self.entry % self.interval == 0:   465             self.index_writer.write_term(term, offset, frequency, info_offset)   466    467         self.entry += 1   468    469     def write_term_positions(self, term, doc_positions):   470    471         """   472         Write the given 'term' and the 'doc_positions' recording the documents   473         and positions at which the term is found.   474         """   475    476         offset, frequency = self.position_writer.write_term_positions(doc_positions)   477         self._write_term(term, offset, frequency)   478    479     def close(self):   480         self.info_writer.close()   481         self.index_writer.close()   482         self.position_writer.close()   483    484 class TermDictionaryReader:   485    486     "Reading term dictionaries."   487    488     def __init__(self, info_reader, index_reader, position_reader):   489         self.info_reader = info_reader   490         self.index_reader = index_reader   491         self.position_reader = position_reader   492    493         self.terms = []   494         try:   495             while 1:   496                 self.terms.append(self.index_reader.read_term())   497         except EOFError:   498             pass   499    500         # Large numbers for ordering purposes.   501    502         self.max_offset = self.terms[-1][1]   503         self.max_info_offset = self.terms[-1][2]   504    505     def _find_term(self, term):   506    507         """   508         Find the position file offset and frequency of 'term' from the term   509         dictionary.   510         """   511    512         i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1   513    514         # Get the entry position providing the term or one preceding it.   515    516         if i == -1:   517             return None   518    519         found_term, offset, frequency, info_offset = self.terms[i]   520    521         # Where the term is found immediately, return the offset.   522    523         if term == found_term:   524             return offset, frequency   525    526         # Otherwise, seek past the index term's entry in the information file   527         # and scan for the desired term.   528    529         else:   530             self.info_reader.go_to_term(found_term, offset, info_offset)   531             try:   532                 while term > found_term:   533                     found_term, offset, frequency = self.info_reader.read_term()   534             except EOFError:   535                 pass   536    537             # If the term is found, return the offset and frequency.   538    539             if term == found_term:   540                 return offset, frequency   541             else:   542                 return None   543    544     def rewind(self):   545         self.info_reader.rewind()   546    547     def _get_positions(self, offset):   548         return self.position_reader.read_term_positions(offset)   549    550     def read_term(self):   551    552         """   553         Return the next term, its frequency and the documents and positions at   554         which the term is found.   555         """   556    557         term, offset, frequency = self.info_reader.read_term()   558         positions = self._get_positions(offset)   559         return term, frequency, positions   560    561     def find_positions(self, term):   562    563         "Return the documents and positions at which the given 'term' is found."   564    565         t = self._find_term(term)   566         if t is None:   567             return None   568         else:   569             offset, frequency = t   570             return self._get_positions(offset)   571    572     def get_frequency(self, term):   573    574         "Return the frequency of the given 'term'."   575    576         t = self._find_term(term)   577         if t is None:   578             return None   579         else:   580             offset, frequency = t   581             return frequency   582    583     def close(self):   584         self.info_reader.close()   585         self.index_reader.close()   586         self.position_reader.close()   587    588 # Specific classes for storing document information.   589    590 class FieldWriter(FileWriter):   591    592     "Writing field data to files."   593    594     def reset(self):   595         self.last_docnum = 0   596    597     def write_fields(self, docnum, fields):   598    599         """   600         Write for the given 'docnum', a list of 'fields' (strings representing   601         field values). Return the offset at which the fields are stored.   602         """   603    604         offset = self.f.tell()   605    606         # Write the document number delta.   607    608         self.write_number(docnum - self.last_docnum)   609    610         # Write the number of fields.   611    612         self.write_number(len(fields))   613    614         # Write the fields themselves.   615    616         for field in fields:   617             self.write_string(field, 1) # compress   618    619         self.last_docnum = docnum   620         return offset   621    622 class FieldReader(FileReader):   623    624     "Reading field data from files."   625    626     def reset(self):   627         self.last_docnum = 0   628    629     def read_fields(self):   630    631         """   632         Read fields from the file, returning a tuple containing the document   633         number and a list of field values.   634         """   635    636         # Read the document number.   637    638         self.last_docnum += self.read_number()   639    640         # Read the number of fields.   641    642         nfields = self.read_number()   643    644         # Collect the fields.   645    646         fields = []   647         i = 0   648    649         while i < nfields:   650             fields.append(self.read_string(1)) # decompress   651             i += 1   652    653         return self.last_docnum, fields   654    655     def read_document_fields(self, docnum, offset):   656    657         """   658         Read fields for 'docnum' at the given 'offset'. This permits the   659         retrieval of details for the specified document, as well as scanning for   660         later documents.   661         """   662    663         self.f.seek(offset)   664         bad_docnum, fields = self.read_fields()   665         self.last_docnum = docnum   666         return docnum, fields   667    668 class FieldIndexWriter(FileWriter):   669    670     "Writing field index details to files."   671    672     def reset(self):   673         self.last_docnum = 0   674         self.last_offset = 0   675    676     def write_document(self, docnum, offset):   677    678         """   679         Write for the given 'docnum', the 'offset' at which the fields for the   680         document are stored in the fields file.   681         """   682    683         # Write the document number and offset deltas.   684    685         self.write_number(docnum - self.last_docnum)   686         self.write_number(offset - self.last_offset)   687    688         self.last_docnum = docnum   689         self.last_offset = offset   690    691 class FieldIndexReader(FileReader):   692    693     "Reading field index details from files."   694    695     def reset(self):   696         self.last_docnum = 0   697         self.last_offset = 0   698    699     def read_document(self):   700    701         "Read a document number and field file offset."   702    703         # Read the document number delta and offset.   704    705         self.last_docnum += self.read_number()   706         self.last_offset += self.read_number()   707    708         return self.last_docnum, self.last_offset   709    710 class FieldDictionaryWriter:   711    712     "Writing field dictionary details."   713    714     def __init__(self, field_writer, field_index_writer, interval):   715         self.field_writer = field_writer   716         self.field_index_writer = field_index_writer   717         self.interval = interval   718         self.entry = 0   719    720     def write_fields(self, docnum, fields):   721    722         "Write details of the document with the given 'docnum' and 'fields'."   723    724         offset = self.field_writer.write_fields(docnum, fields)   725    726         if self.entry % self.interval == 0:   727             self.field_index_writer.write_document(docnum, offset)   728    729         self.entry += 1   730    731     def close(self):   732         self.field_writer.close()   733         self.field_index_writer.close()   734    735 class FieldDictionaryReader:   736    737     "Reading field dictionary details."   738    739     def __init__(self, field_reader, field_index_reader):   740         self.field_reader = field_reader   741         self.field_index_reader = field_index_reader   742    743         self.docs = []   744         try:   745             while 1:   746                 self.docs.append(self.field_index_reader.read_document())   747         except EOFError:   748             pass   749    750         # Large numbers for ordering purposes.   751    752         self.max_offset = self.docs[-1][1]   753    754     def read_fields(self, docnum):   755    756         "Read the fields of the document with the given 'docnum'."   757    758         i = bisect_right(self.docs, (docnum, self.max_offset)) - 1   759    760         # Get the entry position providing the term or one preceding it.   761    762         if i == -1:   763             return None   764    765         found_docnum, offset = self.docs[i]   766    767         # Read from the fields file.   768    769         found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)   770    771         # Scan for the document, if necessary.   772    773         try:   774             while docnum > found_docnum:   775                 found_docnum, fields = self.field_reader.read_fields()   776         except EOFError:   777             pass   778    779         # If the document is found, return the fields.   780    781         if docnum == found_docnum:   782             return fields   783         else:   784             return None   785    786     def close(self):   787         self.field_reader.close()   788         self.field_index_reader.close()   789    790 # Dictionary merging classes.   791    792 class TermDictionaryMerger:   793    794     "Merge position files."   795    796     def __init__(self, writer, readers):   797         self.writer = writer   798         self.readers = readers   799    800     def merge(self):   801         entries = []   802    803         # Get the first entries from the readers.   804    805         for partition, reader in enumerate(self.readers):   806             reader.rewind()   807    808             try:   809                 term, frequency, positions = reader.read_term()   810                 insort_right(entries, (term, positions, partition))   811             except EOFError:   812                 pass   813    814         # While entries are available, write them out in order, merging where   815         # appropriate.   816    817         while entries:   818             term, doc_positions, partition = entries[0]   819             to_update = [partition]   820    821             nentries = len(entries)   822             i = 1   823    824             # Find other entries for the term.   825    826             while i < nentries:   827                 other_term, other_doc_positions, other_partition = entries[i]   828    829                 # For such entries, merge the positions.   830    831                 if other_term == term:   832                     doc_positions += other_doc_positions   833                     to_update.append(other_partition)   834                     i += 1   835                 else:   836                     break   837    838             # Write the combined term details.   839    840             doc_positions.sort()   841             self.writer.write_term_positions(term, doc_positions)   842    843             # Update the entries from the affected readers.   844    845             del entries[:i]   846    847             for partition in to_update:   848                 try:   849                     term, frequency, positions = self_readers[partition].read_term()   850                     insort_right(entries, (term, positions, partition))   851                 except EOFError:   852                     pass   853    854 # High-level classes.   855    856 class IndexWriter:   857    858     """   859     Building term information and writing it to the term and field dictionaries.   860     """   861    862     def __init__(self, pathname, interval, flush_interval):   863         self.pathname = pathname   864         self.interval = interval   865         self.flush_interval = flush_interval   866    867         self.dict_partition = 0   868         self.field_dict_partition = 0   869    870         self.terms = {}   871         self.docs = {}   872    873         self.position_counter = 0   874         self.field_counter = 0   875    876     def add_position(self, term, docnum, position):   877    878         """   879         Add a position entry for the given 'term' in the document with the given   880         'docnum', indicating the given 'position'.   881         """   882    883         if not self.terms.has_key(term):   884             doc_positions = self.terms[term] = {}   885         else:   886             doc_positions = self.terms[term]   887    888         if not doc_positions.has_key(docnum):   889             doc = doc_positions[docnum] = []   890         else:   891             doc = doc_positions[docnum]   892    893         doc.append(position)   894    895         self.position_counter += 1   896         if self.flush_threshold and self.position_counter >= self.flush_threshold:   897             self.flush_terms()   898    899     def add_fields(self, docnum, fields):   900    901         "Add for the document with the given 'docnum' a list of 'fields'."   902    903         if not self.docs.has_key(docnum):   904             doc_fields = self.docs[docnum] = fields   905         else:   906             self.docs[docnum] += fields   907    908         self.field_counter += len(fields)   909         if self.flush_threshold and self.field_counter >= self.flush_threshold:   910             self.flush_fields()   911    912     def get_term_writer(self):   913    914         "Return a term dictionary writer for the current partition."   915    916         tdf = open(join(self.pathname, "terms-%d" % self.dict_partition), "wb")   917         info_writer = TermWriter(tdf)   918    919         tdif = open(join(self.pathname, "index-%d" % self.dict_partition), "wb")   920         index_writer = TermIndexWriter(tdif)   921    922         tpf = open(join(self.pathname, "positions-%d" % self.dict_partition), "wb")   923         positions_writer = PositionWriter(tpf)   924    925         return TermDictionaryWriter(info_writer, index_writer, positions_writer, self.interval)   926    927     def get_field_writer(self):   928    929         "Return a field dictionary writer for the current partition."   930    931         ff = open(join(self.pathname, "fields-%d" % self.field_dict_partition), "wb")   932         field_writer = FieldWriter(ff)   933    934         fif = open(join(self.pathname, "fields_index-%d" % self.field_dict_partition), "wb")   935         field_index_writer = FieldIndexWriter(fif)   936    937         return FieldDictionaryWriter(field_writer, field_index_writer, self.interval)   938    939     def flush_terms(self):   940    941         "Flush terms into the current term dictionary partition."   942    943         # Get the terms in order.   944    945         terms = self.terms.items()   946         terms.sort()   947    948         dict_writer = self.get_term_writer()   949    950         for term, doc_positions in terms:   951             doc_positions = doc_positions.items()   952             doc_positions.sort()   953             dict_writer.write_term_positions(term, doc_positions)   954    955         dict_writer.close()   956    957         self.terms = {}   958         self.dict_partition += 1   959    960     def flush_fields(self):   961    962         "Flush fields into the current term dictionary partition."   963    964         # Get the documents in order.   965    966         docs = self.docs.items()   967         docs.sort()   968    969         field_dict_writer = self.get_field_writer()   970    971         for docnum, fields in docs:   972             field_dict_writer.write_fields(docnum, fields)   973    974         field_dict_writer.close()   975    976         self.docs = {}   977         self.field_dict_partition += 1   978    979     def close(self):   980         if self.terms:   981             self.flush_terms()   982         if self.docs:   983             self.flush_fields()   984    985 class IndexReader:   986    987     "Accessing the term and field dictionaries."   988    989     def __init__(self, pathname, partition=0):   990         self.pathname = pathname   991         self.dict_reader = self.get_term_reader(partition)   992         self.field_dict_reader = self.get_field_reader(partition)   993    994     def get_term_reader(self, partition):   995         tdf = open(join(self.pathname, "terms-%d" % partition), "rb")   996         info_reader = TermReader(tdf)   997    998         tdif = open(join(self.pathname, "index-%d" % partition), "rb")   999         index_reader = TermIndexReader(tdif)  1000   1001         tpf = open(join(self.pathname, "positions-%d" % partition), "rb")  1002         positions_reader = PositionReader(tpf)  1003   1004         return TermDictionaryReader(info_reader, index_reader, positions_reader)  1005   1006     def get_field_reader(self, partition):  1007         ff = open(join(self.pathname, "fields-%d" % partition), "rb")  1008         field_reader = FieldReader(ff)  1009   1010         fif = open(join(self.pathname, "fields_index-%d" % partition), "rb")  1011         field_index_reader = FieldIndexReader(fif)  1012   1013         return FieldDictionaryReader(field_reader, field_index_reader)  1014   1015     def find_positions(self, term):  1016         return self.dict_reader.find_positions(term)  1017   1018     def get_frequency(self, term):  1019         return self.dict_reader.get_frequency(term)  1020   1021     def get_fields(self, docnum):  1022         return self.field_dict_reader.read_fields(docnum)  1023   1024     def close(self):  1025         self.dict_reader.close()  1026         self.field_dict_reader.close()  1027   1028 class Index:  1029   1030     "An inverted index solution encapsulating the various components."  1031   1032     def __init__(self, pathname):  1033         self.pathname = pathname  1034         self.reader = None  1035         self.writer = None  1036   1037     def get_writer(self, interval=INTERVAL, flush_interval=FLUSH_INTERVAL):  1038   1039         """  1040         Return a writer, optionally using the given indexing 'interval' and  1041         'flush_interval'.  1042         """  1043   1044         if not exists(self.pathname):  1045             mkdir(self.pathname)  1046   1047         self.writer = IndexWriter(self.pathname, interval, flush_interval)  1048         return self.writer  1049   1050     def get_reader(self, partition=0):  1051   1052         "Return a reader for the index."  1053   1054         if not exists(self.pathname):  1055             raise OSError, "Index path %r does not exist." % self.pathname  1056   1057         self.reader = IndexReader(self.pathname, partition)  1058         return self.reader  1059   1060     def merge_terms(self):  1061   1062         "Merge term dictionaries."  1063   1064         readers = []  1065   1066         for filename in os.listdir(self.pathname):  1067             if filename.startswith("terms-"): # 6 character prefix  1068                 partition = int(filename[6:])  1069                 readers.append(self.get_reader(partition))  1070   1071         # NOTE: Make a distinct new writer/index.  1072   1073     def close(self):  1074         if self.reader is not None:  1075             self.reader.close()  1076             self.reader = None  1077         if self.writer is not None:  1078             self.writer.close()  1079             self.writer = None  1080   1081 # vim: tabstop=4 expandtab shiftwidth=4