iixr (file iixr.py at eafc8a8a84ff)

     1 #!/usr/bin/env python     2      3 """     4 A simple (and sane) text indexing library.     5      6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from os import listdir, mkdir    # index and partition discovery    22 from os.path import exists, join    23 from os.path import commonprefix # to find common string prefixes    24 from bisect import bisect_right  # to find terms in the dictionary index    25 from bisect import insort_right  # to maintain a sorted list of data for merging    26 import bz2, zlib                 # for field compression    27     28 # Constants.    29     30 INTERVAL = 100    31 FLUSH_INTERVAL = 1000000    32     33 compressors = [("b", bz2.compress), ("z", zlib.compress)]    34 decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}    35     36 # Foundation classes.    37     38 class File:    39     40     "A basic file abstraction."    41     42     def __init__(self, f):    43         self.f = f    44         self.reset()    45     46     def reset(self):    47     48         "To be used to reset the state of the reader or writer between records."    
49     50         pass    51     52     def rewind(self):    53         self.f.seek(0)    54         self.reset()    55     56     def close(self):    57         if self.f is not None:    58             self.f.close()    59             self.f = None    60     61 class FileWriter(File):    62     63     "Writing basic data types to files."    64     65     def write_number(self, number):    66     67         "Write 'number' to the file using a variable length encoding."    68     69         # Negative numbers are not supported.    70     71         if number < 0:    72             raise ValueError, "Number %r is negative." % number    73     74         # Special case: one byte containing zero.    75     76         elif number == 0:    77             self.f.write(chr(0))    78             return    79     80         # Write the number from least to most significant digits.    81     82         bytes = []    83     84         while number != 0:    85             lsd = number & 127    86             number = number >> 7    87             if number != 0:    88                 lsd |= 128    89             bytes.append(chr(lsd))    90     91         record = "".join(bytes)    92         self.f.write(record)    93     94     def write_string(self, s, compress=0):    95     96         """    97         Write 's' to the file, recording its length and compressing the string    98         if 'compress' is set to a true value.    99         """   100    101         # Convert Unicode objects to strings.   102    103         if isinstance(s, unicode):   104             s = s.encode("utf-8")   105    106         # Compress the string if requested.   107    108         if compress:   109             for flag, fn in compressors:   110                 cs = fn(s)   111    112                 # Take the first string shorter than the original.   
113    114                 if len(cs) < len(s):   115                     s = cs   116                     break   117             else:   118                 flag = "-"   119    120             # Record whether compression was used.   121    122             self.f.write(flag)   123    124         # Write the length of the data before the data itself.   125    126         length = len(s)   127         self.write_number(length)   128         self.f.write(s)   129    130 class FileReader(File):   131    132     "Reading basic data types from files."   133    134     def read_number(self):   135    136         "Read a number from the file."   137    138         # Read each byte, adding it to the number.   139    140         shift = 0   141         number = 0   142         more = 1   143    144         while more:   145             byte = self.f.read(1)   146             if not byte:   147                 raise EOFError   148    149             csd = ord(byte)   150             more = csd & 128 != 0   151             if more:   152                 csd &= 127   153             number += (csd << shift)   154             shift += 7   155    156         return number   157    158     def read_string(self, decompress=0):   159    160         """   161         Read a string from the file, decompressing the stored data if   162         'decompress' is set to a true value.   163         """   164    165         # Decompress the data if requested.   166    167         if decompress:   168             flag = self.f.read(1)   169         else:   170             flag = "-"   171    172         length = self.read_number()   173         s = self.f.read(length)   174    175         # Perform decompression if applicable.   176    177         if flag != "-":   178             fn = decompressors[flag]   179             s = fn(s)   180    181         # Convert strings to Unicode objects.   
182    183         return unicode(s, "utf-8")   184    185 # Specific classes for storing term and position information.   186    187 class PositionWriter(FileWriter):   188    189     "Writing position information to files."   190    191     def reset(self):   192         self.last_docnum = 0   193    194     def write_positions(self, docnum, positions):   195    196         "Write for the document 'docnum' the given 'positions'."   197    198         if docnum < self.last_docnum:   199             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)   200    201         # Write the document number delta.   202    203         self.write_number(docnum - self.last_docnum)   204    205         # Write the number of positions.   206    207         self.write_number(len(positions))   208    209         # Make sure that the positions are sorted.   210    211         positions.sort()   212    213         # Write the position deltas.   214    215         last = 0   216         for position in positions:   217             pos = position - last   218             self.write_number(pos)   219             last = position   220    221         self.last_docnum = docnum   222    223     def write_term_positions(self, doc_positions):   224    225         """   226         Write all 'doc_positions' - a collection of tuples of the form (document   227         number, position list) - to the file, returning a tuple containing the   228         offset at which they were stored together with the frequency (number of   229         positions) for the term involved.   230         """   231    232         # Reset the writer and record the current file offset.   233    234         self.reset()   235         offset = self.f.tell()   236    237         # Write the number of documents.   238    239         self.write_number(len(doc_positions))   240    241         # Write the positions.   
242    243         frequency = 0   244    245         for docnum, positions in doc_positions:   246             self.write_positions(docnum, positions)   247             frequency += len(positions)   248    249         return offset, frequency   250    251 class PositionReader(FileReader):   252    253     "Reading position information from files."   254    255     def reset(self):   256         self.last_docnum = 0   257    258     def read_positions(self):   259    260         "Read positions, returning a document number and a list of positions."   261    262         # Read the document number delta and add it to the last number.   263    264         self.last_docnum += self.read_number()   265    266         # Read the number of positions.   267    268         npositions = self.read_number()   269    270         # Read the position deltas, adding each previous position to get the   271         # appropriate collection of absolute positions.   272    273         i = 0   274         last = 0   275         positions = []   276    277         while i < npositions:   278             last += self.read_number()   279             positions.append(last)   280             i += 1   281    282         return self.last_docnum, positions   283    284     def read_term_positions(self, offset):   285    286         """   287         Read all positions from 'offset', seeking to that position in the file   288         before reading.   289         """   290    291         self.reset()   292         self.f.seek(offset)   293    294         # Read the number of documents.   295    296         ndocuments = self.read_number()   297    298         # Read all records.   
299    300         i = 0   301         doc_positions = []   302    303         while i < ndocuments:   304             doc_positions.append(self.read_positions())   305             i += 1   306    307         return doc_positions   308    309 class TermWriter(FileWriter):   310    311     "Writing term information to files."   312    313     def reset(self):   314         self.last_term = ""   315         self.last_offset = 0   316    317     def write_term(self, term, offset, frequency):   318    319         """   320         Write the given 'term', its position file 'offset', and its 'frequency'   321         to the term information file. Return the offset after the term   322         information was written to the file.   323         """   324    325         # Write the prefix length and term suffix.   326    327         common = len(commonprefix([self.last_term, term]))   328         suffix = term[common:]   329    330         self.write_number(common)   331         self.write_string(suffix)   332    333         # Write the offset delta.   334    335         self.write_number(offset - self.last_offset)   336    337         # Write the frequency.   338    339         self.write_number(frequency)   340    341         self.last_term = term   342         self.last_offset = offset   343    344         return self.f.tell()   345    346 class TermReader(FileReader):   347    348     "Reading term information from files."   349    350     def reset(self):   351         self.last_term = ""   352         self.last_offset = 0   353    354     def read_term(self):   355    356         """   357         Read a term, its position file offset, and its frequency from the term   358         information file.   359         """   360    361         # Read the prefix length and term suffix.   
362    363         common = self.read_number()   364         suffix = self.read_string()   365    366         self.last_term = self.last_term[:common] + suffix   367    368         # Read the offset delta.   369    370         self.last_offset += self.read_number()   371    372         # Read the frequency.   373    374         frequency = self.read_number()   375    376         return self.last_term, self.last_offset, frequency   377    378     def go_to_term(self, term, offset, info_offset):   379    380         """   381         Seek past the entry for 'term' having 'offset' to 'info_offset'. This   382         permits the scanning for later terms from the specified term.   383         """   384    385         self.f.seek(info_offset)   386         self.last_term = term   387         self.last_offset = offset   388    389 class TermIndexWriter(TermWriter):   390    391     "Writing term dictionary index details to files."   392    393     def reset(self):   394         TermWriter.reset(self)   395         self.last_info_offset = 0   396    397     def write_term(self, term, offset, frequency, info_offset):   398    399         """   400         Write the given 'term', its position file 'offset', and its 'frequency'   401         to the term dictionary index file, along with the 'info_offset' in the   402         term information file.   403         """   404    405         TermWriter.write_term(self, term, offset, frequency)   406    407         # Write the information file offset delta.   408    409         self.write_number(info_offset - self.last_info_offset)   410         self.last_info_offset = info_offset   411    412 class TermIndexReader(TermReader):   413    414     "Reading term dictionary index details from files."   
415    416     def reset(self):   417         TermReader.reset(self)   418         self.last_info_offset = 0   419    420     def read_term(self):   421    422         """   423         Read a term, its position file offset, its frequency, and its term   424         information file offset from the term dictionary index file.   425         """   426    427         term, offset, frequency = TermReader.read_term(self)   428    429         # Read the offset delta.   430    431         self.last_info_offset += self.read_number()   432    433         return term, offset, frequency, self.last_info_offset   434    435 class TermDictionaryWriter:   436    437     "Writing term dictionaries."   438    439     def __init__(self, info_writer, index_writer, position_writer, interval):   440         self.info_writer = info_writer   441         self.index_writer = index_writer   442         self.position_writer = position_writer   443         self.interval = interval   444         self.entry = 0   445    446     def _write_term(self, term, offset, frequency):   447    448         """   449         Write the given 'term', its position file 'offset', and its 'frequency'   450         to the term information file and optionally to the index, making a   451         dictionary entry.   452         """   453    454         info_offset = self.info_writer.write_term(term, offset, frequency)   455    456         if self.entry % self.interval == 0:   457             self.index_writer.write_term(term, offset, frequency, info_offset)   458    459         self.entry += 1   460    461     def write_term_positions(self, term, doc_positions):   462    463         """   464         Write the given 'term' and the 'doc_positions' recording the documents   465         and positions at which the term is found.   
466         """   467    468         offset, frequency = self.position_writer.write_term_positions(doc_positions)   469         self._write_term(term, offset, frequency)   470    471     def close(self):   472         self.info_writer.close()   473         self.index_writer.close()   474         self.position_writer.close()   475    476 class TermDictionaryReader:   477    478     "Reading term dictionaries."   479    480     def __init__(self, info_reader, index_reader, position_reader):   481         self.info_reader = info_reader   482         self.index_reader = index_reader   483         self.position_reader = position_reader   484    485         self.terms = []   486         try:   487             while 1:   488                 self.terms.append(self.index_reader.read_term())   489         except EOFError:   490             pass   491    492         # Large numbers for ordering purposes.   493    494         self.max_offset = self.terms[-1][1]   495         self.max_info_offset = self.terms[-1][2]   496    497     def _find_term(self, term):   498    499         """   500         Find the position file offset and frequency of 'term' from the term   501         dictionary.   502         """   503    504         i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1   505    506         # Get the entry position providing the term or one preceding it.   507    508         if i == -1:   509             return None   510    511         found_term, offset, frequency, info_offset = self.terms[i]   512    513         # Where the term is found immediately, return the offset.   514    515         if term == found_term:   516             return offset, frequency   517    518         # Otherwise, seek past the index term's entry in the information file   519         # and scan for the desired term.   
520    521         else:   522             self.info_reader.go_to_term(found_term, offset, info_offset)   523             try:   524                 while term > found_term:   525                     found_term, offset, frequency = self.info_reader.read_term()   526             except EOFError:   527                 pass   528    529             # If the term is found, return the offset and frequency.   530    531             if term == found_term:   532                 return offset, frequency   533             else:   534                 return None   535    536     def rewind(self):   537         self.info_reader.rewind()   538    539     def _get_positions(self, offset):   540         return self.position_reader.read_term_positions(offset)   541    542     def read_term(self):   543    544         """   545         Return the next term, its frequency and the documents and positions at   546         which the term is found.   547         """   548    549         term, offset, frequency = self.info_reader.read_term()   550         positions = self._get_positions(offset)   551         return term, frequency, positions   552    553     def find_positions(self, term):   554    555         "Return the documents and positions at which the given 'term' is found."   556    557         t = self._find_term(term)   558         if t is None:   559             return None   560         else:   561             offset, frequency = t   562             return self._get_positions(offset)   563    564     def get_frequency(self, term):   565    566         "Return the frequency of the given 'term'."   
567    568         t = self._find_term(term)   569         if t is None:   570             return None   571         else:   572             offset, frequency = t   573             return frequency   574    575     def close(self):   576         self.info_reader.close()   577         self.index_reader.close()   578         self.position_reader.close()   579    580 # Specific classes for storing document information.   581    582 class FieldWriter(FileWriter):   583    584     "Writing field data to files."   585    586     def reset(self):   587         self.last_docnum = 0   588    589     def write_fields(self, docnum, fields):   590    591         """   592         Write for the given 'docnum', a list of 'fields' (integer, string pairs   593         representing field identifiers and values respectively).   594         Return the offset at which the fields are stored.   595         """   596    597         offset = self.f.tell()   598    599         # Write the document number delta.   600    601         self.write_number(docnum - self.last_docnum)   602    603         # Write the number of fields.   604    605         self.write_number(len(fields))   606    607         # Write the fields themselves.   608    609         for i, field in fields:   610             self.write_number(i)   611             self.write_string(field, 1) # compress   612    613         self.last_docnum = docnum   614         return offset   615    616 class FieldReader(FileReader):   617    618     "Reading field data from files."   619    620     def reset(self):   621         self.last_docnum = 0   622    623     def read_fields(self):   624    625         """   626         Read fields from the file, returning a tuple containing the document   627         number and a list of field (identifier, value) pairs.   628         """   629    630         # Read the document number.   631    632         self.last_docnum += self.read_number()   633    634         # Read the number of fields.   
635    636         nfields = self.read_number()   637    638         # Collect the fields.   639    640         fields = []   641         i = 0   642    643         while i < nfields:   644             identifier = self.read_number()   645             value = self.read_string(1) # decompress   646             fields.append((identifier, value))   647             i += 1   648    649         return self.last_docnum, fields   650    651     def read_document_fields(self, docnum, offset):   652    653         """   654         Read fields for 'docnum' at the given 'offset'. This permits the   655         retrieval of details for the specified document, as well as scanning for   656         later documents.   657         """   658    659         self.f.seek(offset)   660         bad_docnum, fields = self.read_fields()   661         self.last_docnum = docnum   662         return docnum, fields   663    664 class FieldIndexWriter(FileWriter):   665    666     "Writing field index details to files."   667    668     def reset(self):   669         self.last_docnum = 0   670         self.last_offset = 0   671    672     def write_document(self, docnum, offset):   673    674         """   675         Write for the given 'docnum', the 'offset' at which the fields for the   676         document are stored in the fields file.   677         """   678    679         # Write the document number and offset deltas.   680    681         self.write_number(docnum - self.last_docnum)   682         self.write_number(offset - self.last_offset)   683    684         self.last_docnum = docnum   685         self.last_offset = offset   686    687 class FieldIndexReader(FileReader):   688    689     "Reading field index details from files."   690    691     def reset(self):   692         self.last_docnum = 0   693         self.last_offset = 0   694    695     def read_document(self):   696    697         "Read a document number and field file offset."   
698    699         # Read the document number delta and offset.   700    701         self.last_docnum += self.read_number()   702         self.last_offset += self.read_number()   703    704         return self.last_docnum, self.last_offset   705    706 class FieldDictionaryWriter:   707    708     "Writing field dictionary details."   709    710     def __init__(self, field_writer, field_index_writer, interval):   711         self.field_writer = field_writer   712         self.field_index_writer = field_index_writer   713         self.interval = interval   714         self.entry = 0   715    716     def write_fields(self, docnum, fields):   717    718         "Write details of the document with the given 'docnum' and 'fields'."   719    720         offset = self.field_writer.write_fields(docnum, fields)   721    722         if self.entry % self.interval == 0:   723             self.field_index_writer.write_document(docnum, offset)   724    725         self.entry += 1   726    727     def close(self):   728         self.field_writer.close()   729         self.field_index_writer.close()   730    731 class FieldDictionaryReader:   732    733     "Reading field dictionary details."   734    735     def __init__(self, field_reader, field_index_reader):   736         self.field_reader = field_reader   737         self.field_index_reader = field_index_reader   738    739         self.docs = []   740         try:   741             while 1:   742                 self.docs.append(self.field_index_reader.read_document())   743         except EOFError:   744             pass   745    746         # Large numbers for ordering purposes.   747    748         self.max_offset = self.docs[-1][1]   749    750     def rewind(self):   751         self.field_reader.rewind()   752    753     def read_fields(self):   754    755         "Return the next document number and fields."   
756    757         return self.field_reader.read_fields()   758    759     def get_fields(self, docnum):   760    761         "Read the fields of the document with the given 'docnum'."   762    763         i = bisect_right(self.docs, (docnum, self.max_offset)) - 1   764    765         # Get the entry position providing the term or one preceding it.   766    767         if i == -1:   768             return None   769    770         found_docnum, offset = self.docs[i]   771    772         # Read from the fields file.   773    774         found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)   775    776         # Scan for the document, if necessary.   777    778         try:   779             while docnum > found_docnum:   780                 found_docnum, fields = self.field_reader.read_fields()   781         except EOFError:   782             pass   783    784         # If the document is found, return the fields.   785    786         if docnum == found_docnum:   787             return fields   788         else:   789             return None   790    791     def close(self):   792         self.field_reader.close()   793         self.field_index_reader.close()   794    795 # Dictionary merging classes.   796    797 class Merger:   798    799     "Merge files."   800    801     def __init__(self, writer, readers):   802         self.writer = writer   803         self.readers = readers   804    805     def close(self):   806         for reader in self.readers:   807             reader.close()   808         self.writer.close()   809    810 class TermDictionaryMerger(Merger):   811    812     "Merge term and position files."   813    814     def merge(self):   815    816         """   817         Merge terms and positions from the readers, sending them to the writer.   818         """   819    820         entries = []   821    822         # Get the first entries from the readers.   
823    824         for partition, reader in enumerate(self.readers):   825             reader.rewind()   826    827             try:   828                 term, frequency, positions = reader.read_term()   829                 insort_right(entries, (term, positions, partition))   830             except EOFError:   831                 pass   832    833         # While entries are available, write them out in order, merging where   834         # appropriate.   835    836         while entries:   837             term, doc_positions, partition = entries[0]   838             to_update = [partition]   839    840             nentries = len(entries)   841             i = 1   842    843             # Find other entries for the term.   844    845             while i < nentries:   846                 other_term, other_doc_positions, other_partition = entries[i]   847    848                 # For such entries, merge the positions.   849    850                 if other_term == term:   851                     self.merge_positions(doc_positions, other_doc_positions)   852                     to_update.append(other_partition)   853                     i += 1   854                 else:   855                     break   856    857             # Write the combined term details.   858    859             self.writer.write_term_positions(term, doc_positions)   860    861             # Update the entries from the affected readers.   
862    863             del entries[:i]   864    865             for partition in to_update:   866                 try:   867                     term, frequency, positions = self_readers[partition].read_term()   868                     insort_right(entries, (term, positions, partition))   869                 except EOFError:   870                     pass   871    872     def merge_positions(self, doc_positions, other_doc_positions):   873    874         """   875         Merge 'doc_positions' with 'other_doc_positions' so that common document   876         records contain positions from both collections.   877         """   878    879         doc_position_dict = dict(doc_positions)   880    881         for docnum, positions in other_doc_positions:   882             if doc_position_dict.has_key(docnum):   883                 doc_position_dict[docnum] += positions   884                 doc_position_dict[docnum].sort()   885             else:   886                 doc_position_dict[docnum] = positions   887    888         doc_positions = doc_position_dict.items()   889         return doc_positions   890    891 class FieldDictionaryMerger(Merger):   892    893     "Merge field files."   894    895     def merge(self):   896    897         """   898         Merge fields from the readers, sending them to the writer.   899         """   900    901         entries = []   902    903         # Get the first entries from the readers.   904    905         for partition, reader in enumerate(self.readers):   906             reader.rewind()   907    908             try:   909                 docnum, fields = reader.read_fields()   910                 insort_right(entries, (docnum, fields, partition))   911             except EOFError:   912                 pass   913    914         # While entries are available, write them out in order, merging where   915         # appropriate.   
916    917         while entries:   918             docnum, fields, partition = entries[0]   919             to_update = [partition]   920    921             nentries = len(entries)   922             i = 1   923    924             # Find other entries for the term.   925    926             while i < nentries:   927                 other_docnum, other_fields, other_partition = entries[i]   928    929                 # For such entries, merge the positions.   930    931                 if other_term == term:   932                     fields += other_fields   933                     to_update.append(other_partition)   934                     i += 1   935                 else:   936                     break   937    938             # Write the combined term details.   939    940             self.writer.write_fields(docnum, fields)   941    942             # Update the entries from the affected readers.   943    944             del entries[:i]   945    946             for partition in to_update:   947                 try:   948                     docnum, fields = self_readers[partition].read_fields()   949                     insort_right(entries, (docnum, fields, partition))   950                 except EOFError:   951                     pass   952    953 # Utility functions.   954    955 def get_term_writer(pathname, partition, interval):   956    957     """   958     Return a term dictionary writer using files under the given 'pathname'   959     labelled according to the given 'partition', using the given indexing   960     'interval'.   
961     """   962    963     tdf = open(join(pathname, "terms-%s" % partition), "wb")   964     info_writer = TermWriter(tdf)   965    966     tdif = open(join(pathname, "index-%s" % partition), "wb")   967     index_writer = TermIndexWriter(tdif)   968    969     tpf = open(join(pathname, "positions-%s" % partition), "wb")   970     positions_writer = PositionWriter(tpf)   971    972     return TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)   973    974 def get_field_writer(pathname, partition, interval):   975    976     """   977     Return a field dictionary writer using files under the given 'pathname'   978     labelled according to the given 'partition', using the given indexing   979     'interval'.   980     """   981    982     ff = open(join(pathname, "fields-%s" % partition), "wb")   983     field_writer = FieldWriter(ff)   984    985     fif = open(join(pathname, "fields_index-%s" % partition), "wb")   986     field_index_writer = FieldIndexWriter(fif)   987    988     return FieldDictionaryWriter(field_writer, field_index_writer, interval)   989    990 # High-level classes.   991    992 class IndexWriter:   993    994     """   995     Building term information and writing it to the term and field dictionaries.   996     """   997    998     def __init__(self, pathname, interval, flush_interval):   999         self.pathname = pathname  1000         self.interval = interval  1001         self.flush_interval = flush_interval  1002   1003         self.dict_partition = 0  1004         self.field_dict_partition = 0  1005   1006         self.terms = {}  1007         self.docs = {}  1008   1009         self.position_counter = 0  1010         self.field_counter = 0  1011   1012     def add_position(self, term, docnum, position):  1013   1014         """  1015         Add a position entry for the given 'term' in the document with the given  1016         'docnum', indicating the given 'position'.  
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.

    Positions and fields are collected in memory. Whenever the number of
    positions (or fields) added since the last flush reaches 'flush_interval',
    the collected data is written out as a new numbered partition and the
    in-memory store is emptied.
    """

    def __init__(self, pathname, interval, flush_interval):

        """
        Initialise a writer producing files under 'pathname', using the given
        indexing 'interval'. A false 'flush_interval' (such as zero) disables
        automatic flushing, leaving all writing to 'close'.
        """

        self.pathname = pathname
        self.interval = interval
        self.flush_interval = flush_interval

        # Partition numbers used to label the next set of files written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Pending data: term -> docnum -> positions, and docnum -> fields.

        self.terms = {}
        self.docs = {}

        # Numbers of positions and fields added since the last flush.

        self.position_counter = 0
        self.field_counter = 0

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.
        """

        self.terms.setdefault(term, {}).setdefault(docnum, []).append(position)

        self.position_counter += 1
        if self.flush_interval and self.position_counter >= self.flush_interval:
            self.flush_terms()

    def add_field(self, docnum, identifier, value):

        """
        Add for the document with the given 'docnum' a field having the given
        'identifier' and 'value'.
        """

        self.docs.setdefault(docnum, []).append((identifier, value))

        self.field_counter += 1
        if self.flush_interval and self.field_counter >= self.flush_interval:
            self.flush_fields()

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        """
        Flush terms into the current term dictionary partition, then start a
        new, empty partition.
        """

        dict_writer = self.get_term_writer()

        # Write the terms in order, each with its documents in order. The
        # writer is closed even if writing fails, so that files are not leaked.

        try:
            for term, doc_positions in sorted(self.terms.items()):
                dict_writer.write_term_positions(term, sorted(doc_positions.items()))
        finally:
            dict_writer.close()

        self.terms = {}

        # Restart the count: otherwise every later addition would still be at
        # or above the threshold and would flush a partition of its own.

        self.position_counter = 0
        self.dict_partition += 1

    def flush_fields(self):

        """
        Flush fields into the current field dictionary partition, then start a
        new, empty partition.
        """

        field_dict_writer = self.get_field_writer()

        # Write the documents in order, closing the writer even on failure.

        try:
            for docnum, fields in sorted(self.docs.items()):
                field_dict_writer.write_fields(docnum, fields)
        finally:
            field_dict_writer.close()

        self.docs = {}

        # Restart the count for the same reason as in flush_terms.

        self.field_counter = 0
        self.field_dict_partition += 1

    def close(self):

        "Write out any terms and fields that have not yet been flushed."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname, partition=0):

        """
        Open the term and field dictionaries stored under 'pathname' for the
        given 'partition'.
        """

        self.pathname = pathname
        self.dict_reader = self.get_term_reader(partition)
        self.field_dict_reader = self.get_field_reader(partition)

    def get_term_reader(self, partition):

        """
        Return a term dictionary reader over the "terms", "index" and
        "positions" files of the given 'partition'.
        """

        return TermDictionaryReader(
            TermReader(open(join(self.pathname, "terms-%s" % partition), "rb")),
            TermIndexReader(open(join(self.pathname, "index-%s" % partition), "rb")),
            PositionReader(open(join(self.pathname, "positions-%s" % partition), "rb"))
            )

    def get_field_reader(self, partition):

        """
        Return a field dictionary reader over the "fields" and "fields_index"
        files of the given 'partition'.
        """

        return FieldDictionaryReader(
            FieldReader(open(join(self.pathname, "fields-%s" % partition), "rb")),
            FieldIndexReader(open(join(self.pathname, "fields_index-%s" % partition), "rb"))
            )

    def find_positions(self, term):

        "Return the term dictionary reader's positions result for 'term'."

        positions = self.dict_reader.find_positions(term)
        return positions

    def get_frequency(self, term):

        "Return the term dictionary reader's frequency result for 'term'."

        frequency = self.dict_reader.get_frequency(term)
        return frequency

    def get_fields(self, docnum):

        "Return the field dictionary reader's fields result for 'docnum'."

        fields = self.field_dict_reader.get_fields(docnum)
        return fields

    def close(self):

        "Close the term dictionary reader, then the field dictionary reader."

        for reader in (self.dict_reader, self.field_dict_reader):
            reader.close()
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index for the directory with the given 'pathname'."

        self.pathname = pathname

        # The most recently obtained reader and writer, closed by 'close'.

        self.reader = None
        self.writer = None

    def get_writer(self, interval=INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval' and
        'flush_interval'. The index directory is created if it does not exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the given 'partition' of the index, raising
        OSError if the index directory does not exist.
        """

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname, partition)
        return self.reader

    def merge_terms(self, interval=INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval', writing
        the result to the partition labelled "new".
        """

        prefix = "terms-"
        readers = []

        for filename in listdir(self.pathname):
            if not filename.startswith(prefix):
                continue

            # Only numbered partitions are merged: a "terms-new" file left by
            # an earlier merge has no partition number and is skipped.

            label = filename[len(prefix):]
            if not label.isdigit():
                continue

            # NOTE(review): each get_reader call also opens the field files
            # for the same partition number and replaces self.reader; confirm
            # that the merger accepts IndexReader objects.

            readers.append(self.get_reader(int(label)))

        writer = get_term_writer(self.pathname, "new", interval)

        merger = TermDictionaryMerger(writer, readers)

        # NOTE(review): presumably closing the merger closes the writer and
        # the readers - confirm against TermDictionaryMerger.

        try:
            merger.merge()
        finally:
            merger.close()

    def close(self):

        "Close the current reader and writer, if any, and forget them."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None