1.1 --- a/iixr/index.py Fri Jan 21 00:22:03 2011 +0100
1.2 +++ b/iixr/index.py Tue Jan 25 00:36:31 2011 +0100
1.3 @@ -3,7 +3,7 @@
1.4 """
1.5 High-level classes.
1.6
1.7 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
1.8 +Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
1.9
1.10 This program is free software; you can redistribute it and/or modify it under
1.11 the terms of the GNU General Public License as published by the Free Software
1.12 @@ -30,6 +30,7 @@
1.13 DOCUMENT_INTERVAL = 100
1.14 FIELD_INTERVAL = 100
1.15 FLUSH_INTERVAL = 10000
1.16 +POSITIONS_FLUSH_INTERVAL = 1000000
1.17 OPEN_PARTITIONS = 20
1.18
1.19 # High-level classes.
1.20 @@ -86,12 +87,13 @@
1.21 Building term information and writing it to the term and field dictionaries.
1.22 """
1.23
1.24 - def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval):
1.25 + def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval, positions_flush_interval):
1.26 self.pathname = pathname
1.27 self.interval = interval
1.28 self.doc_interval = doc_interval
1.29 self.field_interval = field_interval
1.30 self.flush_interval = flush_interval
1.31 + self.positions_flush_interval = positions_flush_interval
1.32
1.33 self.dict_partition = get_next_partition(get_term_partitions(self.pathname))
1.34 self.field_dict_partition = get_next_partition(get_field_partitions(self.pathname))
1.35 @@ -100,6 +102,7 @@
1.36 self.docs = []
1.37
1.38 self.doc_counter = 0
1.39 + self.position_counter = 0
1.40
1.41 def add_document(self, doc):
1.42
1.43 @@ -108,16 +111,23 @@
1.44 terms and fields if appropriate.
1.45 """
1.46
1.47 + docnum = doc.docnum
1.48 +
1.49 for term, positions in doc.terms.items():
1.50 - self.terms.setdefault(term, {})[doc.docnum] = positions
1.51 + self.terms.setdefault(term, {})[docnum] = positions
1.52 + self.position_counter += len(positions)
1.53
1.54 - self.docs.append((doc.docnum, doc.fields))
1.55 + self.docs.append((docnum, doc.fields))
1.56
1.57 self.doc_counter += 1
1.58 - if self.flush_interval and self.doc_counter >= self.flush_interval:
1.59 +
1.60 + if self.flush_interval and self.doc_counter >= self.flush_interval or \
1.61 + self.positions_flush_interval and self.position_counter >= self.positions_flush_interval:
1.62 +
1.63 self.flush_terms()
1.64 self.flush_fields()
1.65 self.doc_counter = 0
1.66 + self.position_counter = 0
1.67
1.68 def get_term_writer(self):
1.69
1.70 @@ -227,13 +237,14 @@
1.71 "An inverted index solution encapsulating the various components."
1.72
1.73 def __init__(self, pathname, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, field_interval=FIELD_INTERVAL,
1.74 - flush_interval=FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS):
1.75 + flush_interval=FLUSH_INTERVAL, positions_flush_interval=POSITIONS_FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS):
1.76
1.77 self.pathname = pathname
1.78 self.interval = interval
1.79 self.doc_interval = doc_interval
1.80 self.field_interval = field_interval
1.81 self.flush_interval = flush_interval
1.82 + self.positions_flush_interval = positions_flush_interval
1.83 self.open_partitions = open_partitions
1.84 self.reader = None
1.85 self.writer = None
1.86 @@ -244,7 +255,7 @@
1.87
1.88 self._ensure_directory()
1.89 self.writer = IndexWriter(self.pathname, self.interval, self.doc_interval,
1.90 - self.field_interval, self.flush_interval)
1.91 + self.field_interval, self.flush_interval, self.positions_flush_interval)
1.92 return self.writer
1.93
1.94 def _ensure_directory(self):