# HG changeset patch # User Paul Boddie # Date 1317490438 -7200 # Node ID ce60e75cb65b88934e93582ac35fc3e989762625 # Parent 7812d65bc2d9de4d2f9a780182e6ca9ad3e991d8 Changed readers to act like iterators instead of providing get_records methods, to accept record iterators in their initialisation instead of accessors, and to present records to record iterators instead of instantiating new record iterators. Moved accessors and iterators into their own modules. Introduced the concept of stateful iteration, providing examples of state management. diff -r 7812d65bc2d9 -r ce60e75cb65b simplex/__init__.py --- a/simplex/__init__.py Sat Oct 01 17:59:20 2011 +0200 +++ b/simplex/__init__.py Sat Oct 01 19:33:58 2011 +0200 @@ -29,6 +29,10 @@ """ from simplex.readers import * +from simplex.iterators import * +from simplex.accessors import * +from simplex.state import * + import bisect def make_index(reader, interval): @@ -45,7 +49,7 @@ current_key = None start_pos = 0 - for i, (key, record) in enumerate(reader.get_records()): + for i, (key, record) in enumerate(reader): # Where duplicate keys are permitted, the first record employing the key # must be available as an index entry. Otherwise, records preceding the @@ -97,7 +101,7 @@ found. """ - for key, record in reader.get_records(): + for key, record in reader: if term == key: return record diff -r 7812d65bc2d9 -r ce60e75cb65b simplex/accessors.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/simplex/accessors.py Sat Oct 01 19:33:58 2011 +0200 @@ -0,0 +1,63 @@ +#!/usr/bin/env python + +""" +Accessor classes for indexing. + +Copyright (C) 2011 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program. If not, see <http://www.gnu.org/licenses/>. +""" + +class DelimitedRecord: + + "An accessor using a delimiter to split a record." + + def __init__(self, keys=None, delimiter=None, numeric=0): + + """ + Initialise the accessor using a sequence of 'keys' indicating the + columns in each record that provide the values in the eventual compound + key provided by each record, along with a 'delimiter' indicating how + such columns are identified. If 'numeric' is set to a true value, keys + will be interpreted as numbers. + """ + + self.keys = keys or [0] + self.delimiter = delimiter + self.numeric = numeric + + # Define a conversion method. + + self.convert = numeric and self.convert_numeric or (lambda x: x) + + def convert_numeric(self, term): + return map(int, term) + + def get_key(self, record): + values = record.split(self.delimiter) + return self.convert([values[key] for key in self.keys]) + + def get_sort_command(self): + + """ + Return the Unix sort command invocation required to produce the ordering + described by this instance. + """ + + return "sort%s%s%s" % ( + self.delimiter and (" -t $'%s'" % repr(self.delimiter)[1:-1]) or "", + self.numeric and " -n" or "", + "".join([(" -k %d,%d" % (key + 1, key + 1)) for key in self.keys]) + ) + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r 7812d65bc2d9 -r ce60e75cb65b simplex/iterators.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/simplex/iterators.py Sat Oct 01 19:33:58 2011 +0200 @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +""" +Iterator classes for indexing. 
+ +Copyright (C) 2011 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program. If not, see <http://www.gnu.org/licenses/>. +""" + +class Iterator: + + "An iterator over records employing record accessors." + + def __init__(self, accessor): + self.accessor = accessor + self.records = None + self.iterator = None + + def set_records(self, records): + self.records = records + + def __iter__(self): + self.iterator = iter(self.records) + return self + + def next(self): + if self.iterator is None: + iter(self) + record = self.iterator.next() + return self.accessor.get_key(record), record + +class StatefulIterator(Iterator): + + "An iterator over records maintaining state." + + def __init__(self, accessor, state): + Iterator.__init__(self, accessor) + self.state = state + + def __iter__(self): + Iterator.__iter__(self) + self.state.reset() + return self + + def next(self): + key, record = Iterator.next(self) + self.key = self.state.update(key) + return self.key, record + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r 7812d65bc2d9 -r ce60e75cb65b simplex/readers.py --- a/simplex/readers.py Sat Oct 01 17:59:20 2011 +0200 +++ b/simplex/readers.py Sat Oct 01 19:33:58 2011 +0200 @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Reader and accessor classes for indexing. +Reader classes for indexing. Copyright (C) 2011 Paul Boddie @@ -22,9 +22,9 @@ "A generic file wrapper." 
- def __init__(self, f, accessor): + def __init__(self, f, iterator): self.f = f - self.accessor = accessor + self.iterator = iterator def seek(self, pos): self.f.seek(pos) @@ -33,62 +33,8 @@ "A wrapper around text files." - def get_records(self): - return Iterator(self.f.xreadlines(), self.accessor) - -class Iterator: - - "An iterator over records employing record accessors." - - def __init__(self, records, accessor): - self.records = records - self.accessor = accessor - self.iterator = None - def __iter__(self): - self.iterator = iter(self.records) - return self - - def next(self): - if self.iterator is None: - iter(self) - record = self.iterator.next() - return self.accessor.get_key(record), record - -class DelimitedRecord: - - "An accessor using a delimiter to split a record." - - def __init__(self, keys=None, delimiter=None, numeric=0): - - """ - Initialise the accessor using a sequence of 'keys' indicating the - columns in each record that provide the values in the eventual compound - key provided by each record, along with a 'delimiter' indicating how - such columns are identified. If 'numeric' is set to a true value, keys - will be interpreted as numbers. - """ - - self.keys = keys or [0] - self.delimiter = delimiter - self.numeric = numeric - - # Define a conversion method. 
- - self.convert = numeric and self.convert_numeric or (lambda x: x) - - def convert_numeric(self, term): - return map(int, term) - - def get_key(self, record): - values = record.split(self.delimiter) - return self.convert([values[key] for key in self.keys]) - - def get_sort_command(self): - return "sort%s%s%s" % ( - self.delimiter and (" -t $'%s'" % repr(self.delimiter)[1:-1]) or "", - self.numeric and " -n" or "", - "".join([(" -k %d,%d" % (key + 1, key + 1)) for key in self.keys]) - ) + self.iterator.set_records(self.f.xreadlines()) + return self.iterator # vim: tabstop=4 expandtab shiftwidth=4 diff -r 7812d65bc2d9 -r ce60e75cb65b simplex/state.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/simplex/state.py Sat Oct 01 19:33:58 2011 +0200 @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +""" +State management classes for iterators. + +Copyright (C) 2011 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program. If not, see <http://www.gnu.org/licenses/>. +""" + +from os.path import commonprefix + +class CommonPrefixState: + + "A class whose instances maintain common prefix state." + + def __init__(self, initial=""): + self.initial = initial + self.reset() + + def reset(self): + self.value = self.initial + +class CommonPrefixDecoder(CommonPrefixState): + + "A class whose instances decode common prefix information." 
+ + def update(self, common_plus_suffix): + common, suffix = common_plus_suffix + self.value = self.value[:common] + suffix + return self.value + +class CommonPrefixEncoder(CommonPrefixState): + + "A class whose instances encode common prefix information." + + def update(self, value): + common = len(commonprefix((self.value, value))) + suffix = value[common:] + self.value = value + return common, suffix + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r 7812d65bc2d9 -r ce60e75cb65b test_indexed.py --- a/test_indexed.py Sat Oct 01 17:59:20 2011 +0200 +++ b/test_indexed.py Sat Oct 01 19:33:58 2011 +0200 @@ -14,7 +14,7 @@ f = open(filename) accessor = DelimitedRecord(keys, numeric=(numeric == "true")) -reader = TextFile(f, accessor) +reader = TextFile(f, Iterator(accessor)) try: t = time.time() l = make_index(reader, int(interval)) diff -r 7812d65bc2d9 -r ce60e75cb65b test_scan.py --- a/test_scan.py Sat Oct 01 17:59:20 2011 +0200 +++ b/test_scan.py Sat Oct 01 19:33:58 2011 +0200 @@ -14,7 +14,7 @@ f = open(filename) accessor = DelimitedRecord(keys, numeric=(numeric == "true")) -reader = TextFile(f, accessor) +reader = TextFile(f, Iterator(accessor)) try: for term in terms: reader.seek(0)