iixr (annotate test.py in 9867931a9269)

iixr

Annotated test.py

82:9867931a9269

2010-12-17

Paul Boddie

Avoid identical adjacent tokens being matched to the same document token.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@44	3	from iixr.files import *
paul@44	4	from iixr.fields import *
paul@44	5	from iixr.terms import *
paul@44	6	from iixr.positions import *
paul@44	7	from iixr.index import *
paul@74	8	from array import array
paul@59	9	import os, sys
paul@18	10
paul@18	11	# Remove old test files.
paul@18	12
paul@74	13	for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"):
paul@18	14	try:
paul@18	15	os.remove(filename)
paul@18	16	except OSError:
paul@18	17	pass
paul@18	18
paul@18	19	try:
paul@77	20	for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"):
paul@59	21	for filename in os.listdir(dirname):
paul@59	22	os.remove(os.path.join(dirname, filename))
paul@59	23	os.rmdir(dirname)
paul@18	24	except OSError:
paul@18	25	pass
paul@0	26
paul@59	27	if "clean" in sys.argv:
paul@59	28	sys.exit(0)
paul@59	29
paul@69	30	print "- Test basic data types."
paul@9	31
paul@5	32	numbers = [12345678, 0, 1, 127, 128, 255, 256]
paul@0	33
paul@0	34	f = open("test", "wb")
paul@44	35	w = FileWriter(f)
paul@0	36	for number in numbers:
paul@0	37	w.write_number(number)
paul@0	38	w.close()
paul@0	39
paul@3	40	f = open("test", "rb")
paul@44	41	r = FileReader(f)
paul@0	42	for number in numbers:
paul@0	43	n = r.read_number()
paul@0	44	print number == n, number, n
paul@0	45	r.close()
paul@0	46
paul@74	47	tuples = [(0, 0), (1, 3), (2, 5), (3, 9)]
paul@74	48
paul@74	49	f = open("testMS", "wb")
paul@74	50	w = FileWriter(f)
paul@74	51	b = array("B")
paul@74	52	last = w.get_initial_value(2)
paul@74	53	for t in tuples:
paul@74	54	last = w.write_sequence(b, t, last, 2)
paul@74	55	b.tofile(w.f)
paul@74	56	w.close()
paul@74	57
paul@74	58	f = open("testMS", "rb")
paul@74	59	r = FileReader(f)
paul@74	60	last = r.get_initial_value(2)
paul@74	61	for t in tuples:
paul@74	62	last = t2 = r.read_sequence(last, 2)
paul@74	63	print t == t2, t, t2
paul@74	64	r.close()
paul@74	65
paul@74	66	tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)]
paul@74	67
paul@74	68	f = open("testNMS", "wb")
paul@74	69	w = FileWriter(f)
paul@74	70	b = array("B")
paul@74	71	last = w.get_initial_value(2)
paul@74	72	for t in tuples2:
paul@74	73	last = w.write_sequence(b, t, last, 2, monotonic=0)
paul@74	74	b.tofile(w.f)
paul@74	75	w.close()
paul@74	76
paul@74	77	f = open("testNMS", "rb")
paul@74	78	r = FileReader(f)
paul@74	79	last = r.get_initial_value(2)
paul@74	80	for t in tuples2:
paul@74	81	last = t2 = r.read_sequence(last, 2, monotonic=0)
paul@74	82	print t == t2, t, t2
paul@74	83	r.close()
paul@74	84
paul@69	85	print "- Test positions."
paul@9	86
paul@0	87	all_doc_positions = [
paul@0	88	[
paul@0	89	(123, [1, 3, 5, 15, 25]),
paul@19	90	(124, [0, 100]),
paul@19	91	(125, [11, 99, 199]),
paul@19	92	(130, [77, 78, 80, 82, 89])
paul@0	93	],
paul@0	94	[
paul@0	95	(78, [9]),
paul@19	96	(196, [10, 11]),
paul@19	97	(197, [17, 21, 30])
paul@0	98	]
paul@0	99	]
paul@0	100
paul@19	101	f = open("testP", "wb")
paul@44	102	w = PositionWriter(f)
paul@0	103	for doc_positions in all_doc_positions:
paul@0	104	for docnum, positions in doc_positions:
paul@0	105	w.write_positions(docnum, positions)
paul@0	106	w.reset()
paul@0	107	w.close()
paul@0	108
paul@19	109	f = open("testP", "rb")
paul@68	110	r = PositionReader(f)
paul@0	111	for doc_positions in all_doc_positions:
paul@0	112	for docnum, positions in doc_positions:
paul@0	113	d, p = r.read_positions()
paul@0	114	print docnum == d, docnum, d
paul@0	115	print positions == p, positions, p
paul@0	116	r.reset()
paul@0	117	r.close()
paul@0	118
paul@74	119	all_doc_positions_seq = [
paul@74	120	[
paul@74	121	((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]),
paul@74	122	((124, 1), [(0, 0), (100, 350)]),
paul@74	123	((124, 2), [(11, 38), (99, 379), (199, 720)]),
paul@74	124	((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)])
paul@74	125	],
paul@74	126	[
paul@74	127	((78, 1), [(9, 19)]),
paul@74	128	((196, 0), [(10, 27), (11, 29)]),
paul@74	129	((196, 1), [(17, 46), (21, 52), (30, 60)])
paul@74	130	]
paul@74	131	]
paul@74	132
paul@74	133	f = open("testP2", "wb")
paul@74	134	w = PositionWriter(f)
paul@74	135	for doc_positions in all_doc_positions_seq:
paul@74	136	for docnum, positions in doc_positions:
paul@74	137	w.write_positions(docnum, positions)
paul@74	138	w.reset()
paul@74	139	w.close()
paul@74	140
paul@74	141	f = open("testP2", "rb")
paul@74	142	r = PositionReader(f)
paul@74	143	for doc_positions in all_doc_positions_seq:
paul@74	144	for docnum, positions in doc_positions:
paul@74	145	d, p = r.read_positions()
paul@74	146	print tuple(docnum) == tuple(d), docnum, d
paul@74	147	print tuple(positions) == tuple(p), positions, p
paul@74	148	r.reset()
paul@74	149	r.close()
paul@74	150
paul@69	151	print "- Test position index files."
paul@19	152
paul@19	153	indexed_positions = [
paul@19	154	[
paul@19	155	(1234, 0, 100),
paul@19	156	(2345, 700, 100),
paul@19	157	(3456, 1900, 50)
paul@19	158	],
paul@19	159	[
paul@19	160	(4567, 2800, 20)
paul@19	161	]
paul@19	162	]
paul@19	163
paul@19	164	offsets = []
paul@19	165	f = open("testPI", "wb")
paul@44	166	w = PositionIndexWriter(f)
paul@19	167	for term_positions in indexed_positions:
paul@19	168	offset = None
paul@19	169	doc_frequency = 0
paul@19	170	w.reset()
paul@19	171	for docnum, pos_offset, count in term_positions:
paul@19	172	if offset is None:
paul@55	173	offset = w.f.tell()
paul@55	174	w.write_positions(docnum, pos_offset, count)
paul@19	175	doc_frequency += count
paul@19	176	offsets.append((offset, doc_frequency))
paul@19	177	w.close()
paul@19	178
paul@69	179	r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb")))
paul@19	180	offsets.reverse()
paul@19	181	indexed_positions.reverse()
paul@19	182	for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
paul@68	183	r.seek(offset, doc_frequency)
paul@68	184	for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r):
paul@19	185	print docnum == dn, docnum, dn
paul@19	186	print pos_offset == po, pos_offset, po
paul@19	187	print count == c, count, c
paul@69	188	r.reader.close()
paul@19	189
paul@69	190	print "- Test position dictionaries."
paul@19	191
paul@19	192	f = open("testP", "wb")
paul@44	193	w = PositionWriter(f)
paul@19	194	f2 = open("testPI", "wb")
paul@44	195	w2 = PositionIndexWriter(f2)
paul@44	196	wd = PositionDictionaryWriter(w, w2, 2)
paul@0	197	offsets = []
paul@0	198	for doc_positions in all_doc_positions:
paul@19	199	offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
paul@19	200	offsets.append((offset, doc_frequency))
paul@20	201	wd.close()
paul@0	202
paul@68	203	r = PositionReader(open("testP", "rb"))
paul@68	204	r2 = PositionIndexReader(open("testPI", "rb"))
paul@44	205	rd = PositionDictionaryReader(r, r2)
paul@0	206	offsets.reverse()
paul@0	207	all_doc_positions.reverse()
paul@19	208	for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
paul@69	209	it = rd.read_term_positions(offset, doc_frequency)
paul@69	210	dp = list(it)
paul@0	211	print doc_positions == dp, doc_positions, dp
paul@20	212	rd.close()
paul@0	213
paul@69	214	print "- Test fields."
paul@9	215
paul@8	216	doc_fields = [
paul@9	217	(123, ["testing", "fields", "stored", "compressed"]),
paul@9	218	(456, ["fields", "for a second", "document"]),
paul@9	219	(789, ["field value"]),
paul@9	220	(1234, []),
paul@9	221	(2345, ["abc", "def"]),
paul@9	222	(3456, ["apple", "banana", "cherry"]),
paul@9	223	(4567, ["drue", "eple"])
paul@8	224	]
paul@8	225
paul@8	226	f = open("testF", "wb")
paul@44	227	w = FieldWriter(f)
paul@9	228	for docnum, fields in doc_fields:
paul@13	229	w.write_fields(docnum, list(enumerate(fields)))
paul@8	230	w.close()
paul@8	231
paul@8	232	f = open("testF", "rb")
paul@44	233	r = FieldReader(f)
paul@9	234	for docnum, fields in doc_fields:
paul@9	235	dn, df = r.read_fields()
paul@9	236	print docnum == dn, docnum, dn
paul@13	237	print list(enumerate(fields)) == df, list(enumerate(fields)), df
paul@8	238	r.close()
paul@8	239
paul@69	240	print "- Test field index files."
paul@9	241
paul@9	242	indexed_docs = [
paul@9	243	(123, 100000987),
paul@9	244	(456, 100004321),
paul@9	245	(789, 100008765)
paul@9	246	]
paul@9	247
paul@9	248	f = open("testFI", "wb")
paul@44	249	w = FieldIndexWriter(f)
paul@9	250	for docnum, offset in indexed_docs:
paul@9	251	w.write_document(docnum, offset)
paul@9	252	w.close()
paul@9	253
paul@9	254	f = open("testFI", "rb")
paul@44	255	r = FieldIndexReader(f)
paul@9	256	for docnum, offset in indexed_docs:
paul@9	257	dn, o = r.read_document()
paul@9	258	print docnum == dn, docnum, dn
paul@9	259	print offset == o, offset, o
paul@9	260	r.close()
paul@9	261
paul@69	262	print "- Test field dictionaries."
paul@9	263
paul@9	264	f = open("testF", "wb")
paul@44	265	w = FieldWriter(f)
paul@9	266	f2 = open("testFI", "wb")
paul@44	267	w2 = FieldIndexWriter(f2)
paul@44	268	wd = FieldDictionaryWriter(w, w2, 3)
paul@9	269	for docnum, fields in doc_fields:
paul@13	270	wd.write_fields(docnum, list(enumerate(fields)))
paul@9	271	wd.close()
paul@9	272
paul@9	273	f = open("testF", "rb")
paul@44	274	r = FieldReader(f)
paul@9	275	f2 = open("testFI", "rb")
paul@44	276	r2 = FieldIndexReader(f2)
paul@44	277	rd = FieldDictionaryReader(r, r2)
paul@9	278	doc_fields_reversed = doc_fields[:]
paul@9	279	doc_fields_reversed.reverse()
paul@9	280	for docnum, fields in doc_fields_reversed:
paul@25	281	df = dict(rd.get_fields(docnum))
paul@25	282	print dict(enumerate(fields)) == df, dict(enumerate(fields)), df
paul@9	283	for docnum in (13579, 246810):
paul@13	284	df = rd.get_fields(docnum)
paul@9	285	print df is None, df
paul@13	286
paul@69	287	print "- (Test sequential access.)"
paul@13	288
paul@13	289	rd.rewind()
paul@13	290	for docnum, fields in doc_fields:
paul@13	291	dn, df = rd.read_fields()
paul@13	292	print docnum == dn, docnum, dn
paul@13	293	print list(enumerate(fields)) == df, list(enumerate(fields)), df
paul@9	294	rd.close()
paul@9	295
paul@69	296	print "- Test terms."
paul@9	297
paul@2	298	terms = [
paul@19	299	# term, offset, frequency, doc_frequency
paul@19	300	("aardvark", 100000123, 1, 1),
paul@19	301	("anteater", 100000456, 2, 1),
paul@19	302	("badger", 100000789, 13, 7),
paul@19	303	("bull", 1000001234, 59, 17),
paul@19	304	("bulldog", 1000002345, 99, 80),
paul@19	305	("cat", 1000003456, 89, 28)
paul@2	306	]
paul@2	307
paul@2	308	f = open("test", "wb")
paul@44	309	w = TermWriter(f)
paul@19	310	for term, offset, frequency, doc_frequency in terms:
paul@19	311	w.write_term(term, offset, frequency, doc_frequency)
paul@2	312	w.close()
paul@2	313
paul@3	314	f = open("test", "rb")
paul@44	315	r = TermReader(f)
paul@19	316	for term, offset, frequency, doc_frequency in terms:
paul@19	317	t, o, fr, df = r.read_term()
paul@2	318	print term == t, term, t
paul@2	319	print offset == o, offset, o
paul@11	320	print frequency == fr, frequency, fr
paul@19	321	print doc_frequency == df, doc_frequency, df
paul@2	322	r.close()
paul@2	323
paul@69	324	print "- Test terms in index files."
paul@9	325
paul@3	326	indexed_terms = [
paul@19	327	# term, offset, frequency, doc_frequency, info_offset
paul@19	328	("aardvark", 100000123, 1, 1, 200000321),
paul@19	329	("anteater", 100000456, 2, 1, 200000654),
paul@19	330	("badger", 100000789, 13, 7, 200000987),
paul@19	331	("bull", 1000001234, 59, 17, 200004321),
paul@19	332	("bulldog", 1000002345, 99, 80, 200005432),
paul@19	333	("cat", 1000003456, 89, 28, 200006543)
paul@3	334	]
paul@3	335
paul@3	336	f = open("test", "wb")
paul@44	337	w = TermIndexWriter(f)
paul@19	338	for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
paul@19	339	w.write_term(term, offset, frequency, doc_frequency, info_offset)
paul@3	340	w.close()
paul@3	341
paul@3	342	f = open("test", "rb")
paul@44	343	r = TermIndexReader(f)
paul@19	344	for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
paul@19	345	t, o, fr, df, i = r.read_term()
paul@3	346	print term == t, term, t
paul@3	347	print offset == o, offset, o
paul@11	348	print frequency == fr, frequency, fr
paul@19	349	print doc_frequency == df, doc_frequency, df
paul@3	350	print info_offset == i, info_offset, i
paul@3	351	r.close()
paul@3	352
paul@69	353	print "- Test dictionaries with only term data."
paul@9	354
paul@3	355	f = open("test", "wb")
paul@44	356	w = TermWriter(f)
paul@3	357	f2 = open("testI", "wb")
paul@44	358	w2 = TermIndexWriter(f2)
paul@20	359	f3 = open("testP", "wb")
paul@44	360	w3 = PositionWriter(f3)
paul@20	361	f4 = open("testPI", "wb")
paul@44	362	w4 = PositionIndexWriter(f4)
paul@44	363	wp = PositionDictionaryWriter(w3, w4, 2)
paul@44	364	wd = TermDictionaryWriter(w, w2, wp, 3)
paul@19	365	for term, offset, frequency, doc_frequency in terms:
paul@19	366	wd._write_term(term, offset, frequency, doc_frequency)
paul@5	367	wd.close()
paul@3	368
paul@3	369	f = open("test", "rb")
paul@44	370	r = TermReader(f)
paul@3	371	f2 = open("testI", "rb")
paul@44	372	r2 = TermIndexReader(f2)
paul@68	373	r3 = PositionReader(open("testP", "rb"))
paul@68	374	r4 = PositionIndexReader(open("testPI", "rb"))
paul@44	375	rp = PositionDictionaryReader(r3, r4)
paul@44	376	rd = TermDictionaryReader(r, r2, rp)
paul@3	377	terms_reversed = terms[:]
paul@3	378	terms_reversed.reverse()
paul@19	379	for term, offset, frequency, doc_frequency in terms_reversed:
paul@19	380	o, fr, df = rd._find_term(term)
paul@3	381	print offset == o, offset, o
paul@11	382	print frequency == fr, frequency, fr
paul@19	383	print doc_frequency == df, doc_frequency, df
paul@3	384	for term in ("dog", "dingo"):
paul@11	385	t = rd._find_term(term)
paul@11	386	print t is None, t
paul@25	387
paul@69	388	print "- (Test term prefix searching.)"
paul@25	389
paul@25	390	print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]
paul@25	391	print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]
paul@25	392	print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]
paul@25	393	print rd.find_terms("d") == [], rd.find_terms("d"), []
paul@5	394	rd.close()
paul@5	395
paul@69	396	print "- Test dictionaries with term and position data."
paul@9	397
paul@5	398	terms_with_positions = [
paul@5	399	("aardvark", [(1, [2, 45, 96]), (20, [13])]),
paul@5	400	("anteater", [(1, [43, 44])]),
paul@5	401	("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
paul@19	402	("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
paul@5	403	("bulldog", [(43, [17, 19, 256, 512])]),
paul@5	404	("cat", [(123, [12, 145, 196]), (1200, [113])])
paul@5	405	]
paul@5	406
paul@22	407	position_dict_tests = [
paul@22	408	("badger", 19, [55, 1333]),
paul@22	409	("badger", 20, None),
paul@22	410	("bull", 6, [128]),
paul@22	411	("bull", 26, [1, 3, 5, 7, 9]),
paul@22	412	("cat", 111, None),
paul@22	413	("cat", 123, [12, 145, 196]),
paul@22	414	("cat", 1234, None)
paul@22	415	]
paul@22	416
paul@5	417	f = open("test", "wb")
paul@44	418	w = TermWriter(f)
paul@5	419	f2 = open("testI", "wb")
paul@44	420	w2 = TermIndexWriter(f2)
paul@5	421	f3 = open("testP", "wb")
paul@44	422	w3 = PositionWriter(f3)
paul@19	423	f4 = open("testPI", "wb")
paul@44	424	w4 = PositionIndexWriter(f4)
paul@44	425	wp = PositionDictionaryWriter(w3, w4, 2)
paul@44	426	wd = TermDictionaryWriter(w, w2, wp, 3)
paul@5	427	for term, doc_positions in terms_with_positions:
paul@5	428	wd.write_term_positions(term, doc_positions)
paul@5	429	wd.close()
paul@5	430
paul@5	431	f = open("test", "rb")
paul@44	432	r = TermReader(f)
paul@5	433	f2 = open("testI", "rb")
paul@44	434	r2 = TermIndexReader(f2)
paul@68	435	r3 = PositionReader(open("testP", "rb"))
paul@68	436	r4 = PositionIndexReader(open("testPI", "rb"))
paul@44	437	rp = PositionDictionaryReader(r3, r4)
paul@44	438	rd = TermDictionaryReader(r, r2, rp)
paul@5	439	terms_reversed = terms_with_positions[:]
paul@5	440	terms_reversed.reverse()
paul@5	441	for term, doc_positions in terms_reversed:
paul@18	442	dp = list(rd.find_positions(term))
paul@5	443	print doc_positions == dp, doc_positions, dp
paul@25	444	for term in ("aaa", "dog", "dingo"):
paul@5	445	dp = rd.find_positions(term)
paul@61	446	print dp == [], dp
paul@12	447
paul@69	448	print "- (Test iterators.)"
paul@22	449
paul@22	450	for term, docnum, positions in position_dict_tests:
paul@22	451	dp = rd.find_positions(term)
paul@22	452	pos = dp.from_document(docnum)
paul@22	453	print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
paul@22	454
paul@69	455	print "- (Test sequential access.)"
paul@12	456
paul@12	457	rd.rewind()
paul@12	458	for term, doc_positions in terms_with_positions:
paul@19	459	t, fr, df, dp = rd.read_term()
paul@18	460	dp = list(dp)
paul@12	461	print term == t, term, t
paul@12	462	print doc_positions == dp, doc_positions, dp
paul@5	463	rd.close()
paul@3	464
paul@69	465	print "- Test high-level index operations (including merging)."
paul@9	466
paul@6	467	docs = [
paul@6	468	(1, "The cat sat on the mat"),
paul@6	469	(2, "Every good boy deserves football"),
paul@6	470	(13, "One good turn deserves another"),
paul@6	471	(14, "Every man for himself"),
paul@6	472	(25, "Red sky at night shepherd's delight"),
paul@6	473	(36, "She sells sea shells on the sea shore")
paul@6	474	]
paul@6	475
paul@6	476	doc_tests = [
paul@11	477	("Every", 2, [(2, [0]), (14, [0])]),
paul@11	478	("good", 2, [(2, [1]), (13, [1])]),
paul@11	479	("deserves", 2, [(2, [3]), (13, [3])]),
paul@11	480	("sea", 2, [(36, [2, 6])])
paul@6	481	]
paul@6	482
paul@21	483	position_tests = [
paul@21	484	("Every", 14, [0]),
paul@21	485	("sea", 36, [2, 6]),
paul@22	486	("shells", 1, None),
paul@22	487	("shells", 37, None)
paul@21	488	]
paul@21	489
paul@60	490	phrase_tests = [
paul@62	491	(["good", "boy"], [(2, [1, 2])]),
paul@62	492	(["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
paul@62	493	(["sea", "shore"], [(36, [6, 7])])
paul@60	494	]
paul@60	495
paul@64	496	index = Index("test_index", 3, 2, 3, 6)
paul@64	497	wi = index.get_writer()
paul@6	498	for docnum, text in docs:
paul@44	499	doc = Document(docnum)
paul@6	500	for position, term in enumerate(text.split()):
paul@28	501	doc.add_position(term, position)
paul@28	502	doc.add_field(123, text)
paul@28	503	wi.add_document(doc)
paul@6	504	wi.close()
paul@6	505
paul@7	506	rd = index.get_reader()
paul@60	507
paul@69	508	print "- (Test searching.)"
paul@60	509
paul@11	510	for term, frequency, doc_positions in doc_tests:
paul@18	511	dp = list(rd.find_positions(term))
paul@6	512	print doc_positions == dp, doc_positions, dp
paul@11	513	fr = rd.get_frequency(term)
paul@11	514	print frequency == fr, frequency, fr
paul@60	515
paul@69	516	print "- (Test fields.)"
paul@60	517
paul@10	518	for docnum, text in docs:
paul@25	519	df = dict(rd.get_fields(docnum))
paul@25	520	print df[123] == text, text, df[123]
paul@60	521
paul@69	522	print "- (Test navigation.)"
paul@60	523
paul@21	524	for term, docnum, positions in position_tests:
paul@21	525	dp = rd.find_positions(term)
paul@22	526	pos = dp.from_document(docnum)
paul@22	527	print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
paul@60	528
paul@69	529	print "- (Test phrases.)"
paul@60	530
paul@60	531	for terms, results in phrase_tests:
paul@60	532	res = list(rd.find_common_positions(terms))
paul@60	533	print results == res, results, res
paul@60	534
paul@7	535	index.close()
paul@6	536
paul@77	537	docs2 = [
paul@77	538	((1, 0), "The cat sat on the mat"),
paul@77	539	((1, 2), "Every good boy deserves football"),
paul@77	540	((13, 1), "One good turn deserves another"),
paul@77	541	((14, 0), "Every man for himself"),
paul@77	542	((14, 25), "Red sky at night shepherd's delight"),
paul@77	543	((36, 12), "She sells sea shells on the sea shore")
paul@77	544	]
paul@77	545
paul@77	546	doc_tests2 = [
paul@77	547	("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),
paul@77	548	("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),
paul@77	549	("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),
paul@77	550	("sea", 2, [((36, 12), [(2, 10), (6, 28)])])
paul@77	551	]
paul@77	552
paul@77	553	position_tests2 = [
paul@77	554	("Every", (14, 0), [(0, 0)]),
paul@77	555	("sea", (36, 12), [(2, 10), (6, 28)]),
paul@77	556	("shells", (1, 0), None),
paul@77	557	("shells", (37, 0), None)
paul@77	558	]
paul@77	559
paul@77	560	phrase_tests2 = [
paul@77	561	(["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),
paul@77	562	(["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),
paul@77	563	(["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])
paul@77	564	]
paul@77	565
paul@77	566	index = Index("test_indexT", 3, 2, 3, 6)
paul@77	567	wi = index.get_writer()
paul@77	568	for docnum, text in docs2:
paul@77	569	doc = Document(docnum)
paul@77	570	offset = 0
paul@77	571	for position, term in enumerate(text.split()):
paul@77	572	doc.add_position(term, (position, offset))
paul@77	573	offset += len(term) + 1 # assume one space after the term
paul@77	574	doc.add_field(123, text)
paul@77	575	wi.add_document(doc)
paul@77	576	wi.close()
paul@77	577
paul@77	578	rd = index.get_reader()
paul@77	579
paul@77	580	print "- (Test searching.)"
paul@77	581
paul@77	582	for term, frequency, doc_positions in doc_tests2:
paul@77	583	dp = list(rd.find_positions(term))
paul@77	584	print doc_positions == dp, doc_positions, dp
paul@77	585	fr = rd.get_frequency(term)
paul@77	586	print frequency == fr, frequency, fr
paul@77	587
paul@77	588	print "- (Test fields.)"
paul@77	589
paul@77	590	for docnum, text in docs2:
paul@77	591	df = dict(rd.get_fields(docnum))
paul@77	592	print df[123] == text, text, df[123]
paul@77	593
paul@77	594	print "- (Test navigation.)"
paul@77	595
paul@77	596	for term, docnum, positions in position_tests2:
paul@77	597	dp = rd.find_positions(term)
paul@77	598	pos = dp.from_document(docnum)
paul@77	599	print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
paul@77	600
paul@77	601	print "- (Test phrases.)"
paul@77	602
paul@77	603	for terms, results in phrase_tests2:
paul@77	604	res = list(rd.find_common_positions(terms))
paul@77	605	print results == res, results, res
paul@77	606
paul@77	607	index.close()
paul@77	608
paul@69	609	print "- Test index updates."
paul@58	610
paul@58	611	index = Index("test_index")
paul@64	612	index2 = Index("test_index2", 3, 2, 3, 6)
paul@64	613	wi = index2.get_writer()
paul@58	614	for docnum, text in docs:
paul@58	615
paul@58	616	# Add the same documents but with different numbers.
paul@58	617
paul@58	618	doc = Document(docnum + 100)
paul@58	619	for position, term in enumerate(text.split()):
paul@58	620	doc.add_position(term, position)
paul@58	621	doc.add_field(123, text)
paul@58	622	wi.add_document(doc)
paul@58	623	wi.close()
paul@58	624
paul@58	625	index2.update([index])
paul@58	626	index.close()
paul@58	627
paul@58	628	rd = index2.get_reader()
paul@58	629	for term, frequency, doc_positions in doc_tests:
paul@58	630
paul@58	631	# Add the extra documents to the expected result.
paul@58	632
paul@59	633	orig_doc_positions = doc_positions
paul@59	634	doc_positions = doc_positions[:]
paul@59	635
paul@59	636	for docnum, positions in orig_doc_positions:
paul@58	637	doc_positions.append((docnum + 100, positions))
paul@58	638	frequency *= 2
paul@58	639
paul@58	640	dp = list(rd.find_positions(term))
paul@58	641	print doc_positions == dp, doc_positions, dp
paul@58	642	fr = rd.get_frequency(term)
paul@58	643	print frequency == fr, frequency, fr
paul@58	644	index2.close()
paul@58	645
paul@69	646	print "- (Test update of an empty index.)"
paul@59	647
paul@59	648	index = Index("test_index")
paul@59	649	index3 = Index("test_index3")
paul@59	650	index3.update([index])
paul@59	651	index.close()
paul@59	652
paul@59	653	rd = index3.get_reader()
paul@59	654	for term, frequency, doc_positions in doc_tests:
paul@59	655	dp = list(rd.find_positions(term))
paul@59	656	print doc_positions == dp, doc_positions, dp
paul@59	657	fr = rd.get_frequency(term)
paul@59	658	print frequency == fr, frequency, fr
paul@59	659	index3.close()
paul@59	660
paul@0	661	# vim: tabstop=4 expandtab shiftwidth=4