ImprovedMoinSearch (annotate ImprovedMoinSearch.py in b9d34861708c)

ImprovedMoinSearch

Annotated ImprovedMoinSearch.py

5:b9d34861708c

2010-09-26

Paul Boddie

Added control over which parts of page names will be shown in search results.

# -*- coding: iso-8859-1 -*-
paul@0	2	"""
paul@0	3	MoinMoin - ImprovedMoinSearch library
paul@0	4
paul@0	5	@copyright: 2010 Paul Boddie <paul@boddie.org.uk>
paul@0	6	@license: GNU GPL (v2 or later), see COPYING.txt for details.
paul@0	7	"""
paul@0	8
paul@0	9	from MoinMoin.search import searchPages
paul@0	10	from MoinMoin.Page import Page
paul@1	11	from MoinMoin import wikiutil
paul@0	12	import re
paul@0	13
# A heading is a complete line which starts and ends with the same number of
# "=" characters. The "level" group holds the leading "=" characters, whereas
# the "heading" group holds the text in between, including any surrounding
# whitespace.

heading_regexp = re.compile(r"^(?P<level>=+)(?P<heading>.*?)(?P=level)$", re.UNICODE | re.MULTILINE)

# A paragraph is a run of consecutive lines, each terminated by a newline, where
# no line starts with "#", "=" or whitespace. Since every line must end with a
# newline, a final line without one is not considered part of a paragraph.

paragraph_regexp = re.compile(r"(?P<paragraph>(?:^[^#=\s].*$\n)+)", re.UNICODE | re.MULTILINE)
paul@3	16
def range_groups(min_name, max_name):

    """
    Return a regular expression fragment matching an optional integer and an
    optional second integer separated from the first by " - " (a dash having
    exactly one whitespace character on each side). The integers may be
    negative and are captured by groups having the given 'min_name' and
    'max_name' respectively.
    """

    lower = r"(?P<%s>-?\d+)?" % min_name
    upper = r"(?:\s-\s(?P<%s>-?\d+))?" % max_name
    return lower + upper

# Each format element is a keyword optionally followed by numbers:
#
#   heading (or title, h) with an optional range of heading levels
#   name (or page) with an optional range of page name parts
#   paragraph (or para, p) with an optional paragraph number
#
# The name alternative must be tested before the paragraph alternative since
# the "p" abbreviation would otherwise match the start of "page".

format_options_regexp = re.compile(
    r"("
    r"(?P<heading>(heading|title|h)\s*" + range_groups("min_heading", "max_heading") + r")"
    r"|(?P<name>(name|page)\s*" + range_groups("first", "last") + r")"
    r"|(?P<paragraph>(paragraph|para|p)\s*(?P<paragraph_number>\d+)?)"
    r")", re.UNICODE)
paul@0	25
def convert_index(i, length):

    """
    Translate the 1-based index 'i' into a 0-based index for a sequence having
    the given 'length'. Positive indexes are reduced by one, negative indexes
    are counted back from the end of the sequence, and any other value
    (including None and zero) is returned unchanged.
    """

    if i is not None:
        if i > 0:
            return i - 1
        if i < 0:
            return length + i

    return i
paul@5	41
def getSearchResultPages(request, query, **kw):

    """
    Search for pages matching 'query' using the given 'request', passing any
    additional keyword arguments on to the search function, and return the hits
    provided by the search results.
    """

    return searchPages(request, query, **kw).hits
paul@0	51
def getFirstPageHeading(request, page, start=0, min_level=None, max_level=None):

    """
    Using the given 'request', return the first heading in the given 'page'
    from the given 'start' point (optional, defaulting to the start of the page)
    having a heading level of at least 'min_level' (which is undefined if not
    specified) and at most 'max_level' (which is undefined if not specified).

    The heading level is the number of "=" characters introducing the heading.

    A tuple containing the heading and the span (the start offset and the end
    offset as a tuple) is returned for a successful retrieval. Otherwise, None
    is returned. The offsets in the span refer to the complete page body, not
    to the portion following 'start', so that the end offset can be used to
    position a subsequent search.
    """

    # Obtain the raw page text via a Page object made from the page's name.

    full_page = Page(request, page.page_name)
    body = full_page.get_raw_body()
    if start != 0:
        body = body[start:]

    for match in heading_regexp.finditer(body):
        level = len(match.group("level"))

        if (min_level is None or level >= min_level) and \
            (max_level is None or level <= max_level):

            # The match offsets are relative to the sliced body and must be
            # shifted to refer to the complete body.

            match_start, match_end = match.span()
            return match.group("heading"), (start + match_start, start + match_end)

    return None
paul@0	79
def getParagraph(request, page, start=0, number=None):

    """
    Using the given 'request', return from the given 'page', starting from the
    optional 'start' offset (or the beginning, if no such offset is specified),
    the first paragraph or, if the optional 'number' is given, the paragraph
    whose position corresponds to that number, with a number of 1 being the
    first paragraph found, 2 being the second, and so on. A number less than 1
    also selects the first paragraph.

    A tuple containing the paragraph text and the span (the start offset and
    the end offset as a tuple) is returned if a suitable paragraph is found.
    Otherwise, None is returned. The offsets in the span refer to the complete
    page body, not to the portion following 'start', so that the end offset can
    be used to position a subsequent search.
    """

    # Obtain the raw page text via a Page object made from the page's name.

    full_page = Page(request, page.page_name)
    body = full_page.get_raw_body()
    if start != 0:
        body = body[start:]

    for i, match in enumerate(paragraph_regexp.finditer(body)):
        if number is None or i == max(0, number - 1):

            # The match offsets are relative to the sliced body and must be
            # shifted to refer to the complete body.

            match_start, match_end = match.span()
            return match.group("paragraph"), (start + match_start, start + match_end)

    return None
paul@3	100
def getPageName(request, page, start=0, first=None, last=None):

    """
    Using the given 'request', return the name of the given 'page'. The optional
    'start' offset refers to the body of the page and is returned as the start
    and end of the result span if specified.

    If the optional 'first' or 'last' parameters are specified, only the
    specified span of parts extracted from the page name will be returned, where
    the parts of the name are obtained by splitting the full name where the
    slash ("/") character is found. The first part has an index of 1, and the
    last part can be referred to using an index of -1.

    Indexes referring to positions outside the name are limited to the parts
    actually present: a negative 'first' reaching back beyond the first part
    selects from the first part, whereas a negative 'last' reaching back beyond
    the first part selects nothing.

    A tuple containing the (possibly reduced) name and the span is returned.
    The 'request' is not used, but is accepted so that this function can be
    called in the same way as the other retrieval functions.
    """

    parts = page.page_name.split("/")
    length = len(parts)

    first = convert_index(first, length)
    last = convert_index(last, length)

    # A converted index can still be negative if it refers to a position
    # before the first part. Such values must not reach the slice since they
    # would be interpreted as being relative to the end of the sequence.

    if first is None:
        begin = 0
    else:
        begin = max(first, 0)

    if last is None:
        end = length
    else:
        end = max(last + 1, 0)

    return "/".join(parts[begin:end]), (start, start)
paul@3	132
def formatResultPages(request, formatter, pages, paging, format, page_from=0):

    """
    Using the given 'request' and 'formatter', return a formatted string showing
    the result 'pages', providing paging controls when 'paging' is set to a true
    value, and providing page details according to the given 'format'.

    The 'format' is a string containing elements recognised by the format
    options regular expression (headings, paragraphs and page names), each
    contributing a piece of text for every result page in the order given. If
    'format' is empty or contains no recognised elements, the page name is
    shown.

    If the optional 'page_from' parameter is set, the result pages from the
    given result (specified within a range from 0 to the length of the 'pages'
    collection) will be shown.
    """

    # Each action is a retrieval function together with its extra arguments.

    actions = []

    if format:
        for match in format_options_regexp.finditer(format):
            if match.group("heading"):
                function, names = getFirstPageHeading, ("min_heading", "max_heading")
            elif match.group("paragraph"):
                function, names = getParagraph, ("paragraph_number",)
            elif match.group("name"):
                function, names = getPageName, ("first", "last")
            else:
                continue

            actions.append((function, [int_or_none(match.group(name)) for name in names]))

    # Without any usable format elements, just show the page name instead of
    # producing empty list items.

    if not actions:
        actions.append((getPageName, ()))

    # Use paging only when there are enough results.

    results_per_page = request.cfg.search_results_per_page
    paging = paging and len(pages) > results_per_page

    if paging:
        pages_to_show = pages[page_from:page_from + results_per_page]
    else:
        pages_to_show = pages

    # Prepare the output.

    output = []
    output.append(formatter.number_list(on=1, start=page_from + 1))

    for page in pages_to_show:
        output.append(formatter.listitem(on=1))

        start = 0
        first = True

        for action, args in actions:
            result = action(request, page, start, *args)

            if result is None:
                continue

            text, span = result

            # The first text actually shown provides the link to the page, even
            # if earlier actions found nothing. Later texts follow separated by
            # spaces.

            if first:
                output.append(formatter.pagelink(on=1, pagename=page.page_name))
                output.append(formatter.text(text))
                output.append(formatter.pagelink(on=0))
                first = False
            else:
                output.append(" ")
                output.append(formatter.text(text))

            # Position the search for the next action.

            _start, _end = span
            start = _end + 1

        output.append(formatter.listitem(on=0))

    output.append(formatter.number_list(on=0))

    # Show paging navigation.

    if paging:
        output.append(formatPagingNavigation(request, formatter, pages, page_from))

    return "".join(output)
paul@0	210
def formatPagingNavigation(request, formatter, pages, page_from=0):

    """
    Using the given 'request' and 'formatter', return a formatted string showing
    the paging navigation for the result 'pages', according to the 'page_from'
    indicator which provides the current position in the result set.

    The navigation consists of a paragraph containing the numbers of the
    available pages of results, starting at 1, where every number apart from
    that of the current page is a link to the formatter's page having a query
    string whose "from" value is the position of the first result shown on that
    page of results.
    """

    page = formatter.page
    pagename = page.page_name
    _ = request.getText

    results_per_page = request.cfg.search_results_per_page
    number_of_results = len(pages)

    # Use zero-based numbering of the pages of results. Floor division is
    # requested explicitly, and the last page is found from the position of the
    # last result so that a result count which is an exact multiple of the page
    # size does not produce a link to an empty page.

    current_page = page_from // results_per_page
    last_page = (number_of_results - 1) // results_per_page

    querydict = wikiutil.parseQueryString(request.query_string)

    def page_link(n):

        "Return the output fragments linking to the page of results numbered 'n'."

        return [
            formatter.pagelink(on=1, pagename=pagename, querystr=getPagingQueryString(querydict, n * results_per_page)),
            formatter.text(str(n + 1)),
            formatter.pagelink(on=0),
            formatter.text(" "),
            ]

    output = []

    output.append(formatter.paragraph(on=1))
    output.append(formatter.text(_("Result pages:")))
    output.append(formatter.text(" "))

    # Links to the pages before the current page.

    for n in range(0, current_page):
        output.extend(page_link(n))

    # The current page is shown without a link.

    output.append(formatter.text(str(current_page + 1)))
    output.append(formatter.text(" "))

    # Links to the pages after the current page.

    for n in range(current_page + 1, last_page + 1):
        output.extend(page_link(n))

    output.append(formatter.paragraph(on=0))

    return "".join(output)
paul@1	260
def getPagingQueryString(querydict, page_from):

    """
    Return a query string made from the given 'querydict' after setting its
    "from" entry to 'page_from'. Note that 'querydict' itself is modified.
    """

    querydict["from"] = page_from
    query_string = wikiutil.makeQueryString(querydict)
    return query_string
paul@1	264
def int_or_none(x):

    """
    Return 'x' converted to an integer unless it is None, in which case None is
    returned.
    """

    if x is not None:
        return int(x)

    return None
paul@0	270
paul@0	271	# vim: tabstop=4 expandtab shiftwidth=4