imip-agent (annotate imiptools/text.py in 2fffc03fa3ef)

imip-agent

Annotated imiptools/text.py

1174:2fffc03fa3ef

2016-05-12

Paul Boddie

Introduced line length configuration for more convenient testing of output. Moved tabular file parsing to the text module for potential use by the tools.

paul@1031	1	#!/usr/bin/env python
paul@1031	2
paul@1031	3	"""
paul@1031	4	Parsing of textual content.
paul@1031	5
paul@1174	6	Copyright (C) 2014, 2015, 2016 Paul Boddie <paul@boddie.org.uk>
paul@1031	7
paul@1031	8	This program is free software; you can redistribute it and/or modify it under
paul@1031	9	the terms of the GNU General Public License as published by the Free Software
paul@1031	10	Foundation; either version 3 of the License, or (at your option) any later
paul@1031	11	version.
paul@1031	12
paul@1031	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@1031	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@1031	15	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
paul@1031	16	details.
paul@1031	17
paul@1031	18	You should have received a copy of the GNU General Public License along with
paul@1031	19	this program. If not, see <http://www.gnu.org/licenses/>.
paul@1031	20	"""
paul@1031	21
paul@1174	22	import codecs
paul@1031	23	import re
paul@1031	24
paul@1031	25	# Parsing of lines to obtain functions and arguments.
paul@1031	26
# A "part" is one or more adjacent components followed by whitespace or the end
# of the text. The alternatives must be joined with unescaped "|" characters:
# an escaped bar would only match a literal pipe in the input.

line_pattern_str = (
    r"(?:"
    r"(?:'(.*?)')"      # group 1: single-quoted component, quotes removed
    r"|"
    r'(?:"(.*?)")'      # group 2: double-quoted component, quotes removed
    r"|"
    r"([^\s]+)"         # group 3: unquoted run of non-whitespace characters
    r")+"
    r"(?:\s+|$)"        # separator: whitespace or end of text
)
line_pattern = re.compile(line_pattern_str)
paul@1031	36
def parse_line(text):

    """
    Parse the given 'text', returning a list of words separated by whitespace in
    the input, where whitespace may occur inside words if quoted using single or
    double quotes.

    Each match of 'line_pattern' yields one word, formed by concatenating the
    pattern's capture groups in group order with unmatched groups treated as
    empty strings. Surrounding quotes are not part of the captured groups and
    so do not appear in the result.
    """

    # Joining the groups is equivalent to folding them with string
    # concatenation, but does not depend on the reduce builtin, which is
    # unavailable in Python 3.

    return [
        "".join(group or "" for group in match.groups())
        for match in line_pattern.finditer(text)
    ]
paul@1031	56
paul@1174	57	# Parsing of tabular files.
paul@1174	58
def set_defaults(t, empty_defaults):

    """
    In the list 't', replace values that are empty or absent with defaults
    provided by the 'empty_defaults' collection whose entries are of the form
    (index, value).

    The list is modified in place and is also returned. Where an index lies
    beyond the end of the list, the list is first padded with None values. Any
    false value (such as None or the empty string) is regarded as empty.

    If 'empty_defaults' is None or empty, 't' is returned unchanged.
    """

    for i, default in empty_defaults or ():

        # Pad the list so that an absent column can be given its default.

        if i >= len(t):
            t += [None] * (i - len(t) + 1)

        if not t[i]:
            t[i] = default

    return t
paul@1174	73
def get_table(filename, empty_defaults=None, tab_separated=True):

    """
    Read the file with the given 'filename', decoding it as UTF-8, and return
    its contents as a list of tuples, one tuple per line.

    Where given, 'empty_defaults' is a list of (index, value) tuples, each
    supplying the value to use when the column at that index is missing from a
    line or holds an empty value.

    Lines are split at tab characters unless 'tab_separated' is given as a
    false value, in which case the imiptools.text.parse_line function is used
    to divide each line into columns instead.
    """

    # The stream is closed on leaving the block, whether or not reading fails.

    with codecs.open(filename, "rb", encoding="utf-8") as stream:
        return get_table_from_stream(stream, empty_defaults, tab_separated)
paul@1174	94
def get_table_from_stream(f, empty_defaults=None, tab_separated=True):

    """
    Return a list of tuples representing the contents of the stream 'f', with
    one tuple for each line obtained by iterating over 'f'. Any iterable
    yielding lines of text is therefore acceptable.

    The 'empty_defaults' is a list of (index, value) tuples indicating the
    default value where a column either does not exist or provides an empty
    value.

    If 'tab_separated' is specified and is a false value, line parsing using
    the imiptools.text.parse_line function will be performed instead of
    splitting each line of the file using tab characters as separators.
    """

    rows = []

    # Iterate over the stream directly instead of building a list of all lines
    # beforehand.

    for line in f:

        # Remove only spaces and line terminators: tabs are retained so that
        # empty columns at either end of a line survive the split below.

        line = line.strip(" \r\n")

        if tab_separated:
            columns = line.split("\t")
        else:
            columns = parse_line(line)

        if empty_defaults:
            columns = set_defaults(columns, empty_defaults)

        rows.append(tuple(columns))

    return rows
paul@1174	124
paul@1031	125	# vim: tabstop=4 expandtab shiftwidth=4