Lichen (annotate lib/__builtins__/unicode.py in 786ddedfee67)

Lichen

Annotated lib/__builtins__/unicode.py

803:786ddedfee67

2017-04-04

Paul Boddie

Fixed default population where parameters are known with zero defaults.

paul@390	1	#!/usr/bin/env python
paul@390	2
paul@390	3	"""
paul@390	4	Unicode objects.
paul@390	5
paul@519	6	Copyright (C) 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>
paul@390	7
paul@390	8	This program is free software; you can redistribute it and/or modify it under
paul@390	9	the terms of the GNU General Public License as published by the Free Software
paul@390	10	Foundation; either version 3 of the License, or (at your option) any later
paul@390	11	version.
paul@390	12
paul@390	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@390	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@390	15	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
paul@390	16	details.
paul@390	17
paul@390	18	You should have received a copy of the GNU General Public License along with
paul@390	19	this program. If not, see <http://www.gnu.org/licenses/>.
paul@390	20	"""
paul@390	21
paul@390	22	from __builtins__.str import basestring
paul@431	23	from __builtins__.types import check_int
paul@390	24	from posix.iconv import Converter
paul@534	25	from native import str_add, unicode_len, unicode_ord, unicode_substr, \
paul@431	26	isinstance as _isinstance
paul@390	27
class utf8string(basestring):

    """
    A character string representation based on UTF-8.

    Instances record any original 'encoding' of the text and lazily cache the
    character length in 'length' (None until first computed by __len__).
    """

    def __init__(self, other=None, encoding=None):

        """
        Initialise the string, perhaps from 'other', with any original
        'encoding' indicated.
        """

        get_using(basestring.__init__, self)(other)
        self.encoding = encoding

        # The character length is computed on demand by __len__.

        self.length = None

    def _binary_op(self, op, other, sizes=False):

        """
        Perform 'op' on this object and 'other' if appropriate, returning
        NotImplemented if 'other' is not a string.

        If 'sizes' is set, the sizes of both operands are passed to 'op' after
        the data of both operands.
        """

        # Reject non-strings.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Combining text with bytes: use the encoded form of this string where
        # 'other' is not a Unicode object.

        if not _isinstance(other, utf8string):
            s = self.encode()
        else:
            s = self

        if sizes:
            return op(s.__data__, other.__data__, s.__size__, other.__size__)
        else:
            return op(s.__data__, other.__data__)

    def _binary_op_rev(self, op, other, sizes=False):

        """
        Perform 'op' on 'other' and this object if appropriate, returning
        NotImplemented if 'other' is not a string.

        If 'sizes' is set, the sizes of both operands are passed to 'op' after
        the data of both operands.
        """

        # Reject non-strings.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Combining text with bytes: use the encoded form of this string where
        # 'other' is not a Unicode object.

        if not _isinstance(other, utf8string):
            s = self.encode()
        else:
            s = self

        if sizes:
            return op(other.__data__, s.__data__, other.__size__, s.__size__)
        else:
            return op(other.__data__, s.__data__)

    def _convert(self, result, other):

        """
        Convert 'result' to a Unicode object, employing this string's encoding,
        if 'other' already is a Unicode object. Otherwise, return 'result'
        unchanged.
        """

        if _isinstance(other, utf8string):
            return utf8string(result, self.encoding)
        else:
            return result

    def _quote_value(self, b, n):

        """
        Append to 'b' the quoted form of 'n': a \\U escape with eight digits
        for values above 0xffff, or a \\u escape with four digits otherwise.
        """

        # Negative values are shifted up by 256.
        # NOTE(review): presumably these are signed byte values - confirm
        # NOTE(review): against the callers.

        if n < 0:
            n += 256

        if n > 0xffff:
            b.append("\\U")
            digits = 8
        else:
            b.append("\\u")
            digits = 4

        # NOTE(review): the second argument presumably selects an empty prefix
        # NOTE(review): so that only digits are produced - confirm against the
        # NOTE(review): definition of hex.

        x = hex(n, "")
        i = len(x)

        # Pad with leading zeros to the required number of digits.

        while i < digits:
            b.append("0")
            i += 1

        b.append(x)

    # Operator methods.

    def __iadd__(self, other):

        "Return a string combining this string with 'other'."

        return self._convert(self._binary_op(str_add, other, True), other)

    __add__ = __iadd__

    def __radd__(self, other):

        "Return a string combining 'other' with this string."

        return self._convert(self._binary_op_rev(str_add, other, True), other)

    def __len__(self):

        "Return the length of this string in characters."

        # Compute the length only once, retaining it for subsequent requests.

        if self.length is None:
            self.length = unicode_len(self.__data__, self.__size__)

        return self.length

    def __ord__(self):

        """
        Return the value of the string, if only a single character. Otherwise,
        raise ValueError with this string as its argument.
        """

        if self.__len__() == 1:
            return unicode_ord(self.__data__, self.__size__)
        else:
            raise ValueError(self)

    def encode(self, encoding=None):

        """
        Encode the string to the given 'encoding' or any original encoding if
        omitted. Where no encoding is available, this string is returned
        unchanged.
        """

        encoding = encoding or self.encoding
        if not encoding:
            return self

        from_utf8 = Converter("UTF-8", encoding)

        # Always close the converter, even if the conversion fails.

        try:
            from_utf8.feed(self)
            return str(from_utf8)

        finally:
            from_utf8.close()

    def join(self, l):

        "Join the elements in 'l' with this string."

        # Empty strings just cause the list elements to be concatenated.

        nonempty = self.__bool__()

        # Non-empty strings join the elements together in a buffer.

        b = buffer()
        first = True
        encoding = self.encoding

        for s in l:
            if first:
                first = False
            elif nonempty:
                b.append(self)

            # Any Unicode element causes the original encoding to be discarded.

            if _isinstance(s, utf8string):
                encoding = None

            b.append(s)

        # Only wrap the result as a Unicode object where an encoding remains.
        # NOTE(review): without an encoding, the result of str(b) is returned
        # NOTE(review): unwrapped, even where Unicode elements were joined -
        # NOTE(review): confirm that this is intended.

        s = str(b)
        if encoding:
            s = utf8string(s, encoding)
        return s

    # Special implementation methods.

    def __get_single_item__(self, index):

        """
        Return the item at the normalised (positive) 'index' as a Unicode
        object retaining this string's encoding.
        """

        self._check_index(index)
        return utf8string(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), self.encoding)

    def __get_multiple_items__(self, start, end, step):

        """
        Return items from 'start' until (but excluding) 'end', at 'step'
        intervals, as a Unicode object retaining this string's encoding.

        A 'step' of zero causes ValueError to be raised.
        """

        # Empty selections still yield Unicode objects, consistent with all
        # other results from indexing and slicing operations.

        if start == end:
            return utf8string("", self.encoding)

        check_int(step)

        if step == 0:
            raise ValueError(step)

        l = get_using(basestring.__get_multiple_items__, self)(start, end, step)
        return utf8string("".join(l), self.encoding)
paul@431	228
def unicode(s, encoding):

    """
    Return 's' as a Unicode object, with the text of 's' being interpreted as
    employing the given 'encoding'. Existing Unicode objects are returned
    unchanged.
    """

    if not isinstance(s, utf8string):

        # Work from the string representation of the object.

        text = s.__str__()

        # Even text stated to be UTF-8 is passed through the converter so
        # that it is validated.

        to_utf8 = Converter(encoding, "UTF-8")

        # The converter is closed regardless of the conversion outcome.

        try:
            to_utf8.feed(text)
            return utf8string(str(to_utf8), encoding)

        finally:
            to_utf8.close()

    return s
paul@390	251
paul@390	252	# vim: tabstop=4 expandtab shiftwidth=4

Lichen

Annotated lib/__builtins__/unicode.py

Annotated lib/__builtins__/unicode.py