#!/usr/bin/env python

"""
Unicode objects.

Copyright (C) 2015, 2016, 2017, 2021 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from __builtins__.str import basestring
from __builtins__.types import check_int
from posix.iconv import Converter
from native import str_add, unicode_len, unicode_ord, unicode_substr, \
                   isinstance as _isinstance

class unicode(basestring):

    "A character string representation based on UTF-8."

    def __init__(self, s, encoding=None, original=None):

        """
        Initialise the string from 's', employing any indicated 'encoding'
        for the provided string data.

        If 'original' is indicated, this may be used to override the original
        encoding. This is useful when the string data is already in UTF-8
        format, but where the original encoding needs to be communicated.

        Where 's' is already a Unicode object, both 'encoding' and 'original'
        are ignored and the encoding recorded by 's' is adopted instead.
        """

        # The length in characters is computed on demand by __len__ and then
        # retained here.

        self.length = None

        # Initialise using another Unicode object.

        if _isinstance(s, unicode):
            get_using(basestring.__init__, self)(s)
            self.encoding = s.encoding

        # Initialise using suitable string data but with an explicit original
        # encoding. No conversion or validation is performed in this case.

        elif original:
            get_using(basestring.__init__, self)(s)
            self.encoding = original

        # Initialise using string data having either UTF-8 or another encoding,
        # converting to UTF-8 and retaining the encoding details as the original
        # encoding.

        else:
            # Obtain a string representation.

            s = s.__str__()

            # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
            # needs to be validated.

            to_utf8 = Converter(encoding or "UTF-8", "UTF-8")

            # Always release the converter, even if feeding the data fails.

            try:
                to_utf8.feed(s)
                get_using(basestring.__init__, self)(str(to_utf8))
            finally:
                to_utf8.close()

            # Record the encoding as given, which may be None.

            self.encoding = encoding

    def _binary_op(self, op, other, sizes=False):

        """
        Perform 'op' on this object and 'other' if appropriate.

        Return NotImplemented if 'other' is not a string. If 'other' is a
        string but not a Unicode object, the encoded form of this object (see
        the encode method) is used as the first operand. With 'sizes' set, the
        size of each operand is also passed to 'op' after the data of both
        operands.
        """

        # Reject non-strings.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Combining text with bytes.

        if not _isinstance(other, unicode):
            s = self.encode()
        else:
            s = self

        if sizes:
            return op(s.__data__, other.__data__, s.__size__, other.__size__)
        else:
            return op(s.__data__, other.__data__)

    def _binary_op_rev(self, op, other, sizes=False):

        """
        Perform 'op' on 'other' and this object if appropriate.

        This mirrors _binary_op but supplies 'other' as the first operand and
        this object (or its encoded form) as the second.
        """

        # Reject non-strings.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Combining text with bytes.

        if not _isinstance(other, unicode):
            s = self.encode()
        else:
            s = self

        if sizes:
            return op(other.__data__, s.__data__, other.__size__, s.__size__)
        else:
            return op(other.__data__, s.__data__)

    def _convert(self, result, other):

        """
        Convert 'result' to a Unicode object if 'other' already is.

        The new object is given this object's encoding as its original
        encoding. Otherwise, 'result' is returned unchanged.
        """

        if _isinstance(other, unicode):
            return unicode(result, None, self.encoding)
        else:
            return result

    def _quote_value(self, b, n):

        """
        Append to 'b' the quoted form of 'n'.

        Values above 0xffff are written as "\\U" followed by eight digits, all
        other values as "\\u" followed by four digits, with zero padding on the
        left as required.
        """

        # Negative values are shifted up by 256 (presumably to reinterpret a
        # signed byte value as unsigned - confirm against callers).

        if n < 0:
            n += 256

        if n > 0xffff:
            b.append("\\U")
            digits = 8
        else:
            b.append("\\u")
            digits = 4

        # NOTE(review): hex is called with a second argument here, presumably
        # an empty prefix replacing the usual "0x" - confirm against the hex
        # implementation provided by the toolchain.

        x = hex(n, "")
        i = len(x)

        # Pad with leading zeros up to the required number of digits.

        while i < digits:
            b.append("0")
            i += 1

        b.append(x)

    # Operator methods.

    def __iadd__(self, other):

        """
        Return a string combining this string with 'other'.

        The result is a Unicode object only if 'other' is one (see _convert).
        """

        return self._convert(self._binary_op(str_add, other, True), other)

    # Addition produces a new object in the same way as in-place addition.

    __add__ = __iadd__

    def __radd__(self, other):

        """
        Return a string combining 'other' with this string, 'other' being the
        first operand.

        The result is a Unicode object only if 'other' is one (see _convert).
        """

        return self._convert(self._binary_op_rev(str_add, other, True), other)

    def __len__(self):

        """
        Return the length of this string in characters.

        The length is obtained using unicode_len upon first request and is
        retained in the 'length' attribute for subsequent requests.
        """

        if self.length is None:
            self.length = unicode_len(self.__data__, self.__size__)

        return self.length

    def __ord__(self):

        """
        Return the value of the string, if only a single character.

        Raise ValueError, supplying this string, if the string does not have a
        length of exactly one character.
        """

        if self.__len__() == 1:
            return unicode_ord(self.__data__, self.__size__)
        else:
            # Use the call form of raise for consistency with
            # __get_multiple_items__.

            raise ValueError(self)

    def encode(self, encoding=None):

        """
        Encode the string to the given 'encoding' or any original encoding if
        omitted.

        If neither an 'encoding' nor an original encoding is available, this
        object itself is returned. Otherwise, the string form of the converted
        data is returned.
        """

        encoding = encoding or self.encoding
        if not encoding:
            return self

        from_utf8 = Converter("UTF-8", encoding)

        # Always release the converter, even if the conversion fails.

        try:
            from_utf8.feed(self)
            return str(from_utf8)

        finally:
            from_utf8.close()

    def join(self, l):

        """
        Join the elements in 'l' with this string.

        The result is a Unicode object, employing this string's original
        encoding, only if this string has such an encoding and no element of
        'l' is a Unicode object. Otherwise, the plain string form of the joined
        data is returned.
        """

        # Empty strings just cause the list elements to be concatenated.

        nonempty = self.__bool__()

        # Non-empty strings join the elements together in a buffer.

        b = buffer()
        first = True
        encoding = self.encoding

        for s in l:

            # Insert the separator before every element except the first.

            if first:
                first = False
            elif nonempty:
                b.append(self)

            # NOTE(review): the encoding is discarded when an element *is* a
            # Unicode object, whereas _convert produces Unicode results when
            # the other operand is a Unicode object. This condition may be
            # inverted - confirm the intended behaviour before changing it.

            if _isinstance(s, unicode):
                encoding = None

            b.append(s)

        s = str(b)
        if encoding:
            s = unicode(s, None, encoding)
        return s

    # Special implementation methods.

    def __get_single_item__(self, index):

        """
        Return the item at the normalised (positive) 'index'.

        The item is a Unicode object holding the single character at 'index'
        and having this string's original encoding.
        """

        self._check_index(index)
        return unicode(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), None, self.encoding)

    def __get_multiple_items__(self, start, end, step):

        """
        Return items from 'start' until (but excluding) 'end', at 'step'
        intervals.

        Raise ValueError, supplying 'step', if 'step' is zero and the range is
        not empty.
        """

        # NOTE(review): an empty range yields a plain empty string, not a
        # Unicode object, and is returned before 'step' is validated - confirm
        # that both are intended.

        if start == end:
            return ""

        check_int(step)

        if step == 0:
            raise ValueError(step)

        # Obtain the individual items and combine them into a single string
        # having this string's original encoding.

        l = get_using(basestring.__get_multiple_items__, self)(start, end, step)
        return unicode("".join(l), None, self.encoding)

# vim: tabstop=4 expandtab shiftwidth=4