#!/usr/bin/env python

"""
Unicode objects.

Copyright (C) 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from __builtins__.str import basestring
from __builtins__.types import check_int
from posix.iconv import Converter
from native import str_add, unicode_len, unicode_ord, unicode_substr, \
                   isinstance as _isinstance

# NOTE(review): get_using, buffer (with an append method) and the two-argument
# form of hex are used below without being imported here; presumably they are
# provided by the target environment - confirm.

class utf8string(basestring):

    "A character string representation based on UTF-8."

    def __init__(self, other=None, encoding=None):

        """
        Initialise the string, perhaps from 'other', with any original
        'encoding' indicated.

        The 'encoding' is retained so that 'encode' can later convert the
        string back when no explicit encoding is given.
        """

        get_using(basestring.__init__, self)(other)
        self.encoding = encoding

        # The character length is computed on demand and cached by __len__.

        self.length = None

    def _binary_op(self, op, other, sizes=False):

        """
        Perform 'op' on this object and 'other' if appropriate.

        NotImplemented is returned if 'other' is not a string. If 'other' is
        a string but not a Unicode object, this object is encoded (see
        'encode') before being combined with it. If 'sizes' is true, the sizes
        of both operands are passed to 'op' after their data.
        """

        # Reject non-strings.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Combining text with bytes.

        if not _isinstance(other, utf8string):
            s = self.encode()
        else:
            s = self

        if sizes:
            return op(s.__data__, other.__data__, s.__size__, other.__size__)
        else:
            return op(s.__data__, other.__data__)

    def _binary_op_rev(self, op, other, sizes=False):

        """
        Perform 'op' on 'other' and this object if appropriate.

        This mirrors '_binary_op' but supplies the data (and, if 'sizes' is
        true, the size) of 'other' before that of this object.
        """

        # Reject non-strings.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Combining text with bytes.

        if not _isinstance(other, utf8string):
            s = self.encode()
        else:
            s = self

        if sizes:
            return op(other.__data__, s.__data__, other.__size__, s.__size__)
        else:
            return op(other.__data__, s.__data__)

    def _convert(self, result, other):

        """
        Convert 'result' to a Unicode object if 'other' already is.

        The new object is given this object's encoding. Otherwise, 'result' is
        returned unchanged.
        """

        if _isinstance(other, utf8string):
            return utf8string(result, self.encoding)
        else:
            return result

    def _quote_value(self, b, n):

        """
        Append to 'b' the quoted form of 'n'.

        Values above 0xffff are written as "\\U" followed by eight digits; all
        other values are written as "\\u" followed by four digits.
        """

        # Negative values are shifted up by 256 (presumably these are signed
        # byte values - confirm against callers).

        if n < 0:
            n += 256

        if n > 0xffff:
            b.append("\\U")
            digits = 8
        else:
            b.append("\\u")
            digits = 4

        # NOTE(review): the empty second argument presumably requests no
        # prefix on the digits - confirm against the definition of hex.

        x = hex(n, "")
        i = len(x)

        # Pad with leading zeros up to the required number of digits.

        while i < digits:
            b.append("0")
            i += 1

        b.append(x)

    # Operator methods.

    def __iadd__(self, other):

        """
        Return a string combining this string with 'other'.

        The result is a Unicode object only if 'other' is also one.
        """

        return self._convert(self._binary_op(str_add, other, True), other)

    __add__ = __iadd__

    def __radd__(self, other):

        """
        Return a string combining 'other' with this string.

        The result is a Unicode object only if 'other' is also one.
        """

        return self._convert(self._binary_op_rev(str_add, other, True), other)

    def __len__(self):

        "Return the length of this string in characters."

        # Compute the length once and remember it for later calls.

        if self.length is None:
            self.length = unicode_len(self.__data__, self.__size__)

        return self.length

    def __ord__(self):

        """
        Return the value of the string, if only a single character.

        A ValueError is raised for strings of any other length.
        """

        if self.__len__() == 1:
            return unicode_ord(self.__data__, self.__size__)
        else:
            # Use the call form, as elsewhere in this class.

            raise ValueError(self)

    def encode(self, encoding=None):

        """
        Encode the string to the given 'encoding' or any original encoding if
        omitted.

        If neither an 'encoding' nor an original encoding is available, this
        object is returned as it is.
        """

        encoding = encoding or self.encoding
        if not encoding:
            return self

        from_utf8 = Converter("UTF-8", encoding)

        # Always release the converter, even if conversion fails.

        try:
            from_utf8.feed(self)
            return str(from_utf8)

        finally:
            from_utf8.close()

    def join(self, l):

        """
        Join the elements in 'l' with this string.

        The result is made a Unicode object bearing this string's encoding
        only if that encoding is set and no element of 'l' is a Unicode
        object; otherwise, the plain string from the buffer is returned.
        """

        # Empty strings just cause the list elements to be concatenated.

        nonempty = self.__bool__()

        # Non-empty strings join the elements together in a buffer.

        b = buffer()
        first = True
        encoding = self.encoding

        for s in l:

            # Insert the separator before every element except the first.

            if first:
                first = False
            elif nonempty:
                b.append(self)

            # NOTE(review): any Unicode element discards the encoding, making
            # the result a plain string - confirm this is intended.

            if _isinstance(s, utf8string):
                encoding = None

            b.append(s)

        s = str(b)
        if encoding:
            s = utf8string(s)
            s.encoding = encoding
        return s

    # Special implementation methods.

    def __get_single_item__(self, index):

        """
        Return the item at the normalised (positive) 'index'.

        The item is a Unicode object having this string's encoding.
        """

        self._check_index(index)
        return utf8string(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), self.encoding)

    def __get_multiple_items__(self, start, end, step):

        """
        Return items from 'start' until (but excluding) 'end', at 'step'
        intervals.

        A ValueError is raised if 'step' is zero.
        """

        # NOTE(review): the empty case yields a plain "" rather than a Unicode
        # object, unlike the non-empty case below - confirm this is intended.

        if start == end:
            return ""

        check_int(step)

        if step == 0:
            raise ValueError(step)

        l = get_using(basestring.__get_multiple_items__, self)(start, end, step)
        return utf8string("".join(l), self.encoding)

def unicode(s, encoding):

    """
    Convert 's' to a Unicode object, interpreting 's' as using 'encoding'.

    If 's' is already a Unicode object, it is returned unchanged.
    """

    if isinstance(s, utf8string):
        return s

    # Obtain a string representation.

    s = s.__str__()

    # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
    # needs to be validated.

    to_utf8 = Converter(encoding, "UTF-8")

    # Always release the converter, even if conversion fails.

    try:
        to_utf8.feed(s)
        return utf8string(str(to_utf8), encoding)

    finally:
        to_utf8.close()

# vim: tabstop=4 expandtab shiftwidth=4