#!/usr/bin/env python

"""
Unicode objects.

Copyright (C) 2015, 2016 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from __builtins__.str import basestring
from posix.iconv import Converter
from native import str_add, unicode_len, isinstance as _isinstance

class utf8string(basestring):

    "A UTF-8 based representation of a character string."

    def __init__(self, other=None, encoding=None):

        """
        Set up the string, optionally from 'other', remembering the original
        'encoding' if one was indicated.
        """

        get_using(basestring.__init__, self)(other)
        self.encoding = encoding

        # The character length is worked out on demand by __len__.

        self.length = None

    def _binary_op(self, op, other):

        "Apply 'op' to this object and 'other' where 'other' is a string."

        # Anything that is not a string is rejected.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Two Unicode operands: operate directly on their data.

        if _isinstance(other, utf8string):
            return op(self.__data__, other.__data__)

        # Text combined with bytes: use the encoded form of this string.

        encoded = self.encode()
        return op(encoded.__data__, other.__data__)

    def _binary_op_rev(self, op, other):

        "Apply 'op' to 'other' and this object where 'other' is a string."

        # Anything that is not a string is rejected.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Two Unicode operands: operate directly on their data.

        if _isinstance(other, utf8string):
            return op(other.__data__, self.__data__)

        # Bytes combined with text: use the encoded form of this string.

        encoded = self.encode()
        return op(other.__data__, encoded.__data__)

    def _convert(self, result, other):

        "Wrap 'result' as a Unicode object when 'other' is itself one."

        if not _isinstance(other, utf8string):
            return result

        return utf8string(result, self.encoding)

    def __iadd__(self, other):

        "Return the result of combining this string with 'other'."

        combined = self._binary_op(str_add, other)
        return self._convert(combined, other)

    __add__ = __iadd__

    def __radd__(self, other):

        "Return the result of combining 'other' with this string."

        combined = self._binary_op_rev(str_add, other)
        return self._convert(combined, other)

    def __len__(self):

        "Return the number of characters in this string."

        # A previously computed length is reused.

        if self.length is not None:
            return self.length

        self.length = unicode_len(self.__data__)
        return self.length

    def encode(self, encoding=None):

        """
        Return this string encoded using 'encoding', falling back on any
        original encoding when 'encoding' is omitted. Where neither is
        available, the string itself is returned.
        """

        if not encoding:
            encoding = self.encoding

        if not encoding:
            return self

        converter = Converter("UTF-8", encoding)

        try:
            converter.feed(self)
            return str(converter)

        finally:
            converter.close()

    def join(self, l):

        "Join the elements in 'l' using this string between them."

        # An empty string means the elements are simply concatenated.

        separate = self.__bool__()

        # The pieces are accumulated in a buffer.

        parts = buffer()
        started = False
        encoding = self.encoding

        for item in l:

            # This string goes before every element except the first.

            if started:
                if separate:
                    parts.append(self)
            else:
                started = True

            # The recorded encoding is discarded once a Unicode element is
            # encountered.

            if _isinstance(item, utf8string):
                encoding = None

            parts.append(item)

        result = str(parts)

        # A surviving encoding causes the result to be a Unicode object.

        if encoding:
            result = utf8string(result)
            result.encoding = encoding

        return result

def unicode(s, encoding):

    "Return 's' as a Unicode object, treating 's' as employing 'encoding'."

    # Unicode objects are returned unchanged.

    if isinstance(s, utf8string):
        return s

    # Work from a string representation of the object.

    text = s.__str__()

    # The conversion to UTF-8 is always performed: even data stated to be
    # UTF-8 needs to be validated.

    converter = Converter(encoding, "UTF-8")

    try:
        converter.feed(text)
        return utf8string(str(converter), encoding)

    finally:
        converter.close()

# vim: tabstop=4 expandtab shiftwidth=4