1.1 --- a/lib/__builtins__/unicode.py Mon Jun 28 22:29:21 2021 +0200
1.2 +++ b/lib/__builtins__/unicode.py Tue Jun 29 22:24:09 2021 +0200
1.3 @@ -3,7 +3,7 @@
1.4 """
1.5 Unicode objects.
1.6
1.7 -Copyright (C) 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>
1.8 +Copyright (C) 2015, 2016, 2017, 2021 Paul Boddie <paul@boddie.org.uk>
1.9
1.10 This program is free software; you can redistribute it and/or modify it under
1.11 the terms of the GNU General Public License as published by the Free Software
1.12 @@ -25,21 +25,58 @@
1.13 from native import str_add, unicode_len, unicode_ord, unicode_substr, \
1.14 isinstance as _isinstance
1.15
1.16 -class utf8string(basestring):
1.17 +class unicode(basestring):
1.18
1.19 "A character string representation based on UTF-8."
1.20
1.21 - def __init__(self, other=None, encoding=None):
1.22 + def __init__(self, s, encoding=None, original=None):
1.23
1.24 """
1.25 - Initialise the string, perhaps from 'other', with any original
1.26 - 'encoding' indicated.
1.27 + Initialise the string from 's', employing any indicated 'encoding'
1.28 + for the provided string data.
1.29 +
1.30 + If 'original' is indicated, this may be used to override the original
1.31 + encoding. This is useful when the string data is already in UTF-8
1.32 + format, but where the original encoding needs to be communicated.
1.33 """
1.34
1.35 - get_using(basestring.__init__, self)(other)
1.36 - self.encoding = encoding
1.37 self.length = None
1.38
1.39 + # Initialise using another Unicode object.
1.40 +
1.41 + if _isinstance(s, unicode):
1.42 + get_using(basestring.__init__, self)(s)
1.43 + self.encoding = s.encoding
1.44 +
1.45 + # Initialise using suitable string data but with an explicit original
1.46 + # encoding.
1.47 +
1.48 + elif original:
1.49 + get_using(basestring.__init__, self)(s)
1.50 + self.encoding = original
1.51 +
1.52 + # Initialise using string data having either UTF-8 or another encoding,
1.53 + # converting to UTF-8 and retaining the encoding details as the original
1.54 + # encoding.
1.55 +
1.56 + else:
1.57 + # Obtain a string representation.
1.58 +
1.59 + s = s.__str__()
1.60 +
1.61 + # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
1.62 + # needs to be validated.
1.63 +
1.64 + to_utf8 = Converter(encoding or "UTF-8", "UTF-8")
1.65 +
1.66 + try:
1.67 + to_utf8.feed(s)
1.68 + get_using(basestring.__init__, self)(str(to_utf8))
1.69 + finally:
1.70 + to_utf8.close()
1.71 +
1.72 + self.encoding = encoding
1.73 +
1.74 def _binary_op(self, op, other, sizes=False):
1.75
1.76 "Perform 'op' on this object and 'other' if appropriate."
1.77 @@ -51,7 +88,7 @@
1.78
1.79 # Combining text with bytes.
1.80
1.81 - if not _isinstance(other, utf8string):
1.82 + if not _isinstance(other, unicode):
1.83 s = self.encode()
1.84 else:
1.85 s = self
1.86 @@ -72,7 +109,7 @@
1.87
1.88 # Combining text with bytes.
1.89
1.90 - if not _isinstance(other, utf8string):
1.91 + if not _isinstance(other, unicode):
1.92 s = self.encode()
1.93 else:
1.94 s = self
1.95 @@ -86,8 +123,8 @@
1.96
1.97 "Convert 'result' to a Unicode object if 'other' already is."
1.98
1.99 - if _isinstance(other, utf8string):
1.100 - return utf8string(result, self.encoding)
1.101 + if _isinstance(other, unicode):
1.102 + return unicode(result, None, self.encoding)
1.103 else:
1.104 return result
1.105
1.106 @@ -188,15 +225,14 @@
1.107 elif nonempty:
1.108 b.append(self)
1.109
1.110 - if _isinstance(s, utf8string):
1.111 + if _isinstance(s, unicode):
1.112 encoding = None
1.113
1.114 b.append(s)
1.115
1.116 s = str(b)
1.117 if encoding:
1.118 - s = utf8string(s)
1.119 - s.encoding = encoding
1.120 + s = unicode(s, None, encoding)
1.121 return s
1.122
1.123 # Special implementation methods.
1.124 @@ -204,9 +240,9 @@
1.125 def __get_single_item__(self, index):
1.126
1.127 "Return the item at the normalised (positive) 'index'."
1.128 -
1.129 +
1.130 self._check_index(index)
1.131 - return utf8string(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), self.encoding)
1.132 + return unicode(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), None, self.encoding)
1.133
1.134 def __get_multiple_items__(self, start, end, step):
1.135
1.136 @@ -224,29 +260,6 @@
1.137 raise ValueError(step)
1.138
1.139 l = get_using(basestring.__get_multiple_items__, self)(start, end, step)
1.140 - return utf8string("".join(l), self.encoding)
1.141 -
1.142 -def unicode(s, encoding):
1.143 -
1.144 - "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'."
1.145 -
1.146 - if isinstance(s, utf8string):
1.147 - return s
1.148 -
1.149 - # Obtain a string representation.
1.150 -
1.151 - s = s.__str__()
1.152 -
1.153 - # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
1.154 - # needs to be validated.
1.155 -
1.156 - to_utf8 = Converter(encoding, "UTF-8")
1.157 -
1.158 - try:
1.159 - to_utf8.feed(s)
1.160 - return utf8string(str(to_utf8), encoding)
1.161 -
1.162 - finally:
1.163 - to_utf8.close()
1.164 + return unicode("".join(l), None, self.encoding)
1.165
1.166 # vim: tabstop=4 expandtab shiftwidth=4