1.1 --- a/lib/__builtins__/unicode.py	Mon Jun 28 22:29:21 2021 +0200
     1.2 +++ b/lib/__builtins__/unicode.py	Tue Jun 29 22:24:09 2021 +0200
     1.3 @@ -3,7 +3,7 @@
     1.4  """
     1.5  Unicode objects.
     1.6  
     1.7 -Copyright (C) 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>
     1.8 +Copyright (C) 2015, 2016, 2017, 2021 Paul Boddie <paul@boddie.org.uk>
     1.9  
    1.10  This program is free software; you can redistribute it and/or modify it under
    1.11  the terms of the GNU General Public License as published by the Free Software
    1.12 @@ -25,21 +25,58 @@
    1.13  from native import str_add, unicode_len, unicode_ord, unicode_substr, \
    1.14                     isinstance as _isinstance
    1.15  
    1.16 -class utf8string(basestring):
    1.17 +class unicode(basestring):
    1.18  
    1.19      "A character string representation based on UTF-8."
    1.20  
    1.21 -    def __init__(self, other=None, encoding=None):
    1.22 +    def __init__(self, s, encoding=None, original=None):
    1.23  
    1.24          """
    1.25 -        Initialise the string, perhaps from 'other', with any original
    1.26 -        'encoding' indicated.
    1.27 +        Initialise the string from 's', employing any indicated 'encoding'
    1.28 +        for the provided string data.
    1.29 +
    1.30 +        If 'original' is indicated, this may be used to override the original
    1.31 +        encoding. This is useful when the string data is already in UTF-8
    1.32 +        format, but where the original encoding needs to be communicated.
    1.33          """
    1.34  
    1.35 -        get_using(basestring.__init__, self)(other)
    1.36 -        self.encoding = encoding
    1.37          self.length = None
    1.38  
    1.39 +        # Initialise using another Unicode object.
    1.40 +
    1.41 +        if _isinstance(s, unicode):
    1.42 +            get_using(basestring.__init__, self)(s)
    1.43 +            self.encoding = s.encoding
    1.44 +
    1.45 +        # Initialise using suitable string data but with an explicit original
    1.46 +        # encoding.
    1.47 +
    1.48 +        elif original:
    1.49 +            get_using(basestring.__init__, self)(s)
    1.50 +            self.encoding = original
    1.51 +
    1.52 +        # Initialise using string data having either UTF-8 or another encoding,
    1.53 +        # converting to UTF-8 and retaining the encoding details as the original
    1.54 +        # encoding.
    1.55 +
    1.56 +        else:
    1.57 +            # Obtain a string representation.
    1.58 +
    1.59 +            s = s.__str__()
    1.60 +
    1.61 +            # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
    1.62 +            # needs to be validated.
    1.63 +
    1.64 +            to_utf8 = Converter(encoding or "UTF-8", "UTF-8")
    1.65 +
    1.66 +            try:
    1.67 +                to_utf8.feed(s)
    1.68 +                get_using(basestring.__init__, self)(str(to_utf8))
    1.69 +            finally:
    1.70 +                to_utf8.close()
    1.71 +
    1.72 +            self.encoding = encoding
    1.73 +
    1.74      def _binary_op(self, op, other, sizes=False):
    1.75  
    1.76          "Perform 'op' on this object and 'other' if appropriate."
    1.77 @@ -51,7 +88,7 @@
    1.78  
    1.79          # Combining text with bytes.
    1.80  
    1.81 -        if not _isinstance(other, utf8string):
    1.82 +        if not _isinstance(other, unicode):
    1.83              s = self.encode()
    1.84          else:
    1.85              s = self
    1.86 @@ -72,7 +109,7 @@
    1.87  
    1.88          # Combining text with bytes.
    1.89  
    1.90 -        if not _isinstance(other, utf8string):
    1.91 +        if not _isinstance(other, unicode):
    1.92              s = self.encode()
    1.93          else:
    1.94              s = self
    1.95 @@ -86,8 +123,8 @@
    1.96  
    1.97          "Convert 'result' to a Unicode object if 'other' already is."
    1.98  
    1.99 -        if _isinstance(other, utf8string):
   1.100 -            return utf8string(result, self.encoding)
   1.101 +        if _isinstance(other, unicode):
   1.102 +            return unicode(result, None, self.encoding)
   1.103          else:
   1.104              return result
   1.105  
   1.106 @@ -188,15 +225,14 @@
   1.107              elif nonempty:
   1.108                  b.append(self)
   1.109  
   1.110 -            if _isinstance(s, utf8string):
   1.111 +            if _isinstance(s, unicode):
   1.112                  encoding = None
   1.113  
   1.114              b.append(s)
   1.115  
   1.116          s = str(b)
   1.117          if encoding:
   1.118 -            s = utf8string(s)
   1.119 -            s.encoding = encoding
   1.120 +            s = unicode(s, None, encoding)
   1.121          return s
   1.122  
   1.123      # Special implementation methods.
   1.124 @@ -204,9 +240,9 @@
   1.125      def __get_single_item__(self, index):
   1.126      
   1.127          "Return the item at the normalised (positive) 'index'."
   1.128 -    
   1.129 + 
   1.130          self._check_index(index)
   1.131 -        return utf8string(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), self.encoding)
   1.132 +        return unicode(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), None, self.encoding)
   1.133  
   1.134      def __get_multiple_items__(self, start, end, step):
   1.135  
   1.136 @@ -224,29 +260,6 @@
   1.137              raise ValueError(step)
   1.138  
   1.139          l = get_using(basestring.__get_multiple_items__, self)(start, end, step)
   1.140 -        return utf8string("".join(l), self.encoding)
   1.141 -
   1.142 -def unicode(s, encoding):
   1.143 -
   1.144 -    "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'."
   1.145 -
   1.146 -    if isinstance(s, utf8string):
   1.147 -        return s
   1.148 -
   1.149 -    # Obtain a string representation.
   1.150 -
   1.151 -    s = s.__str__()
   1.152 -
   1.153 -    # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
   1.154 -    # needs to be validated.
   1.155 -
   1.156 -    to_utf8 = Converter(encoding, "UTF-8")
   1.157 -
   1.158 -    try:
   1.159 -        to_utf8.feed(s)
   1.160 -        return utf8string(str(to_utf8), encoding)
   1.161 -
   1.162 -    finally:
   1.163 -        to_utf8.close()
   1.164 +        return unicode("".join(l), None, self.encoding)
   1.165  
   1.166  # vim: tabstop=4 expandtab shiftwidth=4