1.1 --- a/common.py Mon Jun 28 22:29:21 2021 +0200
1.2 +++ b/common.py Tue Jun 29 22:24:09 2021 +0200
1.3 @@ -3,8 +3,7 @@
1.4 """
1.5 Common functions.
1.6
1.7 -Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
1.8 - 2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>
1.9 +Copyright (C) 2007-2019, 2021 Paul Boddie <paul@boddie.org.uk>
1.10
1.11 This program is free software; you can redistribute it and/or modify it under
1.12 the terms of the GNU General Public License as published by the Free Software
1.13 @@ -1597,8 +1596,6 @@
1.14
1.15 if name == "string":
1.16 modname = "str"
1.17 - elif name == "utf8string":
1.18 - modname = "unicode"
1.19 elif name == "NoneType":
1.20 modname = "none"
1.21 else:
1.22 @@ -1612,8 +1609,6 @@
1.23
1.24 if name == "str":
1.25 return "string"
1.26 - elif name == "unicode":
1.27 - return "utf8string"
1.28 else:
1.29 return name
1.30
2.1 --- a/generator.py Mon Jun 28 22:29:21 2021 +0200
2.2 +++ b/generator.py Tue Jun 29 22:24:09 2021 +0200
2.3 @@ -3,7 +3,7 @@
2.4 """
2.5 Generate C code from object layouts and other deduced information.
2.6
2.7 -Copyright (C) 2015, 2016, 2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>
2.8 +Copyright (C) 2015-2019, 2021 Paul Boddie <paul@boddie.org.uk>
2.9
2.10 This program is free software; you can redistribute it and/or modify it under
2.11 the terms of the GNU General Public License as published by the Free Software
2.12 @@ -49,7 +49,7 @@
2.13 string_type = "__builtins__.str.string"
2.14 tuple_type = "__builtins__.tuple.tuple"
2.15 type_type = "__builtins__.core.type"
2.16 - unicode_type = "__builtins__.unicode.utf8string"
2.17 + unicode_type = "__builtins__.unicode.unicode"
2.18
2.19 none_value = "__builtins__.none.None"
2.20
3.1 --- a/lib/__builtins__/__init__.py Mon Jun 28 22:29:21 2021 +0200
3.2 +++ b/lib/__builtins__/__init__.py Tue Jun 29 22:24:09 2021 +0200
3.3 @@ -3,7 +3,7 @@
3.4 """
3.5 Simple built-in classes and functions.
3.6
3.7 -Copyright (C) 2015, 2016, 2017, 2019 Paul Boddie <paul@boddie.org.uk>
3.8 +Copyright (C) 2015, 2016, 2017, 2019, 2021 Paul Boddie <paul@boddie.org.uk>
3.9
3.10 This program is free software; you can redistribute it and/or modify it under
3.11 the terms of the GNU General Public License as published by the Free Software
3.12 @@ -70,7 +70,7 @@
3.13 from __builtins__.set import frozenset, set
3.14 from __builtins__.str import basestring, str, string
3.15 from __builtins__.tuple import tuple
3.16 -from __builtins__.unicode import unicode, utf8string
3.17 +from __builtins__.unicode import unicode
3.18
3.19 # Functions.
3.20
4.1 --- a/lib/__builtins__/character.py Mon Jun 28 22:29:21 2021 +0200
4.2 +++ b/lib/__builtins__/character.py Tue Jun 29 22:24:09 2021 +0200
4.3 @@ -103,7 +103,7 @@
4.4 check_int(i)
4.5
4.6 if 0 <= i <= 2097151:
4.7 - return utf8string(unicode_unichr(i))
4.8 + return unicode(unicode_unichr(i))
4.9 else:
4.10 raise ValueError, i
4.11
5.1 --- a/lib/__builtins__/int.py Mon Jun 28 22:29:21 2021 +0200
5.2 +++ b/lib/__builtins__/int.py Tue Jun 29 22:24:09 2021 +0200
5.3 @@ -20,7 +20,7 @@
5.4 """
5.5
5.6 from __builtins__.str import basestring
5.7 -from __builtins__.unicode import utf8string
5.8 +from __builtins__.unicode import unicode
5.9 from native import get_maxint, get_minint, is_int, \
5.10 int_add, int_and, int_div, int_eq, int_ge, int_gt, \
5.11 int_lshift, int_le, int_lt, int_mod, int_mul, int_ne, \
5.12 @@ -278,7 +278,7 @@
5.13
5.14 "Return a string representation."
5.15
5.16 - return utf8string(int_str(self))
5.17 + return unicode(int_str(self))
5.18
5.19 __repr__ = __str__
5.20
6.1 --- a/lib/__builtins__/stream.py Mon Jun 28 22:29:21 2021 +0200
6.2 +++ b/lib/__builtins__/stream.py Tue Jun 29 22:24:09 2021 +0200
6.3 @@ -144,7 +144,7 @@
6.4 # Encode text as bytes if necessary. When the encoding is not set, any
6.5 # original encoding of the text will be applied.
6.6
6.7 - if _isinstance(s, utf8string):
6.8 + if _isinstance(s, unicode):
6.9 s = s.encode(self.encoding)
6.10
6.11 fwrite(self.__data__, s)
7.1 --- a/lib/__builtins__/unicode.py Mon Jun 28 22:29:21 2021 +0200
7.2 +++ b/lib/__builtins__/unicode.py Tue Jun 29 22:24:09 2021 +0200
7.3 @@ -3,7 +3,7 @@
7.4 """
7.5 Unicode objects.
7.6
7.7 -Copyright (C) 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>
7.8 +Copyright (C) 2015, 2016, 2017, 2021 Paul Boddie <paul@boddie.org.uk>
7.9
7.10 This program is free software; you can redistribute it and/or modify it under
7.11 the terms of the GNU General Public License as published by the Free Software
7.12 @@ -25,21 +25,58 @@
7.13 from native import str_add, unicode_len, unicode_ord, unicode_substr, \
7.14 isinstance as _isinstance
7.15
7.16 -class utf8string(basestring):
7.17 +class unicode(basestring):
7.18
7.19 "A character string representation based on UTF-8."
7.20
7.21 - def __init__(self, other=None, encoding=None):
7.22 + def __init__(self, s, encoding=None, original=None):
7.23
7.24 """
7.25 - Initialise the string, perhaps from 'other', with any original
7.26 - 'encoding' indicated.
7.27 + Initialise the string from 's', employing any indicated 'encoding'
7.28 + for the provided string data.
7.29 +
7.30 + If 'original' is indicated, this may be used to override the original
7.31 + encoding. This is useful when the string data is already in UTF-8
7.32 + format, but where the original encoding needs to be communicated.
7.33 """
7.34
7.35 - get_using(basestring.__init__, self)(other)
7.36 - self.encoding = encoding
7.37 self.length = None
7.38
7.39 + # Initialise using another Unicode object.
7.40 +
7.41 + if _isinstance(s, unicode):
7.42 + get_using(basestring.__init__, self)(s)
7.43 + self.encoding = s.encoding
7.44 +
7.45 + # Initialise using suitable string data but with an explicit original
7.46 + # encoding.
7.47 +
7.48 + elif original:
7.49 + get_using(basestring.__init__, self)(s)
7.50 + self.encoding = original
7.51 +
7.52 + # Initialise using string data having either UTF-8 or another encoding,
7.53 + # converting to UTF-8 and retaining the encoding details as the original
7.54 + # encoding.
7.55 +
7.56 + else:
7.57 + # Obtain a string representation.
7.58 +
7.59 + s = s.__str__()
7.60 +
7.61 + # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
7.62 + # needs to be validated.
7.63 +
7.64 + to_utf8 = Converter(encoding or "UTF-8", "UTF-8")
7.65 +
7.66 + try:
7.67 + to_utf8.feed(s)
7.68 + get_using(basestring.__init__, self)(str(to_utf8))
7.69 + finally:
7.70 + to_utf8.close()
7.71 +
7.72 + self.encoding = encoding
7.73 +
7.74 def _binary_op(self, op, other, sizes=False):
7.75
7.76 "Perform 'op' on this object and 'other' if appropriate."
7.77 @@ -51,7 +88,7 @@
7.78
7.79 # Combining text with bytes.
7.80
7.81 - if not _isinstance(other, utf8string):
7.82 + if not _isinstance(other, unicode):
7.83 s = self.encode()
7.84 else:
7.85 s = self
7.86 @@ -72,7 +109,7 @@
7.87
7.88 # Combining text with bytes.
7.89
7.90 - if not _isinstance(other, utf8string):
7.91 + if not _isinstance(other, unicode):
7.92 s = self.encode()
7.93 else:
7.94 s = self
7.95 @@ -86,8 +123,8 @@
7.96
7.97 "Convert 'result' to a Unicode object if 'other' already is."
7.98
7.99 - if _isinstance(other, utf8string):
7.100 - return utf8string(result, self.encoding)
7.101 + if _isinstance(other, unicode):
7.102 + return unicode(result, None, self.encoding)
7.103 else:
7.104 return result
7.105
7.106 @@ -188,15 +225,14 @@
7.107 elif nonempty:
7.108 b.append(self)
7.109
7.110 - if _isinstance(s, utf8string):
7.111 + if _isinstance(s, unicode):
7.112 encoding = None
7.113
7.114 b.append(s)
7.115
7.116 s = str(b)
7.117 if encoding:
7.118 - s = utf8string(s)
7.119 - s.encoding = encoding
7.120 + s = unicode(s, None, encoding)
7.121 return s
7.122
7.123 # Special implementation methods.
7.124 @@ -204,9 +240,9 @@
7.125 def __get_single_item__(self, index):
7.126
7.127 "Return the item at the normalised (positive) 'index'."
7.128 -
7.129 +
7.130 self._check_index(index)
7.131 - return utf8string(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), self.encoding)
7.132 + return unicode(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), None, self.encoding)
7.133
7.134 def __get_multiple_items__(self, start, end, step):
7.135
7.136 @@ -224,29 +260,6 @@
7.137 raise ValueError(step)
7.138
7.139 l = get_using(basestring.__get_multiple_items__, self)(start, end, step)
7.140 - return utf8string("".join(l), self.encoding)
7.141 -
7.142 -def unicode(s, encoding):
7.143 -
7.144 - "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'."
7.145 -
7.146 - if isinstance(s, utf8string):
7.147 - return s
7.148 -
7.149 - # Obtain a string representation.
7.150 -
7.151 - s = s.__str__()
7.152 -
7.153 - # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
7.154 - # needs to be validated.
7.155 -
7.156 - to_utf8 = Converter(encoding, "UTF-8")
7.157 -
7.158 - try:
7.159 - to_utf8.feed(s)
7.160 - return utf8string(str(to_utf8), encoding)
7.161 -
7.162 - finally:
7.163 - to_utf8.close()
7.164 + return unicode("".join(l), None, self.encoding)
7.165
7.166 # vim: tabstop=4 expandtab shiftwidth=4
8.1 --- a/tests/unicode.py Mon Jun 28 22:29:21 2021 +0200
8.2 +++ b/tests/unicode.py Tue Jun 29 22:24:09 2021 +0200
8.3 @@ -48,7 +48,7 @@
8.4 s7 = r"\346\370\345"
8.5 print "Untranslated values:"
8.6 print s7 # \346\370\345
8.7 -print s7.__class__ # __builtins__.unicode.utf8string
8.8 +print s7.__class__ # __builtins__.unicode.unicode
8.9 print len(s7) # 12
8.10
8.11 # Obtain text and print it.
8.12 @@ -58,7 +58,7 @@
8.13 u = unicode(b"æøå", "ISO-8859-15")
8.14 print "Unicode values:"
8.15 print u # æøå
8.16 -print u.__class__ # __builtins__.unicode.utf8string
8.17 +print u.__class__ # __builtins__.unicode.unicode
8.18 print u.encode("ISO-8859-15") # æøå
8.19 print u.encoding # ISO-8859-15
8.20 print len(u) # 3
8.21 @@ -68,7 +68,7 @@
8.22 u2 = u"æøå"
8.23 print "Unicode values:"
8.24 print u2 # æøå
8.25 -print u2.__class__ # __builtins__.unicode.utf8string
8.26 +print u2.__class__ # __builtins__.unicode.unicode
8.27 print u2.encode("ISO-8859-15") # æøå
8.28 print u2.encoding # ISO-8859-15
8.29 print len(u2) # 3
8.30 @@ -78,7 +78,7 @@
8.31 u3 = "æøå"
8.32 print "Unicode values:"
8.33 print u3 # æøå
8.34 -print u3.__class__ # __builtins__.unicode.utf8string
8.35 +print u3.__class__ # __builtins__.unicode.unicode
8.36 print u3.encode("ISO-8859-15") # æøå
8.37 print u3.encoding # ISO-8859-15
8.38 print len(u3) # 3
8.39 @@ -88,7 +88,7 @@
8.40 u4 = unicode("æøå", "ISO-8859-15")
8.41 print "Unicode values:"
8.42 print u4 # æøå
8.43 -print u4.__class__ # __builtins__.unicode.utf8string
8.44 +print u4.__class__ # __builtins__.unicode.unicode
8.45 print u4.encode("ISO-8859-15") # æøå
8.46 print u4.encoding # ISO-8859-15
8.47 print len(u4) # 3
8.48 @@ -163,7 +163,7 @@
8.49 uu2 = u + u2
8.50 print "Unicode values:"
8.51 print uu2 # æøåæøå
8.52 -print uu2.__class__ # __builtins__.unicode.utf8string
8.53 +print uu2.__class__ # __builtins__.unicode.unicode
8.54 print uu2.encoding # ISO-8859-15
8.55 print len(uu2) # 6
8.56
8.57 @@ -195,7 +195,7 @@
8.58 # Test character access.
8.59
8.60 u0 = u[0]
8.61 -print u0.__class__ # __builtins__.unicode.utf8string
8.62 +print u0.__class__ # __builtins__.unicode.unicode
8.63 print u0.encoding # ISO-8859-15
8.64 print u0 # æ
8.65 print u[-1] # å