1.1 --- a/tests/unicode.py Fri Jan 27 15:27:17 2017 +0100
1.2 +++ b/tests/unicode.py Fri Jan 27 23:54:18 2017 +0100
1.3 @@ -5,14 +5,52 @@
1.4 # Print bytes.
1.5
1.6 s = b"ֶ״ֵ"
1.7 +print "ISO-8859-1 values:"
1.8 print s # ֶ״ֵ
1.9 print len(s) # 3
1.10
1.11 +s2 = b"\xe6\xf8\xe5"
1.12 +print "ISO-8859-1 values:"
1.13 +print s2 # זרו
1.14 +print s2.__class__ # __builtins__.str.string
1.15 +print len(s2) # 3
1.16 +
1.17 +s3 = "\xe6\xf8\xe5"
1.18 +print "ISO-8859-1 values:"
1.19 +print s3 # זרו
1.20 +print s3.__class__ # __builtins__.str.string
1.21 +print len(s3) # 3
1.22 +
1.23 +s4 = b"\u00e6\u00f8\u00e5"
1.24 +print "Untranslated values:"
1.25 +print s4 # \u00e6\u00f8\u00e5
1.26 +print s4.__class__ # __builtins__.str.string
1.27 +print len(s4) # 18
1.28 +
1.29 +s5 = b"\346\370\345"
1.30 +print "ISO-8859-1 values:"
1.31 +print s5 # זרו
1.32 +print s5.__class__ # __builtins__.str.string
1.33 +print len(s5) # 3
1.34 +
1.35 +s6 = "\346\370\345"
1.36 +print "ISO-8859-1 values:"
1.37 +print s6 # זרו
1.38 +print s6.__class__ # __builtins__.str.string
1.39 +print len(s6) # 3
1.40 +
1.41 +s7 = r"\346\370\345"
1.42 +print "Untranslated values:"
1.43 +print s7 # \346\370\345
1.44 +print s7.__class__ # __builtins__.unicode.utf8string
1.45 +print len(s7) # 12
1.46 +
1.47 # Obtain text and print it.
1.48
1.49 # Explicitly from bytes.
1.50
1.51 -u = unicode("זרו", "ISO-8859-1")
1.52 +u = unicode(b"זרו", "ISO-8859-1")
1.53 +print "Unicode values:"
1.54 print u # זרו
1.55 print u.__class__ # __builtins__.unicode.utf8string
1.56 print u.encode("ISO-8859-1") # זרו
1.57 @@ -22,6 +60,7 @@
1.58 # Explicitly from Unicode literals.
1.59
1.60 u2 = u"זרו"
1.61 +print "Unicode values:"
1.62 print u2 # זרו
1.63 print u2.__class__ # __builtins__.unicode.utf8string
1.64 print u2.encode("ISO-8859-1") # זרו
1.65 @@ -31,16 +70,59 @@
1.66 # Implicitly from string literals.
1.67
1.68 u3 = "זרו"
1.69 +print "Unicode values:"
1.70 print u3 # זרו
1.71 print u3.__class__ # __builtins__.unicode.utf8string
1.72 print u3.encode("ISO-8859-1") # זרו
1.73 print u3.encoding # ISO-8859-1
1.74 print len(u3) # 3
1.75
1.76 +# Explicitly from implicitly-converted literal.
1.77 +
1.78 +u4 = unicode("זרו", "ISO-8859-1")
1.79 +print "Unicode values:"
1.80 +print u4 # זרו
1.81 +print u4.__class__ # __builtins__.unicode.utf8string
1.82 +print u4.encode("ISO-8859-1") # זרו
1.83 +print u4.encoding # ISO-8859-1
1.84 +print len(u4) # 3
1.85 +
1.86 +# Test Unicode values.
1.87 +
1.88 +u5 = "\u00e6\u00f8\u00e5"
1.89 +print "Unicode values:"
1.90 +print u5 # זרו
1.91 +print u5.__class__ # __builtins__.unicode.utf8string
1.92 +print len(u5) # 3
1.93 +
1.94 +# Test some untranslated values.
1.95 +
1.96 +u6 = "\\u00e6\\u00f8\\u00e5"
1.97 +print "Untranslated values:"
1.98 +print u6 # \u00e6\u00f8\u00e5
1.99 +print u6.__class__ # __builtins__.unicode.utf8string
1.100 +print len(u6) # 18
1.101 +
1.102 +# Test Unicode values.
1.103 +
1.104 +u7 = u"\346\370\345"
1.105 +print "Unicode values:"
1.106 +print u7 # זרו
1.107 +print u7.__class__ # __builtins__.unicode.utf8string
1.108 +print len(u7) # 3
1.109 +
1.110 +# Test some untranslated values.
1.111 +
1.112 +u8 = ur"\346\370\345"
1.113 +print "Untranslated values:"
1.114 +print u8 # \346\370\345
1.115 +print u8.__class__ # __builtins__.unicode.utf8string
1.116 +print len(u8) # 12
1.117 +
1.118 # Test invalid sequences.
1.119
1.120 try:
1.121 - u4 = unicode(s, "UTF-8")
1.122 + u9 = unicode(s, "UTF-8")
1.123 except UnicodeDecodeError, exc:
1.124 print "Attempt to decode", s, "as UTF-8 failed."
1.125
1.126 @@ -48,6 +130,7 @@
1.127 # The text should be decoded.
1.128
1.129 su = s + u
1.130 +print "ISO-8859-1 values:"
1.131 print su # ֶ״ֵזרו
1.132 print su.__class__ # __builtins__.str.string
1.133 print len(su) # 6
1.134 @@ -56,6 +139,7 @@
1.135 # The text should be decoded.
1.136
1.137 us = u + s
1.138 +print "ISO-8859-1 values:"
1.139 print us # זרוֶ״ֵ
1.140 print us.__class__ # __builtins__.str.string
1.141 print len(us) # 6
1.142 @@ -63,6 +147,7 @@
1.143 # Combine text and text.
1.144
1.145 uu2 = u + u2
1.146 +print "Unicode values:"
1.147 print uu2 # זרוזרו
1.148 print uu2.__class__ # __builtins__.unicode.utf8string
1.149 print uu2.encoding # ISO-8859-1
1.150 @@ -75,14 +160,17 @@
1.151 print sys.stdout.encoding # None
1.152
1.153 sys.stdout.encoding = "ISO-8859-1"
1.154 +print "ISO-8859-1 and Unicode values as ISO-8859-1:"
1.155 print sys.stdout.encoding # ISO-8859-1
1.156 print u # זרו
1.157 print su # ֶ״ֵזרו
1.158 print us # זרוֶ״ֵ
1.159
1.160 sys.stdout.encoding = "UTF-8"
1.161 +print "Unicode values as UTF-8:"
1.162 print sys.stdout.encoding # UTF-8
1.163 print u # ֳ¦ֳ¸ֳ¥
1.164 +print "ISO-8859-1 values bypassing UTF-8 output encoding:"
1.165 print su # ֶ״ֵזרו
1.166 print us # זרוֶ״ֵ
1.167