Lichen

Annotated tests/unicode.py

938:799711337453
6 months ago Paul Boddie Renamed the string class to str, replacing the str function with the new_str function, this being invoked specially by the string instantiation function. As with the Unicode type renaming, a more general instantiation mechanism might permit the new_str function to be part of the functionality of the str or basestring classes.
paul@536 1
# -*- coding: ISO-8859-15 -*-
paul@392 2
paul@392 3
import sys
paul@392 4
paul@392 5
# Print bytes.
paul@392 6
paul@396 7
s = b"æøå"
paul@536 8
print "ISO-8859-15 values:"
paul@396 9
print s                             # æøå
paul@403 10
print len(s)                        # 3
paul@392 11
paul@537 12
s1 = b"???" \
paul@537 13
      "???"
paul@537 14
print "ISO-8859-15 values:"
paul@537 15
print s1                            # ??????
paul@537 16
print len(s1)                       # 6
paul@537 17
paul@512 18
s2 = b"\xe6\xf8\xe5"
paul@536 19
print "ISO-8859-15 values:"
paul@512 20
print s2                            # æøå
paul@938 21
print s2.__class__                  # __builtins__.str.str
paul@512 22
print len(s2)                       # 3
paul@512 23
paul@512 24
s3 = "\xe6\xf8\xe5"
paul@536 25
print "ISO-8859-15 values:"
paul@512 26
print s3                            # æøå
paul@938 27
print s3.__class__                  # __builtins__.str.str
paul@512 28
print len(s3)                       # 3
paul@512 29
paul@512 30
s4 = b"\u00e6\u00f8\u00e5"
paul@512 31
print "Untranslated values:"
paul@512 32
print s4                            # \u00e6\u00f8\u00e5
paul@938 33
print s4.__class__                  # __builtins__.str.str
paul@512 34
print len(s4)                       # 18
paul@512 35
paul@512 36
s5 = b"\346\370\345"
paul@536 37
print "ISO-8859-15 values:"
paul@512 38
print s5                            # æøå
paul@938 39
print s5.__class__                  # __builtins__.str.str
paul@512 40
print len(s5)                       # 3
paul@512 41
paul@512 42
s6 = "\346\370\345"
paul@536 43
print "ISO-8859-15 values:"
paul@512 44
print s6                            # æøå
paul@938 45
print s6.__class__                  # __builtins__.str.str
paul@512 46
print len(s6)                       # 3
paul@512 47
paul@512 48
s7 = r"\346\370\345"
paul@512 49
print "Untranslated values:"
paul@512 50
print s7                            # \346\370\345
paul@934 51
print s7.__class__                  # __builtins__.unicode.unicode
paul@512 52
print len(s7)                       # 12
paul@512 53
paul@392 54
# Obtain text and print it.
paul@392 55
paul@394 56
# Explicitly from bytes.
paul@394 57
paul@536 58
u = unicode(b"æøå", "ISO-8859-15")
paul@512 59
print "Unicode values:"
paul@406 60
print u                             # æøå
paul@934 61
print u.__class__                   # __builtins__.unicode.unicode
paul@536 62
print u.encode("ISO-8859-15")       # ???
paul@536 63
print u.encoding                    # ISO-8859-15
paul@403 64
print len(u)                        # 3
paul@392 65
paul@394 66
# Explicitly from Unicode literals.
paul@394 67
paul@394 68
u2 = u"???"
paul@512 69
print "Unicode values:"
paul@406 70
print u2                            # ???
paul@934 71
print u2.__class__                  # __builtins__.unicode.unicode
paul@536 72
print u2.encode("ISO-8859-15")      # ???
paul@536 73
print u2.encoding                   # ISO-8859-15
paul@403 74
print len(u2)                       # 3
paul@394 75
paul@394 76
# Implicitly from string literals.
paul@394 77
paul@405 78
u3 = "???"
paul@512 79
print "Unicode values:"
paul@406 80
print u3                            # ???
paul@934 81
print u3.__class__                  # __builtins__.unicode.unicode
paul@536 82
print u3.encode("ISO-8859-15")      # ???
paul@536 83
print u3.encoding                   # ISO-8859-15
paul@405 84
print len(u3)                       # 3
paul@394 85
paul@512 86
# Explicitly from implicitly-converted literal.
paul@512 87
paul@536 88
u4 = unicode("???", "ISO-8859-15")
paul@512 89
print "Unicode values:"
paul@512 90
print u4                            # ???
paul@934 91
print u4.__class__                  # __builtins__.unicode.unicode
paul@536 92
print u4.encode("ISO-8859-15")      # ???
paul@536 93
print u4.encoding                   # ISO-8859-15
paul@512 94
print len(u4)                       # 3
paul@512 95
paul@512 96
# Test Unicode values.
paul@512 97
paul@512 98
u5 = "\u00e6\u00f8\u00e5"
paul@512 99
print "Unicode values:"
paul@512 100
print u5                            # æøå
paul@938 101
print u5.__class__                  # __builtins__.unicode.unicode
paul@512 102
print len(u5)                       # 3
paul@512 103
paul@512 104
# Test some untranslated values.
paul@512 105
paul@512 106
u6 = "\\u00e6\\u00f8\\u00e5"
paul@512 107
print "Untranslated values:"
paul@512 108
print u6                            # \u00e6\u00f8\u00e5
paul@938 109
print u6.__class__                  # __builtins__.unicode.unicode
paul@512 110
print len(u6)                       # 18
paul@512 111
paul@512 112
# Test Unicode values.
paul@512 113
paul@512 114
u7 = u"\346\370\345"
paul@512 115
print "Unicode values:"
paul@512 116
print u7                            # æøå
paul@938 117
print u7.__class__                  # __builtins__.unicode.unicode
paul@512 118
print len(u7)                       # 3
paul@512 119
paul@512 120
# Test Unicode values.
paul@512 121
paul@512 122
u8 = ur"\346\370\345"
paul@512 123
print "Untranslated values:"
paul@512 124
print u8                            # \346\370\345
paul@938 125
print u8.__class__                  # __builtins__.unicode.unicode
paul@512 126
print len(u8)                       # 12
paul@512 127
paul@410 128
# Test invalid sequences.
paul@410 129
paul@410 130
try:
paul@512 131
    u9 = unicode(s, "UTF-8")
paul@410 132
except UnicodeDecodeError, exc:
paul@410 133
    print "Attempt to decode", s, "as UTF-8 failed."
paul@410 134
paul@817 135
# Mix Unicode and byte values.
paul@817 136
paul@817 137
u10 = "\u00e6\xf8\u00e5"
paul@817 138
print "ISO-8859-15 values:"
paul@817 139
print u10                           # \u00e6ø\u00e5
paul@938 140
print u10.__class__                 # __builtins__.str.str
paul@817 141
print len(u10)                      # 13
paul@817 142
paul@396 143
# Combine bytes and text.
paul@396 144
# The text should be decoded.
paul@396 145
paul@396 146
su = s + u
paul@536 147
print "ISO-8859-15 values:"
paul@396 148
print su                            # ??????
paul@938 149
print su.__class__                  # __builtins__.str.str
paul@403 150
print len(su)                       # 6
paul@396 151
paul@396 152
# Combine text and bytes.
paul@396 153
# The text should be decoded.
paul@396 154
paul@396 155
us = u + s
paul@536 156
print "ISO-8859-15 values:"
paul@396 157
print us                            # ??????
paul@938 158
print us.__class__                  # __builtins__.str.str
paul@403 159
print len(us)                       # 6
paul@398 160
paul@398 161
# Combine text and text.
paul@398 162
paul@398 163
uu2 = u + u2
paul@512 164
print "Unicode values:"
paul@406 165
print uu2                           # ??????
paul@934 166
print uu2.__class__                 # __builtins__.unicode.unicode
paul@536 167
print uu2.encoding                  # ISO-8859-15
paul@403 168
print len(uu2)                      # 6
paul@396 169
paul@392 170
# Inspect and update the encoding of stdout.
paul@398 171
# Note that su and us are byte strings and are not recoded.
paul@392 172
paul@532 173
print sys.stdout                    # <libc.io.sysstream instance>
paul@392 174
print sys.stdout.encoding           # None
paul@398 175
paul@536 176
sys.stdout.encoding = "ISO-8859-15"
paul@536 177
print "ISO-8859-15 and Unicode values as ISO-8859-15:"
paul@536 178
print sys.stdout.encoding           # ISO-8859-15
paul@392 179
print u                             # ???
paul@396 180
print su                            # ??????
paul@396 181
print us                            # ??????
paul@398 182
paul@398 183
sys.stdout.encoding = "UTF-8"
paul@512 184
print "Unicode values as UTF-8:"
paul@398 185
print sys.stdout.encoding           # UTF-8
paul@398 186
print u                             # ??????
paul@536 187
print "ISO-8859-15 values bypassing UTF-8 output encoding:"
paul@398 188
print su                            # ??????
paul@398 189
print us                            # ??????
paul@431 190
paul@431 191
# Reset the encoding.
paul@431 192
paul@536 193
sys.stdout.encoding = "ISO-8859-15"
paul@431 194
paul@431 195
# Test character access.
paul@431 196
paul@431 197
u0 = u[0]
paul@934 198
print u0.__class__                  # __builtins__.unicode.unicode
paul@536 199
print u0.encoding                   # ISO-8859-15
paul@431 200
print u0                            # æ
paul@431 201
print u[-1]                         # ?
paul@431 202
print len(u[0])                     # 1
paul@431 203
print len(u[-1])                    # 1
paul@431 204
print u[:2]                         # ??
paul@431 205
print len(u[:2])                    # 2
paul@431 206
print u[-1::-1]                     # ???
paul@431 207
print len(u[-1::-1])                # 3
paul@534 208
paul@534 209
# Test character values.
paul@534 210
paul@534 211
print ord(u[0])                     # 230
paul@534 212
paul@534 213
try:
paul@534 214
    print ord(u)                    # should raise an exception
paul@534 215
except ValueError, exc:
paul@534 216
    print "ord(u): value is not appropriate", repr(exc.value)
paul@536 217
paul@536 218
euro = "€"
paul@536 219
print euro                          # €
paul@536 220
print repr(euro)                    # "\u20ac"
paul@536 221
print ord(euro)                     # 8364
paul@536 222
print "\u20ac"                      # €
paul@607 223
print unichr(ord(euro))             # €
paul@607 224
print unichr(ord(euro)) == euro     # True