Lichen

Annotated tests/unicode.py

595:60c18e54c8db
2017-02-18 Paul Boddie Introduced a context test that sets a local context for static attributes. method-wrapper-for-context
paul@536 1
# -*- coding: ISO-8859-15 -*-
paul@392 2
paul@392 3
import sys
paul@392 4
paul@392 5
# Print bytes.
# Each literal is printed together with its length (and, from s2 onwards, its
# class); the trailing comments record the expected output.
paul@392 6
paul@396 7
s = b"???"
paul@536 8
print "ISO-8859-15 values:"
paul@396 9
print s                             # ???
paul@403 10
print len(s)                        # 3
paul@392 11
paul@537 12
s1 = b"???" \
paul@537 13
      "???"                         # adjacent literals are joined into one value
paul@537 14
print "ISO-8859-15 values:"
paul@537 15
print s1                            # ??????
paul@537 16
print len(s1)                       # 6
paul@537 17
paul@512 18
s2 = b"\xe6\xf8\xe5"                # hexadecimal escapes in a byte literal
paul@536 19
print "ISO-8859-15 values:"
paul@512 20
print s2                            # ???
paul@512 21
print s2.__class__                  # __builtins__.str.string
paul@512 22
print len(s2)                       # 3
paul@512 23
paul@512 24
s3 = "\xe6\xf8\xe5"                 # the same escapes without the b prefix
paul@536 25
print "ISO-8859-15 values:"
paul@512 26
print s3                            # ???
paul@512 27
print s3.__class__                  # __builtins__.str.string
paul@512 28
print len(s3)                       # 3
paul@512 29
paul@512 30
s4 = b"\u00e6\u00f8\u00e5"          # \u escapes are expected to stay untranslated here
paul@512 31
print "Untranslated values:"
paul@512 32
print s4                            # \u00e6\u00f8\u00e5
paul@512 33
print s4.__class__                  # __builtins__.str.string
paul@512 34
print len(s4)                       # 18 (three 6-character escape sequences)
paul@512 35
paul@512 36
s5 = b"\346\370\345"                # octal escapes in a byte literal
paul@536 37
print "ISO-8859-15 values:"
paul@512 38
print s5                            # ???
paul@512 39
print s5.__class__                  # __builtins__.str.string
paul@512 40
print len(s5)                       # 3
paul@512 41
paul@512 42
s6 = "\346\370\345"                 # the same octal escapes without the b prefix
paul@536 43
print "ISO-8859-15 values:"
paul@512 44
print s6                            # ???
paul@512 45
print s6.__class__                  # __builtins__.str.string
paul@512 46
print len(s6)                       # 3
paul@512 47
paul@512 48
s7 = r"\346\370\345"                # raw literal: the escapes are expected to stay verbatim
paul@512 49
print "Untranslated values:"
paul@512 50
print s7                            # \346\370\345
paul@512 51
print s7.__class__                  # __builtins__.unicode.utf8string
paul@512 52
print len(s7)                       # 12 (three 4-character escape sequences)
paul@512 53
paul@392 54
# Obtain text and print it.
# Four routes to a text value are exercised below; for each one the value, its
# class, its ISO-8859-15 encoded form, its encoding attribute and its length
# are printed.
paul@392 55
paul@394 56
# Explicitly from bytes.
paul@394 57
paul@536 58
u = unicode(b"???", "ISO-8859-15")  # byte literal plus the encoding it is written in
paul@512 59
print "Unicode values:"
paul@406 60
print u                             # ???
paul@403 61
print u.__class__                   # __builtins__.unicode.utf8string
paul@536 62
print u.encode("ISO-8859-15")       # ???
paul@536 63
print u.encoding                    # ISO-8859-15
paul@403 64
print len(u)                        # 3
paul@392 65
paul@394 66
# Explicitly from Unicode literals.
paul@394 67
paul@394 68
u2 = u"???"
paul@512 69
print "Unicode values:"
paul@406 70
print u2                            # ???
paul@403 71
print u2.__class__                  # __builtins__.unicode.utf8string
paul@536 72
print u2.encode("ISO-8859-15")      # ???
paul@536 73
print u2.encoding                   # ISO-8859-15
paul@403 74
print len(u2)                       # 3
paul@394 75
paul@394 76
# Implicitly from string literals.
paul@394 77
paul@405 78
u3 = "???"                          # no prefix: expected to become text, per the class below
paul@512 79
print "Unicode values:"
paul@406 80
print u3                            # ???
paul@405 81
print u3.__class__                  # __builtins__.unicode.utf8string
paul@536 82
print u3.encode("ISO-8859-15")      # ???
paul@536 83
print u3.encoding                   # ISO-8859-15
paul@405 84
print len(u3)                       # 3
paul@394 85
paul@512 86
# Explicitly from implicitly-converted literal.
paul@512 87
paul@536 88
u4 = unicode("???", "ISO-8859-15")  # same call as for u, but with an unprefixed literal
paul@512 89
print "Unicode values:"
paul@512 90
print u4                            # ???
paul@512 91
print u4.__class__                  # __builtins__.unicode.utf8string
paul@536 92
print u4.encode("ISO-8859-15")      # ???
paul@536 93
print u4.encoding                   # ISO-8859-15
paul@512 94
print len(u4)                       # 3
paul@512 95
paul@512 96
# Test Unicode values.
# \u escapes in an unprefixed literal are expected to yield three characters.
paul@512 97
paul@512 98
u5 = "\u00e6\u00f8\u00e5"
paul@512 99
print "Unicode values:"
paul@512 100
print u5                            # ???
paul@512 101
print u5.__class__                  # __builtins__.unicode.utf8string
paul@512 102
print len(u5)                       # 3
paul@512 103
paul@512 104
# Test some untranslated values.
# The doubled backslashes keep each \u sequence as literal text.
paul@512 105
paul@512 106
u6 = "\\u00e6\\u00f8\\u00e5"
paul@512 107
print "Untranslated values:"
paul@512 108
print u6                            # \u00e6\u00f8\u00e5
paul@512 109
print u6.__class__                  # __builtins__.unicode.utf8string
paul@512 110
print len(u6)                       # 18 (three 6-character escape sequences)
paul@512 111
paul@512 112
# Test Unicode values.
# Octal escapes in a Unicode literal are expected to yield three characters.
paul@512 113
paul@512 114
u7 = u"\346\370\345"
paul@512 115
print "Unicode values:"
paul@512 116
print u7                            # ???
paul@512 117
print u7.__class__                  # __builtins__.unicode.utf8string
paul@512 118
print len(u7)                       # 3
paul@512 119
paul@512 120
# Test some untranslated values.
# The raw Unicode literal is expected to keep the octal escapes verbatim.
paul@512 121
paul@512 122
u8 = ur"\346\370\345"
paul@512 123
print "Untranslated values:"
paul@512 124
print u8                            # \346\370\345
paul@512 125
print u8.__class__                  # __builtins__.unicode.utf8string
paul@512 126
print len(u8)                       # 12 (three 4-character escape sequences)
paul@512 127
paul@410 128
# Test invalid sequences.
# The byte string s (defined earlier from an ISO-8859-15 literal) is decoded as
# UTF-8; the test expects this to raise UnicodeDecodeError. Neither u9 nor exc
# is used afterwards, and nothing is printed if the decoding succeeds.
paul@410 129
paul@410 130
try:
paul@512 131
    u9 = unicode(s, "UTF-8")
paul@410 132
except UnicodeDecodeError, exc:
paul@410 133
    print "Attempt to decode", s, "as UTF-8 failed."
paul@410 134
paul@396 135
# Combine bytes and text.
# The text should be decoded.
# The expected class of the result is the plain byte string type.
paul@396 137
paul@396 138
su = s + u                          # bytes on the left, text on the right
paul@536 139
print "ISO-8859-15 values:"
paul@396 140
print su                            # ??????
paul@398 141
print su.__class__                  # __builtins__.str.string
paul@403 142
print len(su)                       # 6
paul@396 143
paul@396 144
# Combine text and bytes.
# The text should be decoded.
# The expected class of the result is again the plain byte string type.
paul@396 146
paul@396 147
us = u + s                          # text on the left, bytes on the right
paul@536 148
print "ISO-8859-15 values:"
paul@396 149
print us                            # ??????
paul@398 150
print us.__class__                  # __builtins__.str.string
paul@403 151
print len(us)                       # 6
paul@398 152
paul@398 153
# Combine text and text.
# The result is expected to remain text and to report an encoding.
paul@398 154
paul@398 155
uu2 = u + u2
paul@512 156
print "Unicode values:"
paul@406 157
print uu2                           # ??????
paul@398 158
print uu2.__class__                 # __builtins__.unicode.utf8string
paul@536 159
print uu2.encoding                  # ISO-8859-15
paul@403 160
print len(uu2)                      # 6
paul@396 161
paul@392 162
# Inspect and update the encoding of stdout.
# Note that su and us are byte strings and are not recoded.
# The encoding attribute starts out unset, is switched to ISO-8859-15, then to
# UTF-8, and is finally restored to ISO-8859-15 for the remaining tests.
paul@392 164
paul@532 165
print sys.stdout                    # <libc.io.sysstream instance>
paul@392 166
print sys.stdout.encoding           # None
paul@398 167
paul@536 168
sys.stdout.encoding = "ISO-8859-15"
paul@536 169
print "ISO-8859-15 and Unicode values as ISO-8859-15:"
paul@536 170
print sys.stdout.encoding           # ISO-8859-15
paul@392 171
print u                             # ???
paul@396 172
print su                            # ??????
paul@396 173
print us                            # ??????
paul@398 174
paul@398 175
sys.stdout.encoding = "UTF-8"
paul@512 176
print "Unicode values as UTF-8:"
paul@398 177
print sys.stdout.encoding           # UTF-8
paul@398 178
print u                             # ??????
paul@536 179
print "ISO-8859-15 values bypassing UTF-8 output encoding:"
paul@398 180
print su                            # ??????
paul@398 181
print us                            # ??????
paul@431 182
paul@431 183
# Reset the encoding.
paul@431 184
paul@536 185
sys.stdout.encoding = "ISO-8859-15"
paul@431 186
paul@431 187
# Test character access.
# Indexing and slicing the three-character text value u; lengths are counted
# in characters.
paul@431 188
paul@431 189
u0 = u[0]                           # a single item is expected to be text with an encoding
paul@431 190
print u0.__class__                  # __builtins__.unicode.utf8string
paul@536 191
print u0.encoding                   # ISO-8859-15
paul@431 192
print u0                            # ?
paul@431 193
print u[-1]                         # ?
paul@431 194
print len(u[0])                     # 1
paul@431 195
print len(u[-1])                    # 1
paul@431 196
print u[:2]                         # ??
paul@431 197
print len(u[:2])                    # 2
paul@431 198
print u[-1::-1]                     # ??? (the characters in reverse order)
paul@431 199
print len(u[-1::-1])                # 3
paul@534 200
paul@534 201
# Test character values.
# ord is applied to a single character, then to the whole three-character
# value, where a ValueError is expected and its value attribute is reported.
paul@534 202
paul@534 203
print ord(u[0])                     # 230
paul@534 204
paul@534 205
try:
paul@534 206
    print ord(u)                    # should raise an exception
paul@534 207
except ValueError, exc:
paul@534 208
    print "ord(u): value is not appropriate", repr(exc.value)
paul@536 209
paul@536 210
euro = "?"                          # expected to be the euro sign, U+20AC, per the output below
paul@536 211
print euro                          # ?
paul@536 212
print repr(euro)                    # "\u20ac"
paul@536 213
print ord(euro)                     # 8364
paul@536 214
print "\u20ac"                      # ?