Lichen

tests/unicode.py

934:2989aab1b4f7
13 months ago Paul Boddie Renamed the utf8string class to unicode, eliminating the unicode function. This means that the simple case of merely returning an object if it is already a Unicode object no longer occurs when using the unicode callable, but such behaviour might be better supported with more general customised instantiation functionality.
# -*- coding: ISO-8859-15 -*-

# Test of byte string and text literals: each case prints a heading, the
# value, and (where shown) its class and length. The trailing comments give
# the expected output.
#
# NOTE(review): in this copy the non-ASCII literal characters appear as "?".
# They are presumably the ISO-8859-15 bytes 0xE6 0xF8 0xE5, matching the
# escaped forms used for s2 and s5 below -- restore them from the repository
# before running.

import sys

# Print bytes.

# A byte literal containing three non-ASCII characters.
s = b"???"
print "ISO-8859-15 values:"
print s                             # ???
print len(s)                        # 3

# Adjacent literals joined by a line continuation form a single value of
# six characters.
s1 = b"???" \
      "???"
print "ISO-8859-15 values:"
print s1                            # ??????
print len(s1)                       # 6

# Three bytes written with hexadecimal escapes in a byte literal.
s2 = b"\xe6\xf8\xe5"
print "ISO-8859-15 values:"
print s2                            # ???
print s2.__class__                  # __builtins__.str.string
print len(s2)                       # 3

# The same escapes without the b prefix: the expected class is still the
# byte string class.
s3 = "\xe6\xf8\xe5"
print "ISO-8859-15 values:"
print s3                            # ???
print s3.__class__                  # __builtins__.str.string
print len(s3)                       # 3

# \u escapes are expected to stay untranslated in a byte literal, giving
# three six-character sequences (length 18).
s4 = b"\u00e6\u00f8\u00e5"
print "Untranslated values:"
print s4                            # \u00e6\u00f8\u00e5
print s4.__class__                  # __builtins__.str.string
print len(s4)                       # 18

# Three bytes written with octal escapes in a byte literal.
s5 = b"\346\370\345"
print "ISO-8859-15 values:"
print s5                            # ???
print s5.__class__                  # __builtins__.str.string
print len(s5)                       # 3

# The same octal escapes without the b prefix: expected to remain a byte
# string.
s6 = "\346\370\345"
print "ISO-8859-15 values:"
print s6                            # ???
print s6.__class__                  # __builtins__.str.string
print len(s6)                       # 3

# A raw literal keeps the escapes as written (three four-character
# sequences, length 12); the expected class is the text class.
s7 = r"\346\370\345"
print "Untranslated values:"
print s7                            # \346\370\345
print s7.__class__                  # __builtins__.unicode.unicode
print len(s7)                       # 12

# Obtain text and print it.

# Explicitly from bytes.
# Construct text from a byte literal and an encoding name. The trailing
# comments give the expected output.
# NOTE(review): the non-ASCII literal characters appear as "?" in this copy;
# restore them from the repository before running.

u = unicode(b"???", "ISO-8859-15")
print "Unicode values:"
print u                             # ???
print u.__class__                   # __builtins__.unicode.unicode
print u.encode("ISO-8859-15")       # ???
print u.encoding                    # ISO-8859-15
print len(u)                        # 3

# Explicitly from Unicode literals.

u2 = u"???"
print "Unicode values:"
print u2                            # ???
print u2.__class__                  # __builtins__.unicode.unicode
print u2.encode("ISO-8859-15")      # ???
print u2.encoding                   # ISO-8859-15
print len(u2)                       # 3

# Implicitly from string literals.

# No prefix is given; the expected class and encoding are the same as for
# the u-prefixed literal above.
u3 = "???"
print "Unicode values:"
print u3                            # ???
print u3.__class__                  # __builtins__.unicode.unicode
print u3.encode("ISO-8859-15")      # ???
print u3.encoding                   # ISO-8859-15
print len(u3)                       # 3

# Explicitly from implicitly-converted literal.

u4 = unicode("???", "ISO-8859-15")
print "Unicode values:"
print u4                            # ???
print u4.__class__                  # __builtins__.unicode.unicode
print u4.encode("ISO-8859-15")      # ???
print u4.encoding                   # ISO-8859-15
print len(u4)                       # 3

# Test Unicode values.

# \u escapes in an unprefixed literal are expected to be translated into
# three characters.
u5 = "\u00e6\u00f8\u00e5"
print "Unicode values:"
print u5                            # ???
print u5.__class__                  # __builtins__.unicode.unicode
print len(u5)                       # 3

# Test some untranslated values.
# Escaped backslashes keep the \u sequences as literal text: three
# six-character sequences (length 18). The trailing comments give the
# expected output.

u6 = "\\u00e6\\u00f8\\u00e5"
print "Untranslated values:"
print u6                            # \u00e6\u00f8\u00e5
print u6.__class__                  # __builtins__.unicode.unicode
print len(u6)                       # 18

# Test Unicode values.

# Octal escapes in a u-prefixed literal are expected to give three
# characters.
u7 = u"\346\370\345"
print "Unicode values:"
print u7                            # ???
print u7.__class__                  # __builtins__.unicode.unicode
print len(u7)                       # 3

# Test untranslated values in a raw Unicode literal.

# The ur prefix keeps the escapes as written: three four-character
# sequences (length 12).
u8 = ur"\346\370\345"
print "Untranslated values:"
print u8                            # \346\370\345
print u8.__class__                  # __builtins__.unicode.unicode
print len(u8)                       # 12

# Test invalid sequences.

# s is the three-character byte string defined earlier in the script; the
# test expects decoding it as UTF-8 to raise UnicodeDecodeError.
try:
    u9 = unicode(s, "UTF-8")
except UnicodeDecodeError, exc:
    print "Attempt to decode", s, "as UTF-8 failed."

# Mix Unicode and byte values.

# With a \x escape present, the expected result is a byte string in which
# the \u sequences stay untranslated: 6 + 1 + 6 = 13 characters.
u10 = "\u00e6\xf8\u00e5"
print "ISO-8859-15 values:"
print u10                           # \u00e6?\u00e5
print u10.__class__                 # __builtins__.str.string
print len(u10)                      # 13

# Combine bytes and text.
# The text should be decoded.
# NOTE(review): the expected result class is the byte string class, so
# "encoded" may be the intended word here and below -- confirm.

su = s + u
print "ISO-8859-15 values:"
print su                            # ??????
print su.__class__                  # __builtins__.str.string
print len(su)                       # 6

# Combine text and bytes.
# The text should be decoded.

us = u + s
print "ISO-8859-15 values:"
print us                            # ??????
print us.__class__                  # __builtins__.str.string
print len(us)                       # 6

# Combine text and text.
# Concatenating two text values is expected to give text that keeps the
# ISO-8859-15 encoding attribute. The trailing comments give the expected
# output.

uu2 = u + u2
print "Unicode values:"
print uu2                           # ??????
print uu2.__class__                 # __builtins__.unicode.unicode
print uu2.encoding                  # ISO-8859-15
print len(uu2)                      # 6

# Inspect and update the encoding of stdout.
# Note that su and us are byte strings and are not recoded.

print sys.stdout                    # <libc.io.sysstream instance>
print sys.stdout.encoding           # None

sys.stdout.encoding = "ISO-8859-15"
print "ISO-8859-15 and Unicode values as ISO-8859-15:"
print sys.stdout.encoding           # ISO-8859-15
print u                             # ???
print su                            # ??????
print us                            # ??????

# With UTF-8 output the three-character text is expected to print as six
# "?" in this copy -- presumably two bytes per character; confirm against
# the repository version.
sys.stdout.encoding = "UTF-8"
print "Unicode values as UTF-8:"
print sys.stdout.encoding           # UTF-8
print u                             # ??????
print "ISO-8859-15 values bypassing UTF-8 output encoding:"
print su                            # ??????
print us                            # ??????

# Reset the encoding.

sys.stdout.encoding = "ISO-8859-15"

# Test character access.

# Indexing and slicing are expected to return text values that keep the
# encoding attribute of the original.
u0 = u[0]
print u0.__class__                  # __builtins__.unicode.unicode
print u0.encoding                   # ISO-8859-15
print u0                            # ?
print u[-1]                         # ?
print len(u[0])                     # 1
print len(u[-1])                    # 1
print u[:2]                         # ??
print len(u[:2])                    # 2
print u[-1::-1]                     # ???
print len(u[-1::-1])                # 3

# Test character values.
# ord of a single character gives its code point; 230 is 0xE6. The trailing
# comments give the expected output.

print ord(u[0])                     # 230

# ord of the whole three-character value is expected to fail with
# ValueError; the handler reports the offending value.
try:
    print ord(u)                    # should raise an exception
except ValueError, exc:
    print "ord(u): value is not appropriate", repr(exc.value)

# NOTE(review): the literal below appears as "?" in this copy; the expected
# repr and ord (8364 == 0x20AC) indicate it is the euro sign -- restore it
# from the repository before running.
euro = "?"
print euro                          # ?
print repr(euro)                    # "\u20ac"
print ord(euro)                     # 8364
print "\u20ac"                      # ?
# unichr is expected to be the inverse of ord for this character.
print unichr(ord(euro))             # ?
print unichr(ord(euro)) == euro     # True