Lichen

tests/unicode.py

843:d305986d05c8
2018-07-05 Paul Boddie Employed sets for attributes and providers referenced by accesses. This causes various attributes to be identified definitively in the access plans and instruction sequences.
# -*- coding: ISO-8859-15 -*-

# Exercises byte string and Unicode string handling: literal forms, explicit
# and implicit conversion, concatenation, output encoding, indexing/slicing,
# and ord/unichr. The trailing comment on each print statement records the
# expected output.
#
# NOTE(review): in this copy every non-ASCII character appears as "?". Judging
# by the equivalent escapes used below ("\xe6\xf8\xe5" and "\u20ac"), the "???"
# literals presumably held three ISO-8859-15 characters and "?" held the euro
# sign -- confirm against the repository copy before running.

import sys

# Print bytes.

s = b"???"
print "ISO-8859-15 values:"
print s                             # ???
print len(s)                        # 3

# Adjacent literals are concatenated; the result is still measured in bytes.

s1 = b"???" \
      "???"
print "ISO-8859-15 values:"
print s1                            # ??????
print len(s1)                       # 6

# Byte literal written with hexadecimal escapes.

s2 = b"\xe6\xf8\xe5"
print "ISO-8859-15 values:"
print s2                            # ???
print s2.__class__                  # __builtins__.str.string
print len(s2)                       # 3

# An unprefixed literal containing \x escapes is expected to be a byte string.

s3 = "\xe6\xf8\xe5"
print "ISO-8859-15 values:"
print s3                            # ???
print s3.__class__                  # __builtins__.str.string
print len(s3)                       # 3

# \u escapes are not translated in a byte literal: each stays as six
# characters, giving a length of 18.

s4 = b"\u00e6\u00f8\u00e5"
print "Untranslated values:"
print s4                            # \u00e6\u00f8\u00e5
print s4.__class__                  # __builtins__.str.string
print len(s4)                       # 18

# Octal escapes: \346, \370 and \345 are the same byte values as
# \xe6, \xf8 and \xe5 above.

s5 = b"\346\370\345"
print "ISO-8859-15 values:"
print s5                            # ???
print s5.__class__                  # __builtins__.str.string
print len(s5)                       # 3

s6 = "\346\370\345"
print "ISO-8859-15 values:"
print s6                            # ???
print s6.__class__                  # __builtins__.str.string
print len(s6)                       # 3

# A raw literal leaves the escapes untranslated (four characters each, 12 in
# total) and is expected to be text rather than bytes.

s7 = r"\346\370\345"
print "Untranslated values:"
print s7                            # \346\370\345
print s7.__class__                  # __builtins__.unicode.utf8string
print len(s7)                       # 12

# Obtain text and print it.

# Explicitly from bytes.

u = unicode(b"???", "ISO-8859-15")
print "Unicode values:"
print u                             # ???
print u.__class__                   # __builtins__.unicode.utf8string
print u.encode("ISO-8859-15")       # ???
print u.encoding                    # ISO-8859-15
print len(u)                        # 3

# Explicitly from Unicode literals.

u2 = u"???"
print "Unicode values:"
print u2                            # ???
print u2.__class__                  # __builtins__.unicode.utf8string
print u2.encode("ISO-8859-15")      # ???
print u2.encoding                   # ISO-8859-15
print len(u2)                       # 3

# Implicitly from string literals.

u3 = "???"
print "Unicode values:"
print u3                            # ???
print u3.__class__                  # __builtins__.unicode.utf8string
print u3.encode("ISO-8859-15")      # ???
print u3.encoding                   # ISO-8859-15
print len(u3)                       # 3

# Explicitly from implicitly-converted literal.

u4 = unicode("???", "ISO-8859-15")
print "Unicode values:"
print u4                            # ???
print u4.__class__                  # __builtins__.unicode.utf8string
print u4.encode("ISO-8859-15")      # ???
print u4.encoding                   # ISO-8859-15
print len(u4)                       # 3

# Test Unicode values.
# \u escapes in an unprefixed literal are expected to be translated, giving
# three characters.

u5 = "\u00e6\u00f8\u00e5"
print "Unicode values:"
print u5                            # ???
print u5.__class__                  # __builtins__.unicode.utf8string
print len(u5)                       # 3

# Test some untranslated values.
# The doubled backslashes keep each \uXXXX sequence as six literal characters.

u6 = "\\u00e6\\u00f8\\u00e5"
print "Untranslated values:"
print u6                            # \u00e6\u00f8\u00e5
print u6.__class__                  # __builtins__.unicode.utf8string
print len(u6)                       # 18

# Test Unicode values.
# Octal escapes in a Unicode literal are expected to be translated.

u7 = u"\346\370\345"
print "Unicode values:"
print u7                            # ???
print u7.__class__                  # __builtins__.unicode.utf8string
print len(u7)                       # 3

# Test some untranslated values.
# A raw Unicode literal leaves the octal escapes as written (12 characters).

u8 = ur"\346\370\345"
print "Untranslated values:"
print u8                            # \346\370\345
print u8.__class__                  # __builtins__.unicode.utf8string
print len(u8)                       # 12

# Test invalid sequences.
# NOTE(review): this relies on s holding bytes that are not valid UTF-8, which
# presumably is the case for the original ISO-8859-15 literal -- confirm.

try:
    u9 = unicode(s, "UTF-8")
except UnicodeDecodeError, exc:
    print "Attempt to decode", s, "as UTF-8 failed."

# Mix Unicode and byte values.
# The \x escape is expected to make the whole literal a byte string, leaving
# the \u escapes untranslated (6 + 1 + 6 = 13 bytes).

u10 = "\u00e6\xf8\u00e5"
print "ISO-8859-15 values:"
print u10                           # \u00e6?\u00e5
print u10.__class__                 # __builtins__.str.string
print len(u10)                      # 13

# Combine bytes and text.
# The text should be decoded.

su = s + u
print "ISO-8859-15 values:"
print su                            # ??????
print su.__class__                  # __builtins__.str.string
print len(su)                       # 6

# Combine text and bytes.
# The text should be decoded.

us = u + s
print "ISO-8859-15 values:"
print us                            # ??????
print us.__class__                  # __builtins__.str.string
print len(us)                       # 6

# Combine text and text.

uu2 = u + u2
print "Unicode values:"
print uu2                           # ??????
print uu2.__class__                 # __builtins__.unicode.utf8string
print uu2.encoding                  # ISO-8859-15
print len(uu2)                      # 6

# Inspect and update the encoding of stdout.
# Note that su and us are byte strings and are not recoded.

print sys.stdout                    # <libc.io.sysstream instance>
print sys.stdout.encoding           # None

sys.stdout.encoding = "ISO-8859-15"
print "ISO-8859-15 and Unicode values as ISO-8859-15:"
print sys.stdout.encoding           # ISO-8859-15
print u                             # ???
print su                            # ??????
print us                            # ??????

# With UTF-8 output the three characters of u are expected to occupy six
# bytes, whereas the byte strings su and us are written unchanged.

sys.stdout.encoding = "UTF-8"
print "Unicode values as UTF-8:"
print sys.stdout.encoding           # UTF-8
print u                             # ??????
print "ISO-8859-15 values bypassing UTF-8 output encoding:"
print su                            # ??????
print us                            # ??????

# Reset the encoding.

sys.stdout.encoding = "ISO-8859-15"

# Test character access.
# Indexing and slicing are expected to work in characters and to yield text
# that keeps the original encoding.

u0 = u[0]
print u0.__class__                  # __builtins__.unicode.utf8string
print u0.encoding                   # ISO-8859-15
print u0                            # ?
print u[-1]                         # ?
print len(u[0])                     # 1
print len(u[-1])                    # 1
print u[:2]                         # ??
print len(u[:2])                    # 2
print u[-1::-1]                     # ???
print len(u[-1::-1])                # 3

# Test character values.

print ord(u[0])                     # 230

# u has three characters, so ord is expected to reject it.

try:
    print ord(u)                    # should raise an exception
except ValueError, exc:
    print "ord(u): value is not appropriate", repr(exc.value)

euro = "?"
print euro                          # ?
print repr(euro)                    # "\u20ac"
print ord(euro)                     # 8364
print "\u20ac"                      # ?
print unichr(ord(euro))             # ?
print unichr(ord(euro)) == euro     # True