# HG changeset patch # User Paul Boddie # Date 1486163526 -3600 # Node ID 50463cb7afae1d9cb5f3dc667165001bcc872ed2 # Parent f91b467ef5687586a2903a77dac00be8e6530bd2 Implemented appropriate character quoting for Unicode string representations. Tested ord and quoting with Unicode objects. diff -r f91b467ef568 -r 50463cb7afae lib/__builtins__/str.py --- a/lib/__builtins__/str.py Sat Feb 04 00:10:47 2017 +0100 +++ b/lib/__builtins__/str.py Sat Feb 04 00:12:06 2017 +0100 @@ -127,13 +127,7 @@ elif c == "\r": b.append("\\r") else: - if n < 0: - n += 256 - b.append("\\x") - x = hex(n, "") - if len(x) < 2: - b.append("0") - b.append(x) + self._quote_value(b, n) i += 1 last = i @@ -144,6 +138,18 @@ b.append(quote) return str(b) + def _quote_value(self, b, n): + + "Append to 'b' the quoted form of 'n'." + + if n < 0: + n += 256 + b.append("\\x") + x = hex(n, "") + if len(x) < 2: + b.append("0") + b.append(x) + def bytelength(self): "Return the number of bytes in this string." diff -r f91b467ef568 -r 50463cb7afae lib/__builtins__/unicode.py --- a/lib/__builtins__/unicode.py Sat Feb 04 00:10:47 2017 +0100 +++ b/lib/__builtins__/unicode.py Sat Feb 04 00:12:06 2017 +0100 @@ -89,6 +89,34 @@ else: return result + def _quote_value(self, b, n): + + "Append to 'b' the quoted form of 'n'." + + if n < 0: + n += 256 + + if n > 0xffff: + b.append("\\U") + digits = 8 + elif n > 0xff: + b.append("\\u") + digits = 4 + else: + b.append("\\x") + digits = 2 + + x = hex(n, "") + i = len(x) + + while i < digits: + b.append("0") + i += 1 + + b.append(x) + + # Operator methods. + def __iadd__(self, other): "Return a string combining this string with 'other'." diff -r f91b467ef568 -r 50463cb7afae tests/unicode.py --- a/tests/unicode.py Sat Feb 04 00:10:47 2017 +0100 +++ b/tests/unicode.py Sat Feb 04 00:12:06 2017 +0100 @@ -1,22 +1,22 @@ -# -*- coding: ISO-8859-1 -*- +# -*- coding: ISO-8859-15 -*- import sys # Print bytes. 
s = b"ÆØÅ" -print "ISO-8859-1 values:" +print "ISO-8859-15 values:" print s # ÆØÅ print len(s) # 3 s2 = b"\xe6\xf8\xe5" -print "ISO-8859-1 values:" +print "ISO-8859-15 values:" print s2 # æøå print s2.__class__ # __builtins__.str.string print len(s2) # 3 s3 = "\xe6\xf8\xe5" -print "ISO-8859-1 values:" +print "ISO-8859-15 values:" print s3 # æøå print s3.__class__ # __builtins__.str.string print len(s3) # 3 @@ -28,13 +28,13 @@ print len(s4) # 18 s5 = b"\346\370\345" -print "ISO-8859-1 values:" +print "ISO-8859-15 values:" print s5 # æøå print s5.__class__ # __builtins__.str.string print len(s5) # 3 s6 = "\346\370\345" -print "ISO-8859-1 values:" +print "ISO-8859-15 values:" print s6 # æøå print s6.__class__ # __builtins__.str.string print len(s6) # 3 @@ -49,12 +49,12 @@ # Explicitly from bytes. -u = unicode(b"æøå", "ISO-8859-1") +u = unicode(b"æøå", "ISO-8859-15") print "Unicode values:" print u # æøå print u.__class__ # __builtins__.unicode.utf8string -print u.encode("ISO-8859-1") # æøå -print u.encoding # ISO-8859-1 +print u.encode("ISO-8859-15") # æøå +print u.encoding # ISO-8859-15 print len(u) # 3 # Explicitly from Unicode literals. @@ -63,8 +63,8 @@ print "Unicode values:" print u2 # æøå print u2.__class__ # __builtins__.unicode.utf8string -print u2.encode("ISO-8859-1") # æøå -print u2.encoding # ISO-8859-1 +print u2.encode("ISO-8859-15") # æøå +print u2.encoding # ISO-8859-15 print len(u2) # 3 # Implicitly from string literals. @@ -73,18 +73,18 @@ print "Unicode values:" print u3 # æøå print u3.__class__ # __builtins__.unicode.utf8string -print u3.encode("ISO-8859-1") # æøå -print u3.encoding # ISO-8859-1 +print u3.encode("ISO-8859-15") # æøå +print u3.encoding # ISO-8859-15 print len(u3) # 3 # Explicitly from implicitly-converted literal. 
-u4 = unicode("æøå", "ISO-8859-1") +u4 = unicode("æøå", "ISO-8859-15") print "Unicode values:" print u4 # æøå print u4.__class__ # __builtins__.unicode.utf8string -print u4.encode("ISO-8859-1") # æøå -print u4.encoding # ISO-8859-1 +print u4.encode("ISO-8859-15") # æøå +print u4.encoding # ISO-8859-15 print len(u4) # 3 # Test Unicode values. @@ -130,7 +130,7 @@ # The text should be decoded. su = s + u -print "ISO-8859-1 values:" +print "ISO-8859-15 values:" print su # ÆØÅæøå print su.__class__ # __builtins__.str.string print len(su) # 6 @@ -139,7 +139,7 @@ # The text should be decoded. us = u + s -print "ISO-8859-1 values:" +print "ISO-8859-15 values:" print us # æøåÆØÅ print us.__class__ # __builtins__.str.string print len(us) # 6 @@ -150,7 +150,7 @@ print "Unicode values:" print uu2 # æøåæøå print uu2.__class__ # __builtins__.unicode.utf8string -print uu2.encoding # ISO-8859-1 +print uu2.encoding # ISO-8859-15 print len(uu2) # 6 # Inspect and update the encoding of stdout. @@ -159,9 +159,9 @@ print sys.stdout # print sys.stdout.encoding # None -sys.stdout.encoding = "ISO-8859-1" -print "ISO-8859-1 and Unicode values as ISO-8859-1:" -print sys.stdout.encoding # ISO-8859-1 +sys.stdout.encoding = "ISO-8859-15" +print "ISO-8859-15 and Unicode values as ISO-8859-15:" +print sys.stdout.encoding # ISO-8859-15 print u # æøå print su # ÆØÅæøå print us # æøåÆØÅ @@ -170,19 +170,19 @@ print "Unicode values as UTF-8:" print sys.stdout.encoding # UTF-8 print u # æøå -print "ISO-8859-1 values bypassing UTF-8 output encoding:" +print "ISO-8859-15 values bypassing UTF-8 output encoding:" print su # ÆØÅæøå print us # æøåÆØÅ # Reset the encoding. -sys.stdout.encoding = "ISO-8859-1" +sys.stdout.encoding = "ISO-8859-15" # Test character access. 
u0 = u[0] print u0.__class__ # __builtins__.unicode.utf8string -print u0.encoding # ISO-8859-1 +print u0.encoding # ISO-8859-15 print u0 # æ print u[-1] # å print len(u[0]) # 1 @@ -200,3 +200,9 @@ print ord(u) # should raise an exception except ValueError, exc: print "ord(u): value is not appropriate", repr(exc.value) + +euro = "€" +print euro # € +print repr(euro) # "\u20ac" +print ord(euro) # 8364 +print "\u20ac" # €