1.1 --- a/common.py Fri Jan 27 15:27:17 2017 +0100
1.2 +++ b/common.py Fri Jan 27 23:54:18 2017 +0100
1.3 @@ -20,11 +20,11 @@
1.4 this program. If not, see <http://www.gnu.org/licenses/>.
1.5 """
1.6
1.7 +from compiler.transformer import Transformer
1.8 from errors import InspectError
1.9 from os import listdir, makedirs, remove
1.10 from os.path import exists, isdir, join, split
1.11 from results import ConstantValueRef, LiteralSequenceRef, NameRef
1.12 -from compiler.transformer import Transformer
1.13 import compiler.ast
1.14
1.15 class CommonOutput:
1.16 @@ -248,7 +248,7 @@
1.17 elif isinstance(value, str) and self.encoding:
1.18 if not literal.startswith("b"):
1.19 try:
1.20 - return unicode(value, self.encoding).encode("utf-8"), "unicode", self.encoding
1.21 + return get_string_details(literal, self.encoding)
1.22 except UnicodeDecodeError:
1.23 pass
1.24
1.25 @@ -964,6 +964,175 @@
1.26 x.sort()
1.27 return ", ".join(x)
1.28
def get_string_details(s, encoding):

    """
    Determine whether 's' represents a Unicode string or a byte string, using
    'encoding' to interpret byte sequences. The content of 's' is the full
    literal representation including prefix and quotes.

    Find and convert Unicode values starting with <backslash>u or <backslash>U,
    and byte or Unicode values starting with <backslash><octal digit> or
    <backslash>x. Octal values consist of one to three octal digits, so that
    <backslash>0 is handled as well as <backslash>000.

    Literals prefixed with "u" cause <backslash><octal digit> and <backslash>x
    to be considered as Unicode values. Otherwise, they produce byte values and
    cause unprefixed strings to be considered as byte strings.

    Literals prefixed with "r" do not have <backslash><octal digit>,
    <backslash>x or the other special sequences (for things like newlines)
    converted. Only <backslash>u and <backslash>U values are converted.

    Return the encoded literal value, type name, and original encoding as a
    tuple. The value is encoded as UTF-8 when the type name is "unicode", and
    uses 'encoding' when the type name is "str".

    Raise ValueError if the literal has an unsupported prefix.
    """

    l = []
    typename = "unicode"

    # Identify the quote character and use it to identify the prefix.

    quote_type = s[-1]
    prefix_end = s.find(quote_type)
    prefix = s[:prefix_end].lower()

    if prefix not in ("", "b", "br", "r", "u", "ur"):
        raise ValueError("String literal does not have a supported prefix: %s" % s)

    raw = "r" in prefix

    # Identify triple quotes or single quotes, obtaining the positions of the
    # first character of the content and of the first closing quote character.

    if len(s) >= 6 and s[-2] == quote_type and s[-3] == quote_type:
        current = prefix_end + 3
        end = len(s) - 3
    else:
        current = prefix_end + 1
        end = len(s) - 1

    # Conversions of some quoted values: the total length of the sequence
    # including the backslash and introducing letter, plus the numeric base.

    searches = {
        "u" : (6, 16),
        "U" : (10, 16),
        "x" : (4, 16),
        }

    octal_digits = "01234567"

    # Translations of some quoted values. A backslash followed by a newline
    # is a line continuation and contributes nothing to the value.

    escaped = {
        "\\" : "\\", "'" : "'", '"' : '"',
        "a" : "\a", "b" : "\b", "f" : "\f",
        "n" : "\n", "r" : "\r", "t" : "\t",
        "v" : "\v", "\n" : "",
        }

    while current < end:

        # Look for quoted values.

        index = s.find("\\", current)
        if index == -1 or index + 1 == end:
            l.append(s[current:end])
            break

        # Add the preceding text.

        l.append(s[current:index])

        # Handle quoted text.

        term = s[index+1]

        # Determine the extent and base of any numeric value. Octal values
        # have a variable length: take up to three digits without running into
        # the closing quote.

        if term in octal_digits:
            digits = 1
            while digits < 3 and index + 1 + digits < end and \
                s[index + 1 + digits] in octal_digits:
                digits += 1
            needed, base = 1 + digits, 8
        elif term in searches:
            needed, base = searches[term]
        else:
            needed = base = None

        # Add Unicode values. Where a string is u-prefixed, even \o and \x
        # produce Unicode values.

        if needed is not None and (term in ("u", "U") or prefix == "u"):
            value = convert_quoted_value(s, index, needed, end, base, unichr)
            l.append(value)
            current = index + needed

        # Add raw byte values, changing the string type. Only \o and \x can
        # reach this point since \u and \U are handled above.

        elif needed is not None and not raw:
            value = convert_quoted_value(s, index, needed, end, base, chr)
            l.append(value)
            typename = "str"
            current = index + needed

        # Add other escaped values.

        elif not raw and term in escaped:
            l.append(escaped[term])
            current = index + 2

        # Add other text as found, retaining the backslash.

        else:
            l.append(s[index:index+2])
            current = index + 2

    # For byte string values, convert any Unicode values to the original
    # encoding.

    if typename == "str":
        out = []
        for value in l:
            if isinstance(value, unicode):
                out.append(value.encode(encoding))
            else:
                out.append(value)
        out = "".join(out)

    # For Unicode values, convert byte sequences to Unicode.

    else:
        out = []
        for value in l:
            if isinstance(value, unicode):
                out.append(value)
            else:
                out.append(unicode(value, encoding))
        out = "".join(out).encode("utf-8")

    return out, typename, encoding
1.169 +
def convert_quoted_value(s, index, needed, end, base, fn):

    """
    Interpret a quoted value in 's' at 'index' (the position of the introducing
    backslash) with the given 'needed' number of positions, and with the given
    'end' indicating the first position after the end of the actual string
    content.

    Use 'base' as the numerical base when interpreting the value, and use 'fn'
    to convert the value to an appropriate type.

    Return the result of 'fn' for a well-formed value. Where the value is
    incomplete, is not a number in the given base, or is rejected by 'fn' as
    being out of range, return the corresponding source text unchanged instead.
    """

    text = s[index:min(index + needed, end)]

    # Not a complete occurrence.

    if len(text) < needed:
        return text

    # Octal values directly follow the backslash. Other values also have an
    # introducing letter to be skipped.

    first = 1 if base == 8 else 2

    # Test for a well-formed value, also treating values that the conversion
    # function cannot represent as not being well-formed.

    try:
        return fn(int(text[first:needed], base))
    except (ValueError, OverflowError):
        return text
1.197 +
1.198 # Attribute chain decoding.
1.199
1.200 def get_attrnames(attrnames):
3.1 --- a/tests/unicode.py Fri Jan 27 15:27:17 2017 +0100
3.2 +++ b/tests/unicode.py Fri Jan 27 23:54:18 2017 +0100
3.3 @@ -5,14 +5,52 @@
3.4 # Print bytes.
3.5
3.6 s = b"ÆØÅ"
3.7 +print "ISO-8859-1 values:"
3.8 print s # ÆØÅ
3.9 print len(s) # 3
3.10
3.11 +s2 = b"\xe6\xf8\xe5"
3.12 +print "ISO-8859-1 values:"
3.13 +print s2 # æøå
3.14 +print s2.__class__ # __builtins__.str.string
3.15 +print len(s2) # 3
3.16 +
3.17 +s3 = "\xe6\xf8\xe5"
3.18 +print "ISO-8859-1 values:"
3.19 +print s3 # æøå
3.20 +print s3.__class__ # __builtins__.str.string
3.21 +print len(s3) # 3
3.22 +
3.23 +s4 = b"\u00e6\u00f8\u00e5"
3.24 +print "Untranslated values:"
3.25 +print s4 # \u00e6\u00f8\u00e5
3.26 +print s4.__class__ # __builtins__.str.string
3.27 +print len(s4) # 18
3.28 +
3.29 +s5 = b"\346\370\345"
3.30 +print "ISO-8859-1 values:"
3.31 +print s5 # æøå
3.32 +print s5.__class__ # __builtins__.str.string
3.33 +print len(s5) # 3
3.34 +
3.35 +s6 = "\346\370\345"
3.36 +print "ISO-8859-1 values:"
3.37 +print s6 # æøå
3.38 +print s6.__class__ # __builtins__.str.string
3.39 +print len(s6) # 3
3.40 +
3.41 +s7 = r"\346\370\345"
3.42 +print "Untranslated values:"
3.43 +print s7 # \346\370\345
3.44 +print s7.__class__ # __builtins__.unicode.utf8string
3.45 +print len(s7) # 12
3.46 +
3.47 # Obtain text and print it.
3.48
3.49 # Explicitly from bytes.
3.50
3.51 -u = unicode("æøå", "ISO-8859-1")
3.52 +u = unicode(b"æøå", "ISO-8859-1")
3.53 +print "Unicode values:"
3.54 print u # æøå
3.55 print u.__class__ # __builtins__.unicode.utf8string
3.56 print u.encode("ISO-8859-1") # æøå
3.57 @@ -22,6 +60,7 @@
3.58 # Explicitly from Unicode literals.
3.59
3.60 u2 = u"æøå"
3.61 +print "Unicode values:"
3.62 print u2 # æøå
3.63 print u2.__class__ # __builtins__.unicode.utf8string
3.64 print u2.encode("ISO-8859-1") # æøå
3.65 @@ -31,16 +70,59 @@
3.66 # Implicitly from string literals.
3.67
3.68 u3 = "æøå"
3.69 +print "Unicode values:"
3.70 print u3 # æøå
3.71 print u3.__class__ # __builtins__.unicode.utf8string
3.72 print u3.encode("ISO-8859-1") # æøå
3.73 print u3.encoding # ISO-8859-1
3.74 print len(u3) # 3
3.75
3.76 +# Explicitly from implicitly-converted literal.
3.77 +
3.78 +u4 = unicode("æøå", "ISO-8859-1")
3.79 +print "Unicode values:"
3.80 +print u4 # æøå
3.81 +print u4.__class__ # __builtins__.unicode.utf8string
3.82 +print u4.encode("ISO-8859-1") # æøå
3.83 +print u4.encoding # ISO-8859-1
3.84 +print len(u4) # 3
3.85 +
3.86 +# Test Unicode values.
3.87 +
3.88 +u5 = "\u00e6\u00f8\u00e5"
3.89 +print "Unicode values:"
3.90 +print u5 # æøå
3.91 +print u5.__class__ # __builtins__.unicode.utf8string
3.92 +print len(u5) # 3
3.93 +
3.94 +# Test some untranslated values.
3.95 +
3.96 +u6 = "\\u00e6\\u00f8\\u00e5"
3.97 +print "Untranslated values:"
3.98 +print u6 # \u00e6\u00f8\u00e5
3.99 +print u6.__class__ # __builtins__.unicode.utf8string
3.100 +print len(u6) # 18
3.101 +
3.102 +# Test Unicode values.
3.103 +
3.104 +u7 = u"\346\370\345"
3.105 +print "Unicode values:"
3.106 +print u7 # æøå
3.107 +print u7.__class__ # __builtins__.unicode.utf8string
3.108 +print len(u7) # 3
3.109 +
3.110 +# Test some untranslated values in a raw Unicode literal.
3.111 +
3.112 +u8 = ur"\346\370\345"
3.113 +print "Untranslated values:"
3.114 +print u8 # \346\370\345
3.115 +print u8.__class__ # __builtins__.unicode.utf8string
3.116 +print len(u8) # 12
3.117 +
3.118 # Test invalid sequences.
3.119
3.120 try:
3.121 - u4 = unicode(s, "UTF-8")
3.122 + u9 = unicode(s, "UTF-8")
3.123 except UnicodeDecodeError, exc:
3.124 print "Attempt to decode", s, "as UTF-8 failed."
3.125
3.126 @@ -48,6 +130,7 @@
3.127 # The text should be decoded.
3.128
3.129 su = s + u
3.130 +print "ISO-8859-1 values:"
3.131 print su # ÆØÅæøå
3.132 print su.__class__ # __builtins__.str.string
3.133 print len(su) # 6
3.134 @@ -56,6 +139,7 @@
3.135 # The text should be decoded.
3.136
3.137 us = u + s
3.138 +print "ISO-8859-1 values:"
3.139 print us # æøåÆØÅ
3.140 print us.__class__ # __builtins__.str.string
3.141 print len(us) # 6
3.142 @@ -63,6 +147,7 @@
3.143 # Combine text and text.
3.144
3.145 uu2 = u + u2
3.146 +print "Unicode values:"
3.147 print uu2 # æøåæøå
3.148 print uu2.__class__ # __builtins__.unicode.utf8string
3.149 print uu2.encoding # ISO-8859-1
3.150 @@ -75,14 +160,17 @@
3.151 print sys.stdout.encoding # None
3.152
3.153 sys.stdout.encoding = "ISO-8859-1"
3.154 +print "ISO-8859-1 and Unicode values as ISO-8859-1:"
3.155 print sys.stdout.encoding # ISO-8859-1
3.156 print u # æøå
3.157 print su # ÆØÅæøå
3.158 print us # æøåÆØÅ
3.159
3.160 sys.stdout.encoding = "UTF-8"
3.161 +print "Unicode values as UTF-8:"
3.162 print sys.stdout.encoding # UTF-8
3.163 print u # æøå
3.164 +print "ISO-8859-1 values bypassing UTF-8 output encoding:"
3.165 print su # ÆØÅæøå
3.166 print us # æøåÆØÅ
3.167