# HG changeset patch
# User Paul Boddie
# Date 1485557658 -3600
# Node ID 8640a7748b1229e100d2057c263b8afd759d0dcd
# Parent  b99c11afb6f52a39d531c42bbcc596b69e5bab34
Support backslash-encoded values more thoroughly, interpreting certain encoded
values as bytes that turn unprefixed strings into byte strings, diverging from
Python behaviour. Make sure that backslashes are escaped in generated programs.

diff -r b99c11afb6f5 -r 8640a7748b12 common.py
--- a/common.py	Fri Jan 27 15:27:17 2017 +0100
+++ b/common.py	Fri Jan 27 23:54:18 2017 +0100
@@ -20,11 +20,11 @@
 this program. If not, see <http://www.gnu.org/licenses/>.
 """
 
+from compiler.transformer import Transformer
 from errors import InspectError
 from os import listdir, makedirs, remove
 from os.path import exists, isdir, join, split
 from results import ConstantValueRef, LiteralSequenceRef, NameRef
-from compiler.transformer import Transformer
 import compiler.ast
 
 class CommonOutput:
@@ -248,7 +248,7 @@
         elif isinstance(value, str) and self.encoding:
             if not literal.startswith("b"):
                 try:
-                    return unicode(value, self.encoding).encode("utf-8"), "unicode", self.encoding
+                    return get_string_details(literal, self.encoding)
                 except UnicodeDecodeError:
                     pass
 
@@ -964,6 +964,175 @@
     x.sort()
     return ", ".join(x)
 
+def get_string_details(s, encoding):
+
+    """
+    Determine whether 's' represents a Unicode string or a byte string, using
+    'encoding' to interpret byte sequences. The contents of 's' is the full
+    literal representation including prefix and quotes.
+
+    Find and convert Unicode values starting with <backslash>u or <backslash>U,
+    and byte or Unicode values starting with <backslash><octal digit> or
+    <backslash>x.
+
+    Literals prefixed with "u" cause <backslash><octal digit> and <backslash>x
+    to be considered as Unicode values. Otherwise, they produce byte values and
+    cause unprefixed strings to be considered as byte strings.
+
+    Literals prefixed with "r" do not have their backslash-encoded values
+    converted unless also prefixed with "u", in which case only the above value
+    formats are converted, not any of the other special sequences for things
+    like newlines.
+
+    Return the encoded literal value, type name, and original encoding as a
+    tuple.
+    """
+
+    l = []
+    typename = "unicode"
+
+    # Identify the quote character and use it to identify the prefix.
+
+    quote_type = s[-1]
+    prefix_end = s.find(quote_type)
+    prefix = s[:prefix_end].lower()
+
+    if prefix not in ("", "b", "br", "r", "u", "ur"):
+        raise ValueError, "String literal does not have a supported prefix: %s" % s
+
+    # Identify triple quotes or single quotes.
+
+    if len(s) >= 6 and s[-2] == quote_type and s[-3] == quote_type:
+        quote = s[prefix_end:prefix_end+3]
+        current = prefix_end + 3
+        end = len(s) - 3
+    else:
+        quote = s[prefix_end]
+        current = prefix_end + 1
+        end = len(s) - 1
+
+    # Conversions of some quoted values.
+
+    searches = {
+        "u" : (6, 16),
+        "U" : (10, 16),
+        "x" : (4, 16),
+        }
+
+    octal_digits = map(str, range(0, 8))
+
+    # Translations of some quoted values.
+
+    escaped = {
+        "\\" : "\\", "'" : "'", '"' : '"',
+        "a" : "\a", "b" : "\b", "f" : "\f",
+        "n" : "\n", "r" : "\r", "t" : "\t",
+        }
+
+    while current < end:
+
+        # Look for quoted values.
+
+        index = s.find("\\", current)
+        if index == -1 or index + 1 == end:
+            l.append(s[current:end])
+            break
+
+        # Add the preceding text.
+
+        l.append(s[current:index])
+
+        # Handle quoted text.
+
+        term = s[index+1]
+
+        # Add Unicode values. Where a string is u-prefixed, even \o and \x
+        # produce Unicode values.
+
+        if term in ("u", "U") or prefix == "u" and (
+            term == "x" or term in octal_digits):
+
+            needed, base = searches.get(term, (4, 8))
+            value = convert_quoted_value(s, index, needed, end, base, unichr)
+            l.append(value)
+            current = index + needed
+
+        # Add raw byte values, changing the string type.
+
+        elif "r" not in prefix and (
+             term == "x" or term in octal_digits):
+
+            needed, base = searches.get(term, (4, 8))
+            value = convert_quoted_value(s, index, needed, end, base, chr)
+            l.append(value)
+            typename = "str"
+            current = index + needed
+
+        # Add other escaped values.
+
+        elif "r" not in prefix and escaped.has_key(term):
+            l.append(escaped[term])
+            current = index + 2
+
+        # Add other text as found.
+
+        else:
+            l.append(s[index:index+2])
+            current = index + 2
+
+    # For byte string values, convert any Unicode values to the original
+    # encoding.
+
+    if typename == "str":
+        out = []
+        for value in l:
+            if isinstance(value, unicode):
+                out.append(value.encode(encoding))
+            else:
+                out.append(value)
+        out = "".join(out)
+
+    # For Unicode values, convert byte sequences to Unicode.
+
+    else:
+        out = []
+        for value in l:
+            if isinstance(value, unicode):
+                out.append(value)
+            else:
+                out.append(unicode(value, encoding))
+        out = "".join(out).encode("utf-8")
+
+    return out, typename, encoding
+
+def convert_quoted_value(s, index, needed, end, base, fn):
+
+    """
+    Interpret a quoted value in 's' at 'index' with the given 'needed' number
+    of positions, and with the given 'end' indicating the first position after
+    the end of the actual string content.
+
+    Use 'base' as the numerical base when interpreting the value, and use 'fn'
+    to convert the value to an appropriate type.
+    """
+
+    s = s[index:min(index+needed, end)]
+
+    # Not a complete occurrence.
+
+    if len(s) < needed:
+        return s
+
+    # Test for a well-formed value.
+
+    try:
+        first = base == 8 and 1 or 2
+        value = int(s[first:needed], base)
+    except ValueError:
+        return s
+    else:
+        return fn(value)
+
 # Attribute chain decoding.
 
 def get_attrnames(attrnames):
diff -r b99c11afb6f5 -r 8640a7748b12 encoders.py
--- a/encoders.py	Fri Jan 27 15:27:17 2017 +0100
+++ b/encoders.py	Fri Jan 27 23:54:18 2017 +0100
@@ -376,6 +376,7 @@
         elif c == '\n': l.append('\\n')
         elif c == '\t': l.append('\\t')
         elif c == '\r': l.append('\\r')
+        elif c == '\\': l.append('\\\\')
         elif 0x20 <= ord(c) < 0x80: l.append(c)
         else: l.append("\\x%02x" % ord(c))
 
@@ -404,6 +405,8 @@
 
     return "__constvalue%d" % n
 
+
+
 # Track all encoded paths, detecting and avoiding conflicts.
 
 all_encoded_paths = {}
diff -r b99c11afb6f5 -r 8640a7748b12 tests/unicode.py
--- a/tests/unicode.py	Fri Jan 27 15:27:17 2017 +0100
+++ b/tests/unicode.py	Fri Jan 27 23:54:18 2017 +0100
@@ -5,14 +5,52 @@
 # Print bytes.
 
s = b"ÆØÅ" +print "ISO-8859-1 values:" print s # ÆØÅ print len(s) # 3 +s2 = b"\xe6\xf8\xe5" +print "ISO-8859-1 values:" +print s2 # æøå +print s2.__class__ # __builtins__.str.string +print len(s2) # 3 + +s3 = "\xe6\xf8\xe5" +print "ISO-8859-1 values:" +print s3 # æøå +print s3.__class__ # __builtins__.str.string +print len(s3) # 3 + +s4 = b"\u00e6\u00f8\u00e5" +print "Untranslated values:" +print s4 # \u00e6\u00f8\u00e5 +print s4.__class__ # __builtins__.str.string +print len(s4) # 18 + +s5 = b"\346\370\345" +print "ISO-8859-1 values:" +print s5 # æøå +print s5.__class__ # __builtins__.str.string +print len(s5) # 3 + +s6 = "\346\370\345" +print "ISO-8859-1 values:" +print s6 # æøå +print s6.__class__ # __builtins__.str.string +print len(s6) # 3 + +s7 = r"\346\370\345" +print "Untranslated values:" +print s7 # \346\370\345 +print s7.__class__ # __builtins__.unicode.utf8string +print len(s7) # 12 + # Obtain text and print it. # Explicitly from bytes. -u = unicode("æøå", "ISO-8859-1") +u = unicode(b"æøå", "ISO-8859-1") +print "Unicode values:" print u # æøå print u.__class__ # __builtins__.unicode.utf8string print u.encode("ISO-8859-1") # æøå @@ -22,6 +60,7 @@ # Explicitly from Unicode literals. u2 = u"æøå" +print "Unicode values:" print u2 # æøå print u2.__class__ # __builtins__.unicode.utf8string print u2.encode("ISO-8859-1") # æøå @@ -31,16 +70,59 @@ # Implicitly from string literals. u3 = "æøå" +print "Unicode values:" print u3 # æøå print u3.__class__ # __builtins__.unicode.utf8string print u3.encode("ISO-8859-1") # æøå print u3.encoding # ISO-8859-1 print len(u3) # 3 +# Explicitly from implicitly-converted literal. + +u4 = unicode("æøå", "ISO-8859-1") +print "Unicode values:" +print u4 # æøå +print u4.__class__ # __builtins__.unicode.utf8string +print u4.encode("ISO-8859-1") # æøå +print u4.encoding # ISO-8859-1 +print len(u4) # 3 + +# Test Unicode values. + +u5 = "\u00e6\u00f8\u00e5" +print "Unicode values:" +print u5 # æøå +print u5.__class__ # __builtins__.unicode.ut8string +print len(u5) # 3 + +# Test some untranslated values. + +u6 = "\\u00e6\\u00f8\\u00e5" +print "Untranslated values:" +print u6 # \u00e6\u00f8\u00e5 +print u6.__class__ # __builtins__.unicode.ut8string +print len(u6) # 18 + +# Test Unicode values. + +u7 = u"\346\370\345" +print "Unicode values:" +print u7 # æøå +print u7.__class__ # __builtins__.unicode.ut8string +print len(u7) # 3 + +# Test Unicode values. + +u8 = ur"\346\370\345" +print "Untranslated values:" +print u8 # \346\370\345 +print u8.__class__ # __builtins__.unicode.ut8string +print len(u8) # 12 + # Test invalid sequences. try: - u4 = unicode(s, "UTF-8") + u9 = unicode(s, "UTF-8") except UnicodeDecodeError, exc: print "Attempt to decode", s, "as UTF-8 failed." @@ -48,6 +130,7 @@ # The text should be decoded. su = s + u +print "ISO-8859-1 values:" print su # ÆØÅæøå print su.__class__ # __builtins__.str.string print len(su) # 6 @@ -56,6 +139,7 @@ # The text should be decoded. us = u + s +print "ISO-8859-1 values:" print us # æøåÆØÅ print us.__class__ # __builtins__.str.string print len(us) # 6 @@ -63,6 +147,7 @@ # Combine text and text. 
 uu2 = u + u2
+print "Unicode values:"
 print uu2                   # æøåæøå
 print uu2.__class__         # __builtins__.unicode.utf8string
 print uu2.encoding          # ISO-8859-1
@@ -75,14 +160,17 @@
 print sys.stdout.encoding   # None
 
 sys.stdout.encoding = "ISO-8859-1"
+print "ISO-8859-1 and Unicode values as ISO-8859-1:"
 print sys.stdout.encoding   # ISO-8859-1
 print u                     # æøå
 print su                    # ÆØÅæøå
 print us                    # æøåÆØÅ
 
 sys.stdout.encoding = "UTF-8"
+print "Unicode values as UTF-8:"
 print sys.stdout.encoding   # UTF-8
 print u                     # æøå
+print "ISO-8859-1 values bypassing UTF-8 output encoding:"
 print su                    # ÆØÅæøå
 print us                    # æøåÆØÅ
 
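A minimal usage sketch, not part of the patch, showing how the get_string_details
function added to common.py classifies a few literal forms under the rules in its
docstring. It assumes a Python 2 interpreter with the Lichen source directory on
the module path; the expected tuples follow the conversions exercised by
tests/unicode.py above.

# Python 2 sketch: exercise get_string_details() from common.py as added by
# this changeset. Assumes the Lichen source directory is on sys.path so that
# common and the modules it imports can be found.

from common import get_string_details

# Unprefixed literal with \x escapes: the escapes become bytes, so the
# literal is classified as a byte string ("str").
print get_string_details(r'"\xe6\xf8\xe5"', "ISO-8859-1")
# -> ('\xe6\xf8\xe5', 'str', 'ISO-8859-1')

# u-prefixed literal with \u escapes: remains Unicode, returned as UTF-8.
print get_string_details(r'u"\u00e6\u00f8\u00e5"', "ISO-8859-1")
# -> ('\xc3\xa6\xc3\xb8\xc3\xa5', 'unicode', 'ISO-8859-1')

# r-prefixed literal: the escapes are left untranslated and the literal
# stays Unicode.
print get_string_details(r'r"\346\370\345"', "ISO-8859-1")
# -> ('\\346\\370\\345', 'unicode', 'ISO-8859-1')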