Support backslash-encoded values more thoroughly, interpreting certain encoded values as bytes that turn unprefixed strings into byte strings, diverging from Python behaviour. Make sure that backslashes are escaped in generated programs.

     1.1 --- a/common.py	Fri Jan 27 15:27:17 2017 +0100
     1.2 +++ b/common.py	Fri Jan 27 23:54:18 2017 +0100
     1.3 @@ -20,11 +20,11 @@
     1.4  this program.  If not, see <http://www.gnu.org/licenses/>.
     1.5  """
     1.6  
     1.7 +from compiler.transformer import Transformer
     1.8  from errors import InspectError
     1.9  from os import listdir, makedirs, remove
    1.10  from os.path import exists, isdir, join, split
    1.11  from results import ConstantValueRef, LiteralSequenceRef, NameRef
    1.12 -from compiler.transformer import Transformer
    1.13  import compiler.ast
    1.14  
    1.15  class CommonOutput:
    1.16 @@ -248,7 +248,7 @@
    1.17          elif isinstance(value, str) and self.encoding:
    1.18              if not literal.startswith("b"):
    1.19                  try:
    1.20 -                    return unicode(value, self.encoding).encode("utf-8"), "unicode", self.encoding
    1.21 +                    return get_string_details(literal, self.encoding)
    1.22                  except UnicodeDecodeError:
    1.23                      pass
    1.24  
    1.25 @@ -964,6 +964,175 @@
    1.26      x.sort()
    1.27      return ", ".join(x)
    1.28  
    1.29 +def get_string_details(s, encoding):
    1.30 +
    1.31 +    """
    1.32 +    Determine whether 's' represents a Unicode string or a byte string, using
    1.33 +    'encoding' to interpret byte sequences. The contents of 's' is the full
    1.34 +    literal representation including prefix and quotes.
    1.35 +
    1.36 +    Find and convert Unicode values starting with <backslash>u or <backslash>U,
    1.37 +    and byte or Unicode values starting with <backslash><octal digit> or
    1.38 +    <backslash>x.
    1.39 +
    1.40 +    Literals prefixed with "u" cause <backslash><octal digit> and <backslash>x
    1.41 +    to be considered as Unicode values. Otherwise, they produce byte values and
    1.42 +    cause unprefixed strings to be considered as byte strings.
    1.43 +
    1.44 +    Literals prefixed with "r" do not have their backslash-encoded values
    1.45 +    converted unless also prefixed with "u", in which case only the above value
    1.46 +    formats are converted, not any of the other special sequences for things
    1.47 +    like newlines.
    1.48 +
    1.49 +    Return the encoded literal value, type name, and original encoding as a
    1.50 +    tuple.
    1.51 +    """
    1.52 +
    1.53 +    l = []
    1.54 +    typename = "unicode"
    1.55 +
    1.56 +    # Identify the quote character and use it to identify the prefix.
    1.57 +
    1.58 +    quote_type = s[-1]
    1.59 +    prefix_end = s.find(quote_type)
    1.60 +    prefix = s[:prefix_end].lower()
    1.61 +
    1.62 +    if prefix not in ("", "b", "br", "r", "u", "ur"):
    1.63 +        raise ValueError, "String literal does not have a supported prefix: %s" % s
    1.64 +
    1.65 +    # Identify triple quotes or single quotes.
    1.66 +
    1.67 +    if len(s) >= 6 and s[-2] == quote_type and s[-3] == quote_type:
    1.68 +        quote = s[prefix_end:prefix_end+3]
    1.69 +        current = prefix_end + 3
    1.70 +        end = len(s) - 3
    1.71 +    else:
    1.72 +        quote = s[prefix_end]
    1.73 +        current = prefix_end + 1
    1.74 +        end = len(s) - 1
    1.75 +
    1.76 +    # Conversions of some quoted values.
    1.77 +
    1.78 +    searches = {
    1.79 +        "u" : (6, 16),
    1.80 +        "U" : (10, 16),
    1.81 +        "x" : (4, 16),
    1.82 +        }
    1.83 +
    1.84 +    octal_digits = map(str, range(0, 8))
    1.85 +
    1.86 +    # Translations of some quoted values.
    1.87 +
    1.88 +    escaped = {
    1.89 +        "\\" : "\\", "'" : "'", '"' : '"',
    1.90 +        "a" : "\a", "b" : "\b", "f" : "\f",
    1.91 +        "n" : "\n", "r" : "\r", "t" : "\t",
    1.92 +        }
    1.93 +
    1.94 +    while current < end:
    1.95 +
    1.96 +        # Look for quoted values.
    1.97 +
    1.98 +        index = s.find("\\", current)
    1.99 +        if index == -1 or index + 1 == end:
   1.100 +            l.append(s[current:end])
   1.101 +            break
   1.102 +
   1.103 +        # Add the preceding text.
   1.104 +
   1.105 +        l.append(s[current:index])
   1.106 +
   1.107 +        # Handle quoted text.
   1.108 +
   1.109 +        term = s[index+1]
   1.110 +
   1.111 +        # Add Unicode values. Where a string is u-prefixed, even \o and \x
   1.112 +        # produce Unicode values.
   1.113 +
   1.114 +        if term in ("u", "U") or prefix == "u" and (
   1.115 +           term == "x" or term in octal_digits):
   1.116 +
   1.117 +            needed, base = searches.get(term, (4, 8))
   1.118 +            value = convert_quoted_value(s, index, needed, end, base, unichr)
   1.119 +            l.append(value)
   1.120 +            current = index + needed
   1.121 +
   1.122 +        # Add raw byte values, changing the string type.
   1.123 +
   1.124 +        elif "r" not in prefix and (
   1.125 +             term == "x" or term in octal_digits):
   1.126 +
   1.127 +            needed, base = searches.get(term, (4, 8))
   1.128 +            value = convert_quoted_value(s, index, needed, end, base, chr)
   1.129 +            l.append(value)
   1.130 +            typename = "str"
   1.131 +            current = index + needed
   1.132 +
   1.133 +        # Add other escaped values.
   1.134 +
   1.135 +        elif "r" not in prefix and escaped.has_key(term):
   1.136 +            l.append(escaped[term])
   1.137 +            current = index + 2
   1.138 +
   1.139 +        # Add other text as found.
   1.140 +
   1.141 +        else:
   1.142 +            l.append(s[index:index+2])
   1.143 +            current = index + 2
   1.144 +
   1.145 +    # For byte string values, convert any Unicode values to the original
   1.146 +    # encoding.
   1.147 +
   1.148 +    if typename == "str":
   1.149 +        out = []
   1.150 +        for value in l:
   1.151 +            if isinstance(value, unicode):
   1.152 +                out.append(value.encode(encoding))
   1.153 +            else:
   1.154 +                out.append(value)
   1.155 +        out = "".join(out)
   1.156 +
   1.157 +    # For Unicode values, convert byte sequences to Unicode.
   1.158 +
   1.159 +    else:
   1.160 +        out = []
   1.161 +        for value in l:
   1.162 +            if isinstance(value, unicode):
   1.163 +                out.append(value)
   1.164 +            else:
   1.165 +                out.append(unicode(value, encoding))
   1.166 +        out = "".join(out).encode("utf-8")
   1.167 +
   1.168 +    return out, typename, encoding
   1.169 +
   1.170 +def convert_quoted_value(s, index, needed, end, base, fn):
   1.171 +
   1.172 +    """
   1.173 +    Interpret a quoted value in 's' at 'index' with the given 'needed' number of
   1.174 +    positions, and with the given 'end' indicating the first position after the
   1.175 +    end of the actual string content.
   1.176 +
   1.177 +    Use 'base' as the numerical base when interpreting the value, and use 'fn'
   1.178 +    to convert the value to an appropriate type.
   1.179 +    """
   1.180 +
   1.181 +    s = s[index:min(index+needed, end)]
   1.182 +
   1.183 +    # Not a complete occurrence.
   1.184 +
   1.185 +    if len(s) < needed:
   1.186 +        return s
   1.187 +
   1.188 +    # Test for a well-formed value.
   1.189 +
   1.190 +    try:
   1.191 +        first = base == 8 and 1 or 2
   1.192 +        value = int(s[first:needed], base)
   1.193 +    except ValueError:
   1.194 +        return s
   1.195 +    else:
   1.196 +        return fn(value)
   1.197 +
   1.198  # Attribute chain decoding.
   1.199  
   1.200  def get_attrnames(attrnames):

     2.1 --- a/encoders.py	Fri Jan 27 15:27:17 2017 +0100
     2.2 +++ b/encoders.py	Fri Jan 27 23:54:18 2017 +0100
     2.3 @@ -376,6 +376,7 @@
     2.4              elif c == '\n': l.append('\\n')
     2.5              elif c == '\t': l.append('\\t')
     2.6              elif c == '\r': l.append('\\r')
     2.7 +            elif c == '\\': l.append('\\\\')
     2.8              elif 0x20 <= ord(c) < 0x80: l.append(c)
     2.9              else: l.append("\\x%02x" % ord(c))
    2.10  
    2.11 @@ -404,6 +405,8 @@
    2.12  
    2.13      return "__constvalue%d" % n
    2.14  
    2.15 +
    2.16 +
    2.17  # Track all encoded paths, detecting and avoiding conflicts.
    2.18  
    2.19  all_encoded_paths = {}

     3.1 --- a/tests/unicode.py	Fri Jan 27 15:27:17 2017 +0100
     3.2 +++ b/tests/unicode.py	Fri Jan 27 23:54:18 2017 +0100
     3.3 @@ -5,14 +5,52 @@
     3.4  # Print bytes.
     3.5  
     3.6  s = b"���"
     3.7 +print "ISO-8859-1 values:"
     3.8  print s                             # ���
     3.9  print len(s)                        # 3
    3.10  
    3.11 +s2 = b"\xe6\xf8\xe5"
    3.12 +print "ISO-8859-1 values:"
    3.13 +print s2                            # ���
    3.14 +print s2.__class__                  # __builtins__.str.string
    3.15 +print len(s2)                       # 3
    3.16 +
    3.17 +s3 = "\xe6\xf8\xe5"
    3.18 +print "ISO-8859-1 values:"
    3.19 +print s3                            # ���
    3.20 +print s3.__class__                  # __builtins__.str.string
    3.21 +print len(s3)                       # 3
    3.22 +
    3.23 +s4 = b"\u00e6\u00f8\u00e5"
    3.24 +print "Untranslated values:"
    3.25 +print s4                            # \u00e6\u00f8\u00e5
    3.26 +print s4.__class__                  # __builtins__.str.string
    3.27 +print len(s4)                       # 18
    3.28 +
    3.29 +s5 = b"\346\370\345"
    3.30 +print "ISO-8859-1 values:"
    3.31 +print s5                            # ���
    3.32 +print s5.__class__                  # __builtins__.str.string
    3.33 +print len(s5)                       # 3
    3.34 +
    3.35 +s6 = "\346\370\345"
    3.36 +print "ISO-8859-1 values:"
    3.37 +print s6                            # ���
    3.38 +print s6.__class__                  # __builtins__.str.string
    3.39 +print len(s6)                       # 3
    3.40 +
    3.41 +s7 = r"\346\370\345"
    3.42 +print "Untranslated values:"
    3.43 +print s7                            # \346\370\345
    3.44 +print s7.__class__                  # __builtins__.unicode.utf8string
    3.45 +print len(s7)                       # 12
    3.46 +
    3.47  # Obtain text and print it.
    3.48  
    3.49  # Explicitly from bytes.
    3.50  
    3.51 -u = unicode("���", "ISO-8859-1")
    3.52 +u = unicode(b"���", "ISO-8859-1")
    3.53 +print "Unicode values:"
    3.54  print u                             # ���
    3.55  print u.__class__                   # __builtins__.unicode.utf8string
    3.56  print u.encode("ISO-8859-1")        # ���
    3.57 @@ -22,6 +60,7 @@
    3.58  # Explicitly from Unicode literals.
    3.59  
    3.60  u2 = u"���"
    3.61 +print "Unicode values:"
    3.62  print u2                            # ���
    3.63  print u2.__class__                  # __builtins__.unicode.utf8string
    3.64  print u2.encode("ISO-8859-1")       # ���
    3.65 @@ -31,16 +70,59 @@
    3.66  # Implicitly from string literals.
    3.67  
    3.68  u3 = "���"
    3.69 +print "Unicode values:"
    3.70  print u3                            # ���
    3.71  print u3.__class__                  # __builtins__.unicode.utf8string
    3.72  print u3.encode("ISO-8859-1")       # ���
    3.73  print u3.encoding                   # ISO-8859-1
    3.74  print len(u3)                       # 3
    3.75  
    3.76 +# Explicitly from implicitly-converted literal.
    3.77 +
    3.78 +u4 = unicode("���", "ISO-8859-1")
    3.79 +print "Unicode values:"
    3.80 +print u4                            # ���
    3.81 +print u4.__class__                  # __builtins__.unicode.utf8string
    3.82 +print u4.encode("ISO-8859-1")       # ���
    3.83 +print u4.encoding                   # ISO-8859-1
    3.84 +print len(u4)                       # 3
    3.85 +
    3.86 +# Test Unicode values.
    3.87 +
    3.88 +u5 = "\u00e6\u00f8\u00e5"
    3.89 +print "Unicode values:"
    3.90 +print u5                            # ���
    3.91 +print u5.__class__                  # __builtins__.unicode.utf8string
    3.92 +print len(u5)                       # 3
    3.93 +
    3.94 +# Test some untranslated values.
    3.95 +
    3.96 +u6 = "\\u00e6\\u00f8\\u00e5"
    3.97 +print "Untranslated values:"
    3.98 +print u6                            # \u00e6\u00f8\u00e5
    3.99 +print u6.__class__                  # __builtins__.unicode.utf8string
   3.100 +print len(u6)                       # 18
   3.101 +
   3.102 +# Test Unicode values.
   3.103 +
   3.104 +u7 = u"\346\370\345"
   3.105 +print "Unicode values:"
   3.106 +print u7                            # ���
   3.107 +print u7.__class__                  # __builtins__.unicode.utf8string
   3.108 +print len(u7)                       # 3
   3.109 +
   3.110 +# Test Unicode values.
   3.111 +
   3.112 +u8 = ur"\346\370\345"
   3.113 +print "Untranslated values:"
   3.114 +print u8                            # \346\370\345
   3.115 +print u8.__class__                  # __builtins__.unicode.utf8string
   3.116 +print len(u8)                       # 12
   3.117 +
   3.118  # Test invalid sequences.
   3.119  
   3.120  try:
   3.121 -    u4 = unicode(s, "UTF-8")
   3.122 +    u9 = unicode(s, "UTF-8")
   3.123  except UnicodeDecodeError, exc:
   3.124      print "Attempt to decode", s, "as UTF-8 failed."
   3.125  
   3.126 @@ -48,6 +130,7 @@
   3.127  # The text should be decoded.
   3.128  
   3.129  su = s + u
   3.130 +print "ISO-8859-1 values:"
   3.131  print su                            # ������
   3.132  print su.__class__                  # __builtins__.str.string
   3.133  print len(su)                       # 6
   3.134 @@ -56,6 +139,7 @@
   3.135  # The text should be decoded.
   3.136  
   3.137  us = u + s
   3.138 +print "ISO-8859-1 values:"
   3.139  print us                            # ������
   3.140  print us.__class__                  # __builtins__.str.string
   3.141  print len(us)                       # 6
   3.142 @@ -63,6 +147,7 @@
   3.143  # Combine text and text.
   3.144  
   3.145  uu2 = u + u2
   3.146 +print "Unicode values:"
   3.147  print uu2                           # ������
   3.148  print uu2.__class__                 # __builtins__.unicode.utf8string
   3.149  print uu2.encoding                  # ISO-8859-1
   3.150 @@ -75,14 +160,17 @@
   3.151  print sys.stdout.encoding           # None
   3.152  
   3.153  sys.stdout.encoding = "ISO-8859-1"
   3.154 +print "ISO-8859-1 and Unicode values as ISO-8859-1:"
   3.155  print sys.stdout.encoding           # ISO-8859-1
   3.156  print u                             # ���
   3.157  print su                            # ������
   3.158  print us                            # ������
   3.159  
   3.160  sys.stdout.encoding = "UTF-8"
   3.161 +print "Unicode values as UTF-8:"
   3.162  print sys.stdout.encoding           # UTF-8
   3.163  print u                             # æøå
   3.164 +print "ISO-8859-1 values bypassing UTF-8 output encoding:"
   3.165  print su                            # ������
   3.166  print us                            # ������
   3.167
2017-01-27	Paul Boddie	raw files shortlog changelog graph	Support backslash-encoded values more thoroughly, interpreting certain encoded values as bytes that turn unprefixed strings into byte strings, diverging from Python behaviour. Make sure that backslashes are escaped in generated programs.
			common.py (file) encoders.py (file) tests/unicode.py (file)