1.1 --- a/common.py	Sat Feb 04 00:12:06 2017 +0100
     1.2 +++ b/common.py	Sat Feb 04 15:35:58 2017 +0100
     1.3 @@ -233,7 +233,7 @@
     1.4  
     1.5      # Constant and literal recording.
     1.6  
     1.7 -    def get_constant_value(self, value, literal=None):
     1.8 +    def get_constant_value(self, value, literals=None):
     1.9  
    1.10          """
    1.11          Encode the 'value' if appropriate, returning a value, a typename and any
    1.12 @@ -247,7 +247,7 @@
    1.13  
    1.14          elif isinstance(value, str) and self.encoding:
    1.15              try:
    1.16 -                return get_string_details(literal, self.encoding)
    1.17 +                return get_string_details(literals, self.encoding)
    1.18              except UnicodeDecodeError:
    1.19                  pass
    1.20  
    1.21 @@ -964,12 +964,48 @@
    1.22      x.sort()
    1.23      return ", ".join(x)
    1.24  
    1.25 -def get_string_details(s, encoding):
    1.26 +def get_string_details(literals, encoding):
    1.27  
    1.28      """
    1.29 -    Determine whether 's' represents a Unicode string or a byte string, using
    1.30 -    'encoding' to interpret byte sequences. The contents of 's' is the full
    1.31 -    literal representation including prefix and quotes.
    1.32 +    Determine whether 'literals' represent Unicode strings or byte strings,
    1.33 +    using 'encoding' to reproduce byte sequences.
    1.34 +
    1.35 +    Each literal is the full program representation including prefix and quotes,
    1.36 +    recoded by the parser to UTF-8. Thus, any literal found to represent a byte
    1.37 +    string needs to be translated back to its original encoding.
    1.38 +
    1.39 +    Return a single encoded literal value, a type name, and the original
    1.40 +    encoding as a tuple.
    1.41 +    """
    1.42 +
    1.43 +    typename = "unicode"
    1.44 +
    1.45 +    l = []
    1.46 +
    1.47 +    for s in literals:
    1.48 +        out, _typename = get_literal_details(s)
    1.49 +        if _typename == "str":
    1.50 +            typename = "str"
    1.51 +        l.append(out)
    1.52 +
    1.53 +    out = "".join(l)
    1.54 +
    1.55 +    # For Unicode values, convert to the UTF-8 program representation.
    1.56 +
    1.57 +    if typename == "unicode":
    1.58 +        return out.encode("utf-8"), typename, encoding
    1.59 +
    1.60 +    # For byte string values, convert back to the original encoding.
    1.61 +
    1.62 +    else:
    1.63 +        return out.encode(encoding), typename, encoding
    1.64 +
    1.65 +def get_literal_details(s):
    1.66 +
    1.67 +    """
    1.68 +    Determine whether 's' represents a Unicode string or a byte string, where
    1.69 +    's' contains the full program representation of a literal including prefix
    1.70 +    and quotes, recoded by the parser to UTF-8.
    1.71  
    1.72      Find and convert Unicode values starting with <backslash>u or <backslash>U,
    1.73      and byte or Unicode values starting with <backslash><octal digit> or
    1.74 @@ -984,8 +1020,8 @@
    1.75      formats are converted, not any of the other special sequences for things
    1.76      like newlines.
    1.77  
    1.78 -    Return the encoded literal value, type name, and original encoding as a
    1.79 -    tuple.
    1.80 +    Return the literal value as a Unicode object together with the appropriate
    1.81 +    type name in a tuple.
    1.82      """
    1.83  
    1.84      l = []
    1.85 @@ -1085,30 +1121,19 @@
    1.86              l.append(s[index:index+2])
    1.87              current = index + 2
    1.88  
    1.89 -    # For byte string values, convert any Unicode values to the original
    1.90 -    # encoding.
    1.91 +    # Collect the components into a single Unicode object. Since the literal
    1.92 +    # text was already in UTF-8 form, interpret plain strings as UTF-8
    1.93 +    # sequences.
    1.94  
    1.95 -    if typename == "str":
    1.96 -        out = []
    1.97 -        for value in l:
    1.98 -            if isinstance(value, unicode):
    1.99 -                out.append(value.encode(encoding))
   1.100 -            else:
   1.101 -                out.append(value)
   1.102 -        out = "".join(out)
   1.103 +    out = []
   1.104  
   1.105 -    # For Unicode values, convert byte sequences to Unicode.
   1.106 +    for value in l:
   1.107 +        if isinstance(value, unicode):
   1.108 +            out.append(value)
   1.109 +        else:
   1.110 +            out.append(unicode(value, "utf-8"))
   1.111  
   1.112 -    else:
   1.113 -        out = []
   1.114 -        for value in l:
   1.115 -            if isinstance(value, unicode):
   1.116 -                out.append(value)
   1.117 -            else:
   1.118 -                out.append(unicode(value, encoding))
   1.119 -        out = "".join(out).encode("utf-8")
   1.120 -
   1.121 -    return out, typename, encoding
   1.122 +    return "".join(out), typename
   1.123  
   1.124  def convert_quoted_value(s, index, needed, end, base, fn):
   1.125