1.1 --- a/common.py Sat Feb 04 00:12:06 2017 +0100
1.2 +++ b/common.py Sat Feb 04 15:35:58 2017 +0100
1.3 @@ -233,7 +233,7 @@
1.4
1.5 # Constant and literal recording.
1.6
1.7 - def get_constant_value(self, value, literal=None):
1.8 + def get_constant_value(self, value, literals=None):
1.9
1.10 """
1.11 Encode the 'value' if appropriate, returning a value, a typename and any
1.12 @@ -247,7 +247,7 @@
1.13
1.14 elif isinstance(value, str) and self.encoding:
1.15 try:
1.16 - return get_string_details(literal, self.encoding)
1.17 + return get_string_details(literals, self.encoding)
1.18 except UnicodeDecodeError:
1.19 pass
1.20
1.21 @@ -964,12 +964,48 @@
1.22 x.sort()
1.23 return ", ".join(x)
1.24
1.25 -def get_string_details(s, encoding):
1.26 +def get_string_details(literals, encoding):
1.27
1.28 """
1.29 - Determine whether 's' represents a Unicode string or a byte string, using
1.30 - 'encoding' to interpret byte sequences. The contents of 's' is the full
1.31 - literal representation including prefix and quotes.
1.32 + Determine whether 'literals' represent Unicode strings or byte strings,
1.33 + using 'encoding' to reproduce byte sequences.
1.34 +
1.35 + Each literal is the full program representation including prefix and quotes
1.36 + recoded by the parser to UTF-8. Thus, any literal found to represent a byte
1.37 + string needs to be translated back to its original encoding.
1.38 +
1.39 + Return the joined, encoded value, a type name ("str" if any literal has
1.40 + that type, otherwise "unicode"), and the original encoding as a tuple.
1.41 + """
1.42 +
1.43 + typename = "unicode"
1.44 +
1.45 + l = []
1.46 +
1.47 + for s in literals:
1.48 + out, _typename = get_literal_details(s)
1.49 + if _typename == "str":
1.50 + typename = "str"
1.51 + l.append(out)
1.52 +
1.53 + out = "".join(l)
1.54 +
1.55 + # For Unicode values, convert to the UTF-8 program representation.
1.56 +
1.57 + if typename == "unicode":
1.58 + return out.encode("utf-8"), typename, encoding
1.59 +
1.60 + # For byte string values, convert back to the original encoding.
1.61 +
1.62 + else:
1.63 + return out.encode(encoding), typename, encoding
1.64 +
1.65 +def get_literal_details(s):
1.66 +
1.67 + """
1.68 + Determine whether 's' represents a Unicode string or a byte string, where
1.69 + 's' contains the full program representation of a literal including prefix
1.70 + and quotes, recoded by the parser to UTF-8.
1.71
1.72 Find and convert Unicode values starting with <backslash>u or <backslash>U,
1.73 and byte or Unicode values starting with <backslash><octal digit> or
1.74 @@ -984,8 +1020,8 @@
1.75 formats are converted, not any of the other special sequences for things
1.76 like newlines.
1.77
1.78 - Return the encoded literal value, type name, and original encoding as a
1.79 - tuple.
1.80 + Return the literal value as a Unicode object together with the appropriate
1.81 + type name in a tuple.
1.82 """
1.83
1.84 l = []
1.85 @@ -1085,30 +1121,19 @@
1.86 l.append(s[index:index+2])
1.87 current = index + 2
1.88
1.89 - # For byte string values, convert any Unicode values to the original
1.90 - # encoding.
1.91 + # Collect the components into a single Unicode object. Since the literal
1.92 + # text was already in UTF-8 form, interpret plain strings as UTF-8
1.93 + # sequences.
1.94
1.95 - if typename == "str":
1.96 - out = []
1.97 - for value in l:
1.98 - if isinstance(value, unicode):
1.99 - out.append(value.encode(encoding))
1.100 - else:
1.101 - out.append(value)
1.102 - out = "".join(out)
1.103 + out = []
1.104
1.105 - # For Unicode values, convert byte sequences to Unicode.
1.106 + for value in l:
1.107 + if isinstance(value, unicode):
1.108 + out.append(value)
1.109 + else:
1.110 + out.append(unicode(value, "utf-8"))
1.111
1.112 - else:
1.113 - out = []
1.114 - for value in l:
1.115 - if isinstance(value, unicode):
1.116 - out.append(value)
1.117 - else:
1.118 - out.append(unicode(value, encoding))
1.119 - out = "".join(out).encode("utf-8")
1.120 -
1.121 - return out, typename, encoding
1.122 + return "".join(out), typename
1.123
1.124 def convert_quoted_value(s, index, needed, end, base, fn):
1.125