1.1 --- a/common.py Sat Feb 04 00:12:06 2017 +0100
1.2 +++ b/common.py Sat Feb 04 15:35:58 2017 +0100
1.3 @@ -233,7 +233,7 @@
1.4
1.5 # Constant and literal recording.
1.6
1.7 - def get_constant_value(self, value, literal=None):
1.8 + def get_constant_value(self, value, literals=None):
1.9
1.10 """
1.11 Encode the 'value' if appropriate, returning a value, a typename and any
1.12 @@ -247,7 +247,7 @@
1.13
1.14 elif isinstance(value, str) and self.encoding:
1.15 try:
1.16 - return get_string_details(literal, self.encoding)
1.17 + return get_string_details(literals, self.encoding)
1.18 except UnicodeDecodeError:
1.19 pass
1.20
1.21 @@ -964,12 +964,48 @@
1.22 x.sort()
1.23 return ", ".join(x)
1.24
1.25 -def get_string_details(s, encoding):
1.26 +def get_string_details(literals, encoding):
1.27
1.28 """
1.29 - Determine whether 's' represents a Unicode string or a byte string, using
1.30 - 'encoding' to interpret byte sequences. The contents of 's' is the full
1.31 - literal representation including prefix and quotes.
1.32 + Determine whether 'literals' represent Unicode strings or byte strings,
1.33 + using 'encoding' to reproduce byte sequences.
1.34 +
1.35 + Each literal is the full program representation including prefix and quotes,
1.36 + recoded by the parser to UTF-8. Thus, any literal found to represent a byte
1.37 + string needs to be translated back to its original encoding.
1.38 +
1.39 + Return a single encoded literal value, a type name, and the original
1.40 + encoding as a tuple.
1.41 + """
1.42 +
1.43 + typename = "unicode"
1.44 +
1.45 + l = []
1.46 +
1.47 + for s in literals:
1.48 + out, _typename = get_literal_details(s)
1.49 + if _typename == "str":
1.50 + typename = "str"
1.51 + l.append(out)
1.52 +
1.53 + out = "".join(l)
1.54 +
1.55 + # For Unicode values, convert to the UTF-8 program representation.
1.56 +
1.57 + if typename == "unicode":
1.58 + return out.encode("utf-8"), typename, encoding
1.59 +
1.60 + # For byte string values, convert back to the original encoding.
1.61 +
1.62 + else:
1.63 + return out.encode(encoding), typename, encoding
1.64 +
1.65 +def get_literal_details(s):
1.66 +
1.67 + """
1.68 + Determine whether 's' represents a Unicode string or a byte string, where
1.69 + 's' contains the full program representation of a literal including prefix
1.70 + and quotes, recoded by the parser to UTF-8.
1.71
1.72 Find and convert Unicode values starting with <backslash>u or <backslash>U,
1.73 and byte or Unicode values starting with <backslash><octal digit> or
1.74 @@ -984,8 +1020,8 @@
1.75 formats are converted, not any of the other special sequences for things
1.76 like newlines.
1.77
1.78 - Return the encoded literal value, type name, and original encoding as a
1.79 - tuple.
1.80 + Return the literal value as a Unicode object together with the appropriate
1.81 + type name in a tuple.
1.82 """
1.83
1.84 l = []
1.85 @@ -1085,30 +1121,19 @@
1.86 l.append(s[index:index+2])
1.87 current = index + 2
1.88
1.89 - # For byte string values, convert any Unicode values to the original
1.90 - # encoding.
1.91 + # Collect the components into a single Unicode object. Since the literal
1.92 + # text was already in UTF-8 form, interpret plain strings as UTF-8
1.93 + # sequences.
1.94
1.95 - if typename == "str":
1.96 - out = []
1.97 - for value in l:
1.98 - if isinstance(value, unicode):
1.99 - out.append(value.encode(encoding))
1.100 - else:
1.101 - out.append(value)
1.102 - out = "".join(out)
1.103 + out = []
1.104
1.105 - # For Unicode values, convert byte sequences to Unicode.
1.106 + for value in l:
1.107 + if isinstance(value, unicode):
1.108 + out.append(value)
1.109 + else:
1.110 + out.append(unicode(value, "utf-8"))
1.111
1.112 - else:
1.113 - out = []
1.114 - for value in l:
1.115 - if isinstance(value, unicode):
1.116 - out.append(value)
1.117 - else:
1.118 - out.append(unicode(value, encoding))
1.119 - out = "".join(out).encode("utf-8")
1.120 -
1.121 - return out, typename, encoding
1.122 + return "".join(out), typename
1.123
1.124 def convert_quoted_value(s, index, needed, end, base, fn):
1.125
2.1 --- a/compiler/ast.py Sat Feb 04 00:12:06 2017 +0100
2.2 +++ b/compiler/ast.py Sat Feb 04 15:35:58 2017 +0100
2.3 @@ -502,9 +502,9 @@
2.4 return "%s %s" % (self.expr, " ".join([("%s %s" % op) for op in self.ops]))
2.5
2.6 class Const(Node):
2.7 - def __init__(self, value, literal=None, lineno=None):
2.8 + def __init__(self, value, literals=None, lineno=None):
2.9 self.value = value
2.10 - self.literal = literal
2.11 + self.literals = literals
2.12 self.lineno = lineno
2.13
2.14 def getChildren(self):
2.15 @@ -514,7 +514,7 @@
2.16 return ()
2.17
2.18 def __repr__(self):
2.19 - return "Const(%r, %r)" % (self.value, self.literal)
2.20 + return "Const(%r, %r)" % (self.value, self.literals)
2.21
2.22 def __str__(self):
2.23 return repr(self.value)
3.1 --- a/compiler/transformer.py Sat Feb 04 00:12:06 2017 +0100
3.2 +++ b/compiler/transformer.py Sat Feb 04 15:35:58 2017 +0100
3.3 @@ -669,15 +669,22 @@
3.4
3.5 def decode_literal(self, lit):
3.6 if self.encoding:
3.7 + # this is particularly fragile & a bit of a
3.8 + # hack... changes in compile.c:parsestr and
3.9 + # tokenizer.c must be reflected here.
3.10 + if self.encoding != 'utf-8':
3.11 + lit = unicode(lit, 'utf-8').encode(self.encoding)
3.12 return eval("# coding: %s\n%s" % (self.encoding, lit))
3.13 else:
3.14 return eval(lit)
3.15
3.16 def atom_string(self, nodelist):
3.17 k = ''
3.18 + l = []
3.19 for node in nodelist:
3.20 k += self.decode_literal(node[1])
3.21 - return Const(k, node[1], lineno=nodelist[0][2])
3.22 + l.append(node[1])
3.23 + return Const(k, l, lineno=nodelist[0][2])
3.24
3.25 def atom_name(self, nodelist):
3.26 return Name(nodelist[0][1], lineno=nodelist[0][2])
4.1 --- a/inspector.py Sat Feb 04 00:12:06 2017 +0100
4.2 +++ b/inspector.py Sat Feb 04 15:35:58 2017 +0100
4.3 @@ -1407,7 +1407,7 @@
4.4 # Constant values are independently recorded.
4.5
4.6 else:
4.7 - value, typename, encoding = self.get_constant_value(n.value, n.literal)
4.8 + value, typename, encoding = self.get_constant_value(n.value, n.literals)
4.9 name = get_builtin_type(typename)
4.10 ref = self.get_builtin_class(name)
4.11 return self.get_constant_reference(ref, value, encoding)
5.1 --- a/pyparser/pyparse.py Sat Feb 04 00:12:06 2017 +0100
5.2 +++ b/pyparser/pyparse.py Sat Feb 04 15:35:58 2017 +0100
5.3 @@ -1,6 +1,13 @@
5.4 from pyparser import parser, pytokenizer, pygram, error
5.5 from pyparser import consts
5.6
5.7 +def recode_to_utf8(bytes, encoding):
5.8 + text = bytes.decode(encoding)
5.9 + if not isinstance(text, unicode):
5.10 + raise error.SyntaxError("codec did not return a unicode object")
5.11 + recoded = text.encode("utf-8")
5.12 + return recoded
5.13 +
5.14 def _normalize_encoding(encoding):
5.15 """returns normalized name for <encoding>
5.16
5.17 @@ -96,6 +103,17 @@
5.18 filename=compile_info.filename)
5.19 else:
5.20 enc = _normalize_encoding(_check_for_encoding(textsrc))
5.21 + if enc is not None and enc != 'utf-8':
5.22 + try:
5.23 + textsrc = recode_to_utf8(textsrc, enc)
5.24 + except LookupError as e:
5.25 + # if the codec is not found, LookupError is raised.
5.26 + raise error.SyntaxError("Unknown encoding: %s" % enc,
5.27 + filename=compile_info.filename)
5.28 + # Transform unicode errors into SyntaxError
5.29 + except UnicodeDecodeError as e:
5.30 + message = str(e)
5.31 + raise error.SyntaxError(message)
5.32
5.33 flags = compile_info.flags
5.34
6.1 --- a/tests/unicode.py Sat Feb 04 00:12:06 2017 +0100
6.2 +++ b/tests/unicode.py Sat Feb 04 15:35:58 2017 +0100
6.3 @@ -9,6 +9,12 @@
6.4 print s # ÆØÅ
6.5 print len(s) # 3
6.6
6.7 +s1 = b"ÆØÅ" \
6.8 + "ÆØÅ"
6.9 +print "ISO-8859-15 values:"
6.10 +print s1 # ÆØÅÆØÅ
6.11 +print len(s1) # 6
6.12 +
6.13 s2 = b"\xe6\xf8\xe5"
6.14 print "ISO-8859-15 values:"
6.15 print s2 # æøå
7.1 --- a/translator.py Sat Feb 04 00:12:06 2017 +0100
7.2 +++ b/translator.py Sat Feb 04 15:35:58 2017 +0100
7.3 @@ -472,7 +472,7 @@
7.4 ref = self.get_builtin_class(name)
7.5 return self.process_literal_sequence_node(n, name, ref, TrLiteralSequenceRef)
7.6 else:
7.7 - value, typename, encoding = self.get_constant_value(n.value, n.literal)
7.8 + value, typename, encoding = self.get_constant_value(n.value, n.literals)
7.9 name = get_builtin_type(typename)
7.10 ref = self.get_builtin_class(name)
7.11 value_type = ref.get_origin()