# HG changeset patch
# User Paul Boddie
# Date 1486218958 -3600
# Node ID 2ee50dc501ca2066aebc23a44324be4f5415b8c9
# Parent 50463cb7afae1d9cb5f3dc667165001bcc872ed2
Restored UTF-8 source recoding and added support for concatenated literals.

diff -r 50463cb7afae -r 2ee50dc501ca common.py
--- a/common.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/common.py	Sat Feb 04 15:35:58 2017 +0100
@@ -233,7 +233,7 @@
 
     # Constant and literal recording.
 
-    def get_constant_value(self, value, literal=None):
+    def get_constant_value(self, value, literals=None):
 
         """
         Encode the 'value' if appropriate, returning a value, a typename and any
@@ -247,7 +247,7 @@
 
         elif isinstance(value, str) and self.encoding:
             try:
-                return get_string_details(literal, self.encoding)
+                return get_string_details(literals, self.encoding)
             except UnicodeDecodeError:
                 pass
 
@@ -964,12 +964,48 @@
     x.sort()
     return ", ".join(x)
 
-def get_string_details(s, encoding):
+def get_string_details(literals, encoding):
 
     """
-    Determine whether 's' represents a Unicode string or a byte string, using
-    'encoding' to interpret byte sequences. The contents of 's' is the full
-    literal representation including prefix and quotes.
+    Determine whether 'literals' represent Unicode strings or byte strings,
+    using 'encoding' to reproduce byte sequences.
+
+    Each literal is the full program representation including prefix and quotes
+    recoded by the parser to UTF-8. Thus, any literal found to represent a byte
+    string needs to be translated back to its original encoding.
+
+    Return a single encoded literal value, a type name, and the original
+    encoding as a tuple.
+    """
+
+    typename = "unicode"
+
+    l = []
+
+    for s in literals:
+        out, _typename = get_literal_details(s)
+        if _typename == "str":
+            typename = "str"
+        l.append(out)
+
+    out = "".join(l)
+
+    # For Unicode values, convert to the UTF-8 program representation.
+
+    if typename == "unicode":
+        return out.encode("utf-8"), typename, encoding
+
+    # For byte string values, convert back to the original encoding.
+
+    else:
+        return out.encode(encoding), typename, encoding
+
+def get_literal_details(s):
+
+    """
+    Determine whether 's' represents a Unicode string or a byte string, where
+    's' contains the full program representation of a literal including prefix
+    and quotes, recoded by the parser to UTF-8.
 
     Find and convert Unicode values starting with u or U, and byte or Unicode
     values starting with or
@@ -984,8 +1020,8 @@
     formats are converted, not any of the other special sequences for things
     like newlines.
 
-    Return the encoded literal value, type name, and original encoding as a
-    tuple.
+    Return the literal value as a Unicode object together with the appropriate
+    type name in a tuple.
     """
 
     l = []
@@ -1085,30 +1121,19 @@
                 l.append(s[index:index+2])
                 current = index + 2
 
-    # For byte string values, convert any Unicode values to the original
-    # encoding.
+    # Collect the components into a single Unicode object. Since the literal
+    # text was already in UTF-8 form, interpret plain strings as UTF-8
+    # sequences.
 
-    if typename == "str":
-        out = []
-        for value in l:
-            if isinstance(value, unicode):
-                out.append(value.encode(encoding))
-            else:
-                out.append(value)
-        out = "".join(out)
+    out = []
 
-    # For Unicode values, convert byte sequences to Unicode.
+    for value in l:
+        if isinstance(value, unicode):
+            out.append(value)
+        else:
+            out.append(unicode(value, "utf-8"))
 
-    else:
-        out = []
-        for value in l:
-            if isinstance(value, unicode):
-                out.append(value)
-            else:
-                out.append(unicode(value, encoding))
-        out = "".join(out).encode("utf-8")
-
-    return out, typename, encoding
+    return "".join(out), typename
 
 def convert_quoted_value(s, index, needed, end, base, fn):
 
diff -r 50463cb7afae -r 2ee50dc501ca compiler/ast.py
--- a/compiler/ast.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/compiler/ast.py	Sat Feb 04 15:35:58 2017 +0100
@@ -502,9 +502,9 @@
         return "%s %s" % (self.expr, " ".join([("%s %s" % op) for op in self.ops]))
 
 class Const(Node):
-    def __init__(self, value, literal=None, lineno=None):
+    def __init__(self, value, literals=None, lineno=None):
         self.value = value
-        self.literal = literal
+        self.literals = literals
         self.lineno = lineno
 
     def getChildren(self):
@@ -514,7 +514,7 @@
         return ()
 
     def __repr__(self):
-        return "Const(%r, %r)" % (self.value, self.literal)
+        return "Const(%r, %r)" % (self.value, self.literals)
 
     def __str__(self):
         return repr(self.value)
diff -r 50463cb7afae -r 2ee50dc501ca compiler/transformer.py
--- a/compiler/transformer.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/compiler/transformer.py	Sat Feb 04 15:35:58 2017 +0100
@@ -669,15 +669,22 @@
 
     def decode_literal(self, lit):
         if self.encoding:
+            # this is particularly fragile & a bit of a
+            # hack... changes in compile.c:parsestr and
+            # tokenizer.c must be reflected here.
+            if self.encoding != 'utf-8':
+                lit = unicode(lit, 'utf-8').encode(self.encoding)
             return eval("# coding: %s\n%s" % (self.encoding, lit))
         else:
             return eval(lit)
 
     def atom_string(self, nodelist):
         k = ''
+        l = []
         for node in nodelist:
             k += self.decode_literal(node[1])
-        return Const(k, node[1], lineno=nodelist[0][2])
+            l.append(node[1])
+        return Const(k, l, lineno=nodelist[0][2])
 
     def atom_name(self, nodelist):
         return Name(nodelist[0][1], lineno=nodelist[0][2])
diff -r 50463cb7afae -r 2ee50dc501ca inspector.py
--- a/inspector.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/inspector.py	Sat Feb 04 15:35:58 2017 +0100
@@ -1407,7 +1407,7 @@
         # Constant values are independently recorded.
 
         else:
-            value, typename, encoding = self.get_constant_value(n.value, n.literal)
+            value, typename, encoding = self.get_constant_value(n.value, n.literals)
             name = get_builtin_type(typename)
             ref = self.get_builtin_class(name)
             return self.get_constant_reference(ref, value, encoding)
diff -r 50463cb7afae -r 2ee50dc501ca pyparser/pyparse.py
--- a/pyparser/pyparse.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/pyparser/pyparse.py	Sat Feb 04 15:35:58 2017 +0100
@@ -1,6 +1,13 @@
 from pyparser import parser, pytokenizer, pygram, error
 from pyparser import consts
 
+def recode_to_utf8(bytes, encoding):
+    text = bytes.decode(encoding)
+    if not isinstance(text, unicode):
+        raise error.SyntaxError("codec did not return a unicode object")
+    recoded = text.encode("utf-8")
+    return recoded
+
 def _normalize_encoding(encoding):
     """returns normalized name for
 
@@ -96,6 +103,17 @@
                                         filename=compile_info.filename)
         else:
             enc = _normalize_encoding(_check_for_encoding(textsrc))
+            if enc is not None and enc != 'utf-8':
+                try:
+                    textsrc = recode_to_utf8(textsrc, enc)
+                except LookupError as e:
+                    # if the codec is not found, LookupError is raised.
+                    raise error.SyntaxError("Unknown encoding: %s" % enc,
+                                            filename=compile_info.filename)
+                # Transform unicode errors into SyntaxError
+                except UnicodeDecodeError as e:
+                    message = str(e)
+                    raise error.SyntaxError(message)
 
         flags = compile_info.flags
 
diff -r 50463cb7afae -r 2ee50dc501ca tests/unicode.py
--- a/tests/unicode.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/tests/unicode.py	Sat Feb 04 15:35:58 2017 +0100
@@ -9,6 +9,12 @@
 print s                             # ÆØÅ
 print len(s)                        # 3
 
+s1 = b"ÆØÅ" \
+     "ÆØÅ"
+print "ISO-8859-15 values:"
+print s1                            # ÆØÅÆØÅ
+print len(s1)                       # 6
+
 s2 = b"\xe6\xf8\xe5"
 print "ISO-8859-15 values:"
 print s2                            # æøå
diff -r 50463cb7afae -r 2ee50dc501ca translator.py
--- a/translator.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/translator.py	Sat Feb 04 15:35:58 2017 +0100
@@ -472,7 +472,7 @@
             ref = self.get_builtin_class(name)
             return self.process_literal_sequence_node(n, name, ref, TrLiteralSequenceRef)
         else:
-            value, typename, encoding = self.get_constant_value(n.value, n.literal)
+            value, typename, encoding = self.get_constant_value(n.value, n.literals)
             name = get_builtin_type(typename)
             ref = self.get_builtin_class(name)
             value_type = ref.get_origin()