# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1486218958 -3600
# Node ID 2ee50dc501ca2066aebc23a44324be4f5415b8c9
# Parent  50463cb7afae1d9cb5f3dc667165001bcc872ed2
Restored UTF-8 source recoding and added support for concatenated literals.

diff -r 50463cb7afae -r 2ee50dc501ca common.py
--- a/common.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/common.py	Sat Feb 04 15:35:58 2017 +0100
@@ -233,7 +233,7 @@
 
     # Constant and literal recording.
 
-    def get_constant_value(self, value, literal=None):
+    def get_constant_value(self, value, literals=None):
 
         """
         Encode the 'value' if appropriate, returning a value, a typename and any
@@ -247,7 +247,7 @@
 
         elif isinstance(value, str) and self.encoding:
             try:
-                return get_string_details(literal, self.encoding)
+                return get_string_details(literals, self.encoding)
             except UnicodeDecodeError:
                 pass
 
@@ -964,12 +964,48 @@
     x.sort()
     return ", ".join(x)
 
-def get_string_details(s, encoding):
+def get_string_details(literals, encoding):
 
     """
-    Determine whether 's' represents a Unicode string or a byte string, using
-    'encoding' to interpret byte sequences. The contents of 's' is the full
-    literal representation including prefix and quotes.
+    Combine the given 'literals' into a single value, determining whether the
+    result is a Unicode string or a byte string, and using 'encoding' to
+    reproduce the byte sequences of any byte string.
+
+    Each literal is the full program representation including prefix and quotes
+    recoded by the parser to UTF-8. Thus, a value found to represent a byte
+    string needs to be translated back to its original encoding.
+
+    The value is a byte string ("str") if any literal is a byte string, and
+    a Unicode string ("unicode") otherwise.
+
+    Return a single encoded literal value, a type name, and the original
+    encoding as a tuple.
+    """
+
+    typename = "unicode"
+    parts = []
+
+    for literal in literals:
+        text, literal_typename = get_literal_details(literal)
+        parts.append(text)
+
+        # A single byte string literal makes the whole value a byte string.
+
+        if literal_typename == "str":
+            typename = "str"
+
+    # Unicode values employ the UTF-8 program representation, whereas byte
+    # string values are converted back to the original encoding.
+
+    target = "utf-8" if typename == "unicode" else encoding
+    return "".join(parts).encode(target), typename, encoding
+
+def get_literal_details(s):
+
+    """
+    Determine whether 's' represents a Unicode string or a byte string, where
+    's' contains the full program representation of a literal including prefix
+    and quotes, recoded by the parser to UTF-8.
 
     Find and convert Unicode values starting with <backslash>u or <backslash>U,
     and byte or Unicode values starting with <backslash><octal digit> or
@@ -984,8 +1020,8 @@
     formats are converted, not any of the other special sequences for things
     like newlines.
 
-    Return the encoded literal value, type name, and original encoding as a
-    tuple.
+    Return the literal value as a Unicode object together with the appropriate
+    type name in a tuple.
     """
 
     l = []
@@ -1085,30 +1121,19 @@
             l.append(s[index:index+2])
             current = index + 2
 
-    # For byte string values, convert any Unicode values to the original
-    # encoding.
+    # Collect the components into a single Unicode object. Since the literal
+    # text was already in UTF-8 form, interpret plain strings as UTF-8
+    # sequences.
 
-    if typename == "str":
-        out = []
-        for value in l:
-            if isinstance(value, unicode):
-                out.append(value.encode(encoding))
-            else:
-                out.append(value)
-        out = "".join(out)
+    out = []
 
-    # For Unicode values, convert byte sequences to Unicode.
+    for value in l:
+        if isinstance(value, unicode):
+            out.append(value)
+        else:
+            out.append(unicode(value, "utf-8"))
 
-    else:
-        out = []
-        for value in l:
-            if isinstance(value, unicode):
-                out.append(value)
-            else:
-                out.append(unicode(value, encoding))
-        out = "".join(out).encode("utf-8")
-
-    return out, typename, encoding
+    return "".join(out), typename
 
 def convert_quoted_value(s, index, needed, end, base, fn):
 
diff -r 50463cb7afae -r 2ee50dc501ca compiler/ast.py
--- a/compiler/ast.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/compiler/ast.py	Sat Feb 04 15:35:58 2017 +0100
@@ -502,9 +502,9 @@
         return "%s %s" % (self.expr, " ".join([("%s %s" % op) for op in self.ops]))
 
 class Const(Node):
-    def __init__(self, value, literal=None, lineno=None):
+    def __init__(self, value, literals=None, lineno=None):
         self.value = value
-        self.literal = literal
+        self.literals = literals
         self.lineno = lineno
 
     def getChildren(self):
@@ -514,7 +514,7 @@
         return ()
 
     def __repr__(self):
-        return "Const(%r, %r)" % (self.value, self.literal)
+        return "Const(%r, %r)" % (self.value, self.literals)
 
     def __str__(self):
         return repr(self.value)
diff -r 50463cb7afae -r 2ee50dc501ca compiler/transformer.py
--- a/compiler/transformer.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/compiler/transformer.py	Sat Feb 04 15:35:58 2017 +0100
@@ -669,15 +669,22 @@
 
     def decode_literal(self, lit):
         if self.encoding:
+            # NOTE: 'lit' is taken to be UTF-8 text here, so it is converted
+            # back to the declared encoding before being evaluated. Fragile:
+            # changes in compile.c:parsestr and tokenizer.c must be reflected.
+            if self.encoding != 'utf-8':
+                lit = unicode(lit, 'utf-8').encode(self.encoding)
             return eval("# coding: %s\n%s" % (self.encoding, lit))
         else:
             return eval(lit)
 
     def atom_string(self, nodelist):
         k = ''
+        literals = []
         for node in nodelist:
             k += self.decode_literal(node[1])
-        return Const(k, node[1], lineno=nodelist[0][2])
+            literals.append(node[1])
+        return Const(k, literals, lineno=nodelist[0][2])
 
     def atom_name(self, nodelist):
         return Name(nodelist[0][1], lineno=nodelist[0][2])
diff -r 50463cb7afae -r 2ee50dc501ca inspector.py
--- a/inspector.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/inspector.py	Sat Feb 04 15:35:58 2017 +0100
@@ -1407,7 +1407,7 @@
         # Constant values are independently recorded.
 
         else:
-            value, typename, encoding = self.get_constant_value(n.value, n.literal)
+            value, typename, encoding = self.get_constant_value(n.value, n.literals)
             name = get_builtin_type(typename)
             ref = self.get_builtin_class(name)
             return self.get_constant_reference(ref, value, encoding)
diff -r 50463cb7afae -r 2ee50dc501ca pyparser/pyparse.py
--- a/pyparser/pyparse.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/pyparser/pyparse.py	Sat Feb 04 15:35:58 2017 +0100
@@ -1,6 +1,13 @@
 from pyparser import parser, pytokenizer, pygram, error
 from pyparser import consts
 
+def recode_to_utf8(bytes, encoding):
+    """Return 'bytes', decoded using 'encoding', re-encoded as UTF-8."""
+    decoded = bytes.decode(encoding)
+    if isinstance(decoded, unicode):
+        return decoded.encode("utf-8")
+    raise error.SyntaxError("codec did not return a unicode object")
+
 def _normalize_encoding(encoding):
     """returns normalized name for <encoding>
 
@@ -96,6 +103,17 @@
                                         filename=compile_info.filename)
         else:
             enc = _normalize_encoding(_check_for_encoding(textsrc))
+            if enc is not None and enc != 'utf-8':
+                try:
+                    textsrc = recode_to_utf8(textsrc, enc)
+                except LookupError as e:
+                    # if the codec is not found, LookupError is raised.
+                    raise error.SyntaxError("Unknown encoding: %s" % enc,
+                                            filename=compile_info.filename)
+                # Transform unicode errors into SyntaxError
+                except UnicodeDecodeError as e:
+                    message = str(e)
+                    raise error.SyntaxError(message)
 
         flags = compile_info.flags
 
diff -r 50463cb7afae -r 2ee50dc501ca tests/unicode.py
--- a/tests/unicode.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/tests/unicode.py	Sat Feb 04 15:35:58 2017 +0100
@@ -9,6 +9,12 @@
 print s                             # ÆØÅ
 print len(s)                        # 3
 
+s1 = b"ÆØÅ" \
+      "ÆØÅ"
+print "ISO-8859-15 values:"
+print s1                            # ÆØÅÆØÅ
+print len(s1)                       # 6
+
 s2 = b"\xe6\xf8\xe5"
 print "ISO-8859-15 values:"
 print s2                            # æøå
diff -r 50463cb7afae -r 2ee50dc501ca translator.py
--- a/translator.py	Sat Feb 04 00:12:06 2017 +0100
+++ b/translator.py	Sat Feb 04 15:35:58 2017 +0100
@@ -472,7 +472,7 @@
             ref = self.get_builtin_class(name)
             return self.process_literal_sequence_node(n, name, ref, TrLiteralSequenceRef)
         else:
-            value, typename, encoding = self.get_constant_value(n.value, n.literal)
+            value, typename, encoding = self.get_constant_value(n.value, n.literals)
             name = get_builtin_type(typename)
             ref = self.get_builtin_class(name)
             value_type = ref.get_origin()