Restored UTF-8 source recoding and added support for concatenated literals.

     1.1 --- a/common.py	Sat Feb 04 00:12:06 2017 +0100
     1.2 +++ b/common.py	Sat Feb 04 15:35:58 2017 +0100
     1.3 @@ -233,7 +233,7 @@
     1.4  
     1.5      # Constant and literal recording.
     1.6  
     1.7 -    def get_constant_value(self, value, literal=None):
     1.8 +    def get_constant_value(self, value, literals=None):
     1.9  
    1.10          """
    1.11          Encode the 'value' if appropriate, returning a value, a typename and any
    1.12 @@ -247,7 +247,7 @@
    1.13  
    1.14          elif isinstance(value, str) and self.encoding:
    1.15              try:
    1.16 -                return get_string_details(literal, self.encoding)
    1.17 +                return get_string_details(literals, self.encoding)
    1.18              except UnicodeDecodeError:
    1.19                  pass
    1.20  
    1.21 @@ -964,12 +964,48 @@
    1.22      x.sort()
    1.23      return ", ".join(x)
    1.24  
    1.25 -def get_string_details(s, encoding):
    1.26 +def get_string_details(literals, encoding):
    1.27  
    1.28      """
    1.29 -    Determine whether 's' represents a Unicode string or a byte string, using
    1.30 -    'encoding' to interpret byte sequences. The contents of 's' is the full
    1.31 -    literal representation including prefix and quotes.
    1.32 +    Determine whether 'literals' represent Unicode strings or byte strings,
    1.33 +    using 'encoding' to reproduce byte sequences.
    1.34 +
    1.35 +    Each literal is the full program representation including prefix and quotes
    1.36 +    recoded by the parser to UTF-8. Thus, any literal found to represent a byte
    1.37 +    string needs to be translated back to its original encoding.
    1.38 +
    1.39 +    Return a single encoded literal value, a type name, and the original
    1.40 +    encoding as a tuple.
    1.41 +    """
    1.42 +
    1.43 +    typename = "unicode"
    1.44 +
    1.45 +    l = []
    1.46 +
    1.47 +    for s in literals:
    1.48 +        out, _typename = get_literal_details(s)
    1.49 +        if _typename == "str":
    1.50 +            typename = "str"
    1.51 +        l.append(out)
    1.52 +
    1.53 +    out = "".join(l)
    1.54 +
    1.55 +    # For Unicode values, convert to the UTF-8 program representation.
    1.56 +
    1.57 +    if typename == "unicode":
    1.58 +        return out.encode("utf-8"), typename, encoding
    1.59 +
    1.60 +    # For byte string values, convert back to the original encoding.
    1.61 +
    1.62 +    else:
    1.63 +        return out.encode(encoding), typename, encoding
    1.64 +
    1.65 +def get_literal_details(s):
    1.66 +
    1.67 +    """
    1.68 +    Determine whether 's' represents a Unicode string or a byte string, where
    1.69 +    's' contains the full program representation of a literal including prefix
    1.70 +    and quotes, recoded by the parser to UTF-8.
    1.71  
    1.72      Find and convert Unicode values starting with <backslash>u or <backslash>U,
    1.73      and byte or Unicode values starting with <backslash><octal digit> or
    1.74 @@ -984,8 +1020,8 @@
    1.75      formats are converted, not any of the other special sequences for things
    1.76      like newlines.
    1.77  
    1.78 -    Return the encoded literal value, type name, and original encoding as a
    1.79 -    tuple.
    1.80 +    Return the literal value as a Unicode object together with the appropriate
    1.81 +    type name in a tuple.
    1.82      """
    1.83  
    1.84      l = []
    1.85 @@ -1085,30 +1121,19 @@
    1.86              l.append(s[index:index+2])
    1.87              current = index + 2
    1.88  
    1.89 -    # For byte string values, convert any Unicode values to the original
    1.90 -    # encoding.
    1.91 +    # Collect the components into a single Unicode object. Since the literal
    1.92 +    # text was already in UTF-8 form, interpret plain strings as UTF-8
    1.93 +    # sequences.
    1.94  
    1.95 -    if typename == "str":
    1.96 -        out = []
    1.97 -        for value in l:
    1.98 -            if isinstance(value, unicode):
    1.99 -                out.append(value.encode(encoding))
   1.100 -            else:
   1.101 -                out.append(value)
   1.102 -        out = "".join(out)
   1.103 +    out = []
   1.104  
   1.105 -    # For Unicode values, convert byte sequences to Unicode.
   1.106 +    for value in l:
   1.107 +        if isinstance(value, unicode):
   1.108 +            out.append(value)
   1.109 +        else:
   1.110 +            out.append(unicode(value, "utf-8"))
   1.111  
   1.112 -    else:
   1.113 -        out = []
   1.114 -        for value in l:
   1.115 -            if isinstance(value, unicode):
   1.116 -                out.append(value)
   1.117 -            else:
   1.118 -                out.append(unicode(value, encoding))
   1.119 -        out = "".join(out).encode("utf-8")
   1.120 -
   1.121 -    return out, typename, encoding
   1.122 +    return "".join(out), typename
   1.123  
   1.124  def convert_quoted_value(s, index, needed, end, base, fn):
   1.125  

     2.1 --- a/compiler/ast.py	Sat Feb 04 00:12:06 2017 +0100
     2.2 +++ b/compiler/ast.py	Sat Feb 04 15:35:58 2017 +0100
     2.3 @@ -502,9 +502,9 @@
     2.4          return "%s %s" % (self.expr, " ".join([("%s %s" % op) for op in self.ops]))
     2.5  
     2.6  class Const(Node):
     2.7 -    def __init__(self, value, literal=None, lineno=None):
     2.8 +    def __init__(self, value, literals=None, lineno=None):
     2.9          self.value = value
    2.10 -        self.literal = literal
    2.11 +        self.literals = literals
    2.12          self.lineno = lineno
    2.13  
    2.14      def getChildren(self):
    2.15 @@ -514,7 +514,7 @@
    2.16          return ()
    2.17  
    2.18      def __repr__(self):
    2.19 -        return "Const(%r, %r)" % (self.value, self.literal)
    2.20 +        return "Const(%r, %r)" % (self.value, self.literals)
    2.21  
    2.22      def __str__(self):
    2.23          return repr(self.value)

     3.1 --- a/compiler/transformer.py	Sat Feb 04 00:12:06 2017 +0100
     3.2 +++ b/compiler/transformer.py	Sat Feb 04 15:35:58 2017 +0100
     3.3 @@ -669,15 +669,22 @@
     3.4  
     3.5      def decode_literal(self, lit):
     3.6          if self.encoding:
     3.7 +            # this is particularly fragile & a bit of a
     3.8 +            # hack... changes in compile.c:parsestr and
     3.9 +            # tokenizer.c must be reflected here.
    3.10 +            if self.encoding != 'utf-8':
    3.11 +                lit = unicode(lit, 'utf-8').encode(self.encoding)
    3.12              return eval("# coding: %s\n%s" % (self.encoding, lit))
    3.13          else:
    3.14              return eval(lit)
    3.15  
    3.16      def atom_string(self, nodelist):
    3.17          k = ''
    3.18 +        l = []
    3.19          for node in nodelist:
    3.20              k += self.decode_literal(node[1])
    3.21 -        return Const(k, node[1], lineno=nodelist[0][2])
    3.22 +            l.append(node[1])
    3.23 +        return Const(k, l, lineno=nodelist[0][2])
    3.24  
    3.25      def atom_name(self, nodelist):
    3.26          return Name(nodelist[0][1], lineno=nodelist[0][2])

     4.1 --- a/inspector.py	Sat Feb 04 00:12:06 2017 +0100
     4.2 +++ b/inspector.py	Sat Feb 04 15:35:58 2017 +0100
     4.3 @@ -1407,7 +1407,7 @@
     4.4          # Constant values are independently recorded.
     4.5  
     4.6          else:
     4.7 -            value, typename, encoding = self.get_constant_value(n.value, n.literal)
     4.8 +            value, typename, encoding = self.get_constant_value(n.value, n.literals)
     4.9              name = get_builtin_type(typename)
    4.10              ref = self.get_builtin_class(name)
    4.11              return self.get_constant_reference(ref, value, encoding)

     5.1 --- a/pyparser/pyparse.py	Sat Feb 04 00:12:06 2017 +0100
     5.2 +++ b/pyparser/pyparse.py	Sat Feb 04 15:35:58 2017 +0100
     5.3 @@ -1,6 +1,13 @@
     5.4  from pyparser import parser, pytokenizer, pygram, error
     5.5  from pyparser import consts
     5.6  
     5.7 +def recode_to_utf8(bytes, encoding):
     5.8 +    text = bytes.decode(encoding)
     5.9 +    if not isinstance(text, unicode):
    5.10 +        raise error.SyntaxError("codec did not return a unicode object")
    5.11 +    recoded = text.encode("utf-8")
    5.12 +    return recoded
    5.13 +
    5.14  def _normalize_encoding(encoding):
    5.15      """returns normalized name for <encoding>
    5.16  
    5.17 @@ -96,6 +103,17 @@
    5.18                                          filename=compile_info.filename)
    5.19          else:
    5.20              enc = _normalize_encoding(_check_for_encoding(textsrc))
    5.21 +            if enc is not None and enc != 'utf-8':
    5.22 +                try:
    5.23 +                    textsrc = recode_to_utf8(textsrc, enc)
    5.24 +                except LookupError as e:
    5.25 +                    # if the codec is not found, LookupError is raised.
    5.26 +                    raise error.SyntaxError("Unknown encoding: %s" % enc,
    5.27 +                                            filename=compile_info.filename)
    5.28 +                # Transform unicode errors into SyntaxError
    5.29 +                except UnicodeDecodeError as e:
    5.30 +                    message = str(e)
    5.31 +                    raise error.SyntaxError(message)
    5.32  
    5.33          flags = compile_info.flags
    5.34  

     6.1 --- a/tests/unicode.py	Sat Feb 04 00:12:06 2017 +0100
     6.2 +++ b/tests/unicode.py	Sat Feb 04 15:35:58 2017 +0100
     6.3 @@ -9,6 +9,12 @@
     6.4  print s                             # ���
     6.5  print len(s)                        # 3
     6.6  
     6.7 +s1 = b"���" \
     6.8 +      "���"
     6.9 +print "ISO-8859-15 values:"
    6.10 +print s1                            # ������
    6.11 +print len(s1)                       # 6
    6.12 +
    6.13  s2 = b"\xe6\xf8\xe5"
    6.14  print "ISO-8859-15 values:"
    6.15  print s2                            # æøå

     7.1 --- a/translator.py	Sat Feb 04 00:12:06 2017 +0100
     7.2 +++ b/translator.py	Sat Feb 04 15:35:58 2017 +0100
     7.3 @@ -472,7 +472,7 @@
     7.4              ref = self.get_builtin_class(name)
     7.5              return self.process_literal_sequence_node(n, name, ref, TrLiteralSequenceRef)
     7.6          else:
     7.7 -            value, typename, encoding = self.get_constant_value(n.value, n.literal)
     7.8 +            value, typename, encoding = self.get_constant_value(n.value, n.literals)
     7.9              name = get_builtin_type(typename)
    7.10              ref = self.get_builtin_class(name)
    7.11              value_type = ref.get_origin()
2017-02-04	Paul Boddie	raw files shortlog changelog graph	Restored UTF-8 source recoding and added support for concatenated literals.
			common.py (file) compiler/ast.py (file) compiler/transformer.py (file) inspector.py (file) pyparser/pyparse.py (file) tests/unicode.py (file) translator.py (file)