1.1 --- a/common.py	Fri Jan 27 15:27:17 2017 +0100
     1.2 +++ b/common.py	Fri Jan 27 23:54:18 2017 +0100
     1.3 @@ -20,11 +20,11 @@
     1.4  this program.  If not, see <http://www.gnu.org/licenses/>.
     1.5  """
     1.6  
     1.7 +from compiler.transformer import Transformer
     1.8  from errors import InspectError
     1.9  from os import listdir, makedirs, remove
    1.10  from os.path import exists, isdir, join, split
    1.11  from results import ConstantValueRef, LiteralSequenceRef, NameRef
    1.12 -from compiler.transformer import Transformer
    1.13  import compiler.ast
    1.14  
    1.15  class CommonOutput:
    1.16 @@ -248,7 +248,7 @@
    1.17          elif isinstance(value, str) and self.encoding:
    1.18              if not literal.startswith("b"):
    1.19                  try:
    1.20 -                    return unicode(value, self.encoding).encode("utf-8"), "unicode", self.encoding
    1.21 +                    return get_string_details(literal, self.encoding)
    1.22                  except UnicodeDecodeError:
    1.23                      pass
    1.24  
    1.25 @@ -964,6 +964,175 @@
    1.26      x.sort()
    1.27      return ", ".join(x)
    1.28  
    1.29 +def get_string_details(s, encoding):
    1.30 +
    1.31 +    """
    1.32 +    Determine whether 's' represents a Unicode string or a byte string, using
    1.33 +    'encoding' to interpret byte sequences. The contents of 's' is the full
    1.34 +    literal representation including prefix and quotes.
    1.35 +
    1.36 +    Find and convert Unicode values starting with <backslash>u or <backslash>U,
    1.37 +    and byte or Unicode values starting with <backslash><octal digit> or
    1.38 +    <backslash>x.
    1.39 +
    1.40 +    Literals prefixed with "u" cause <backslash><octal digit> and <backslash>x
    1.41 +    to be considered as Unicode values. Otherwise, they produce byte values and
    1.42 +    cause unprefixed strings to be considered as byte strings.
    1.43 +
    1.44 +    Literals prefixed with "r" do not have their backslash-encoded values
    1.45 +    converted unless also prefixed with "u", in which case only the above value
    1.46 +    formats are converted, not any of the other special sequences for things
    1.47 +    like newlines.
    1.48 +
    1.49 +    Return the encoded literal value, type name, and original encoding as a
    1.50 +    tuple.
    1.51 +    """
    1.52 +
    1.53 +    l = []
    1.54 +    typename = "unicode"
    1.55 +
    1.56 +    # Identify the quote character and use it to identify the prefix.
    1.57 +
    1.58 +    quote_type = s[-1]
    1.59 +    prefix_end = s.find(quote_type)
    1.60 +    prefix = s[:prefix_end].lower()
    1.61 +
    1.62 +    if prefix not in ("", "b", "br", "r", "u", "ur"):
    1.63 +        raise ValueError, "String literal does not have a supported prefix: %s" % s
    1.64 +
    1.65 +    # Identify triple quotes or single quotes.
    1.66 +
    1.67 +    if len(s) >= 6 and s[-2] == quote_type and s[-3] == quote_type:
    1.68 +        quote = s[prefix_end:prefix_end+3]
    1.69 +        current = prefix_end + 3
    1.70 +        end = len(s) - 3
    1.71 +    else:
    1.72 +        quote = s[prefix_end]
    1.73 +        current = prefix_end + 1
    1.74 +        end = len(s) - 1
    1.75 +
    1.76 +    # Conversions of some quoted values.
    1.77 +
    1.78 +    searches = {
    1.79 +        "u" : (6, 16),
    1.80 +        "U" : (10, 16),
    1.81 +        "x" : (4, 16),
    1.82 +        }
    1.83 +
    1.84 +    octal_digits = map(str, range(0, 8))
    1.85 +
    1.86 +    # Translations of some quoted values.
    1.87 +
    1.88 +    escaped = {
    1.89 +        "\\" : "\\", "'" : "'", '"' : '"',
    1.90 +        "a" : "\a", "b" : "\b", "f" : "\f",
    1.91 +        "n" : "\n", "r" : "\r", "t" : "\t",
    1.92 +        }
    1.93 +
    1.94 +    while current < end:
    1.95 +
    1.96 +        # Look for quoted values.
    1.97 +
    1.98 +        index = s.find("\\", current)
    1.99 +        if index == -1 or index + 1 == end:
   1.100 +            l.append(s[current:end])
   1.101 +            break
   1.102 +
   1.103 +        # Add the preceding text.
   1.104 +
   1.105 +        l.append(s[current:index])
   1.106 +
   1.107 +        # Handle quoted text.
   1.108 +
   1.109 +        term = s[index+1]
   1.110 +
   1.111 +        # Add Unicode values. Where a string is u-prefixed, even \o and \x
   1.112 +        # produce Unicode values.
   1.113 +
   1.114 +        if term in ("u", "U") or prefix == "u" and (
   1.115 +           term == "x" or term in octal_digits):
   1.116 +
   1.117 +            needed, base = searches.get(term, (4, 8))
   1.118 +            value = convert_quoted_value(s, index, needed, end, base, unichr)
   1.119 +            l.append(value)
   1.120 +            current = index + needed
   1.121 +
   1.122 +        # Add raw byte values, changing the string type.
   1.123 +
   1.124 +        elif "r" not in prefix and (
   1.125 +             term == "x" or term in octal_digits):
   1.126 +
   1.127 +            needed, base = searches.get(term, (4, 8))
   1.128 +            value = convert_quoted_value(s, index, needed, end, base, chr)
   1.129 +            l.append(value)
   1.130 +            typename = "str"
   1.131 +            current = index + needed
   1.132 +
   1.133 +        # Add other escaped values.
   1.134 +
   1.135 +        elif "r" not in prefix and escaped.has_key(term):
   1.136 +            l.append(escaped[term])
   1.137 +            current = index + 2
   1.138 +
   1.139 +        # Add other text as found.
   1.140 +
   1.141 +        else:
   1.142 +            l.append(s[index:index+2])
   1.143 +            current = index + 2
   1.144 +
   1.145 +    # For byte string values, convert any Unicode values to the original
   1.146 +    # encoding.
   1.147 +
   1.148 +    if typename == "str":
   1.149 +        out = []
   1.150 +        for value in l:
   1.151 +            if isinstance(value, unicode):
   1.152 +                out.append(value.encode(encoding))
   1.153 +            else:
   1.154 +                out.append(value)
   1.155 +        out = "".join(out)
   1.156 +
   1.157 +    # For Unicode values, convert byte sequences to Unicode.
   1.158 +
   1.159 +    else:
   1.160 +        out = []
   1.161 +        for value in l:
   1.162 +            if isinstance(value, unicode):
   1.163 +                out.append(value)
   1.164 +            else:
   1.165 +                out.append(unicode(value, encoding))
   1.166 +        out = "".join(out).encode("utf-8")
   1.167 +
   1.168 +    return out, typename, encoding
   1.169 +
   1.170 +def convert_quoted_value(s, index, needed, end, base, fn):
   1.171 +
   1.172 +    """
   1.173 +    Interpret a quoted value in 's' at 'index' with the given 'needed' number of
   1.174 +    positions, and with the given 'end' indicating the first position after the
   1.175 +    end of the actual string content.
   1.176 +
   1.177 +    Use 'base' as the numerical base when interpreting the value, and use 'fn'
   1.178 +    to convert the value to an appropriate type.
   1.179 +    """
   1.180 +
   1.181 +    s = s[index:min(index+needed, end)]
   1.182 +
   1.183 +    # Not a complete occurrence.
   1.184 +
   1.185 +    if len(s) < needed:
   1.186 +        return s
   1.187 +
   1.188 +    # Test for a well-formed value.
   1.189 +
   1.190 +    try:
   1.191 +        first = base == 8 and 1 or 2
   1.192 +        value = int(s[first:needed], base)
   1.193 +    except ValueError:
   1.194 +        return s
   1.195 +    else:
   1.196 +        return fn(value)
   1.197 +
   1.198  # Attribute chain decoding.
   1.199  
   1.200  def get_attrnames(attrnames):