1.1 --- a/common.py Wed Dec 14 16:23:32 2016 +0100
1.2 +++ b/common.py Wed Dec 14 16:40:00 2016 +0100
1.3 @@ -24,7 +24,8 @@
1.4 from os import listdir, makedirs, remove
1.5 from os.path import exists, isdir, join, split
1.6 from results import ConstantValueRef, LiteralSequenceRef, NameRef
1.7 -import compiler
1.8 +from compiler.transformer import Transformer
1.9 +import compiler.ast
1.10
1.11 class CommonOutput:
1.12
1.13 @@ -93,6 +94,7 @@
1.14 # Inspection-related attributes.
1.15
1.16 self.astnode = None
1.17 + self.encoding = None
1.18 self.iterators = {}
1.19 self.temp = {}
1.20 self.lambdas = {}
1.21 @@ -128,7 +130,17 @@
1.22 "Parse the file with the given 'filename', initialising attributes."
1.23
1.24 self.filename = filename
1.25 - self.astnode = compiler.parseFile(filename)
1.26 +
1.27 + # Use the Transformer directly to obtain encoding information.
1.28 +
1.29 + t = Transformer()
1.30 + f = open(filename)
1.31 +
1.32 + try:
1.33 + self.astnode = t.parsesuite(f.read() + "\n")
1.34 + self.encoding = t.encoding
1.35 + finally:
1.36 + f.close()
1.37
1.38 # Module-relative naming.
1.39
1.40 @@ -221,13 +233,23 @@
1.41
1.42 # Constant and literal recording.
1.43
1.44 - def get_constant_value(self, value):
1.45 + def get_constant_value(self, value, literal=None):
1.46
1.47 - "Encode the 'value' if appropriate."
1.48 + "Encode 'value' if appropriate, returning a (value, typename) tuple; 'literal' is the constant's source text, if known."
1.49
1.50 if isinstance(value, unicode):
1.51 - value = value.encode("utf-8")
1.52 - return value
1.53 + return value.encode("utf-8"), "unicode"
1.54 +
1.55 + # Convert plain strings to text unless the literal is unknown or byte-prefixed (b or B).
1.56 +
1.57 + elif isinstance(value, str) and self.encoding:
1.58 + if literal is not None and not literal.startswith(("b", "B")):
1.59 + try:
1.60 + return unicode(value, self.encoding).encode("utf-8"), "unicode"
1.61 + except UnicodeDecodeError:
1.62 + pass
1.63 +
1.64 + return value, value.__class__.__name__
1.65
1.66 def get_constant_reference(self, ref, value):
1.67
2.1 --- a/inspector.py Wed Dec 14 16:23:32 2016 +0100
2.2 +++ b/inspector.py Wed Dec 14 16:40:00 2016 +0100
2.3 @@ -294,8 +294,7 @@
2.4 # Constant usage.
2.5
2.6 elif isinstance(n, compiler.ast.Const):
2.7 - typename = n.value.__class__.__name__
2.8 - return self.get_literal_instance(n, get_builtin_type(typename))
2.9 + return self.get_literal_instance(n)
2.10
2.11 elif isinstance(n, compiler.ast.Dict):
2.12 return self.get_literal_instance(n, "dict")
2.13 @@ -1383,30 +1382,32 @@
2.14 ref = self.get_builtin_class(name)
2.15 return self.get_constant_reference(ref, value)
2.16
2.17 - def get_literal_instance(self, n, name):
2.18 + def get_literal_instance(self, n, name=None):
2.19
2.20 - "For node 'n', return a reference to an instance of 'name'."
2.21 + """
2.22 + For node 'n', return a reference to an instance of 'name', or if 'name'
2.23 + is not specified, deduce the type from the value.
2.24 + """
2.25
2.26 # Handle stray None constants (Sliceobj seems to produce them).
2.27
2.28 - if name == "NoneType":
2.29 + if name == "NoneType" or (name is None and n.value is None):
2.30 return self.process_name_node(compiler.ast.Name("None"))
2.31
2.32 - # Get a reference to the built-in class.
2.33 -
2.34 - ref = self.get_builtin_class(name)
2.35 -
2.36 # Obtain the details of the literal itself.
2.37 # An alias to the type is generated for sequences.
2.38
2.39 if name in ("dict", "list", "tuple"):
2.40 + ref = self.get_builtin_class(name)
2.41 self.set_special_literal(name, ref)
2.42 return self.process_literal_sequence_node(n, name, ref, LiteralSequenceRef)
2.43
2.44 # Constant values are independently recorded.
2.45
2.46 else:
2.47 - value = self.get_constant_value(n.value)
2.48 + value, typename = self.get_constant_value(n.value, n.literal)
2.49 + name = get_builtin_type(typename)
2.50 + ref = self.get_builtin_class(name)
2.51 return self.get_constant_reference(ref, value)
2.52
2.53 # Special names.
3.1 --- a/tests/unicode.py Wed Dec 14 16:23:32 2016 +0100
3.2 +++ b/tests/unicode.py Wed Dec 14 16:40:00 2016 +0100
3.3 @@ -30,12 +30,12 @@
3.4
3.5 # Implicitly from string literals.
3.6
3.7 -#u3 = "æøå"
3.8 -#print u3 # æøå
3.9 -#print u3.__class__ # __builtins__.unicode.utf8string
3.10 -#print u3.encode("ISO-8859-1") # æøå
3.11 -#print u3.encoding # ISO-8859-1
3.12 -#print len(u3) # 3
3.13 +u3 = "æøå"
3.14 +print u3 # æøå
3.15 +print u3.__class__ # __builtins__.unicode.utf8string
3.16 +print u3.encode("ISO-8859-1") # æøå
3.17 +print u3.encoding # ISO-8859-1
3.18 +print len(u3) # 3
3.19
3.20 # Combine bytes and text.
3.21 # The text should be decoded.
4.1 --- a/translator.py Wed Dec 14 16:23:32 2016 +0100
4.2 +++ b/translator.py Wed Dec 14 16:40:00 2016 +0100
4.3 @@ -415,23 +415,25 @@
4.4
4.5 # Constant referencing.
4.6
4.7 - def get_literal_instance(self, n, name):
4.8 + def get_literal_instance(self, n, name=None):
4.9
4.10 """
4.11 - For node 'n', return a reference for the type of the given 'name'.
4.12 + For node 'n', return a reference for the type of the given 'name', or if
4.13 + 'name' is not specified, deduce the type from the value.
4.14 """
4.15
4.16 # Handle stray None constants (Sliceobj seems to produce them).
4.17
4.18 - if name == "NoneType":
4.19 + if name is None and n.value is None:
4.20 return self.process_name_node(compiler.ast.Name("None"))
4.21
4.22 - ref = self.get_builtin_class(name)
4.23 -
4.24 if name in ("dict", "list", "tuple"):
4.25 + ref = self.get_builtin_class(name)
4.26 return self.process_literal_sequence_node(n, name, ref, TrLiteralSequenceRef)
4.27 else:
4.28 - value = self.get_constant_value(n.value)
4.29 + value, typename = self.get_constant_value(n.value, n.literal)
4.30 + name = get_builtin_type(typename)
4.31 + ref = self.get_builtin_class(name)
4.32 value_type = ref.get_origin()
4.33
4.34 path = self.get_namespace_path()
4.35 @@ -607,7 +609,7 @@
4.36 # Constant usage.
4.37
4.38 elif isinstance(n, compiler.ast.Const):
4.39 - return self.get_literal_instance(n, n.value.__class__.__name__)
4.40 + return self.get_literal_instance(n)
4.41
4.42 elif isinstance(n, compiler.ast.Dict):
4.43 return self.get_literal_instance(n, "dict")