# HG changeset patch # User Paul Boddie # Date 1481730000 -3600 # Node ID acd579cd51896e44803db37a514c35c439deb432 # Parent 25e43e32ab7e2148b4d9a9efbc33e481ccb192ae Attempt to interpret plain string literals as Unicode, returning constant types when encoding their values. Propagated the file encoding from the parser to each module in order to attempt string value interpretations. diff -r 25e43e32ab7e -r acd579cd5189 common.py --- a/common.py Wed Dec 14 16:23:32 2016 +0100 +++ b/common.py Wed Dec 14 16:40:00 2016 +0100 @@ -24,7 +24,8 @@ from os import listdir, makedirs, remove from os.path import exists, isdir, join, split from results import ConstantValueRef, LiteralSequenceRef, NameRef -import compiler +from compiler.transformer import Transformer +import compiler.ast class CommonOutput: @@ -93,6 +94,7 @@ # Inspection-related attributes. self.astnode = None + self.encoding = None self.iterators = {} self.temp = {} self.lambdas = {} @@ -128,7 +130,17 @@ "Parse the file with the given 'filename', initialising attributes." self.filename = filename - self.astnode = compiler.parseFile(filename) + + # Use the Transformer directly to obtain encoding information. + + t = Transformer() + f = open(filename) + + try: + self.astnode = t.parsesuite(f.read() + "\n") + self.encoding = t.encoding + finally: + f.close() # Module-relative naming. @@ -221,13 +233,23 @@ # Constant and literal recording. - def get_constant_value(self, value): + def get_constant_value(self, value, literal=None): - "Encode the 'value' if appropriate." + "Encode the 'value' if appropriate, returning a value and typename." if isinstance(value, unicode): - value = value.encode("utf-8") - return value + return value.encode("utf-8"), "unicode" + + # Attempt to convert plain strings to text. + + elif isinstance(value, str) and self.encoding: + if not literal.startswith("b"): + try: + return unicode(value, self.encoding).encode("utf-8"), "unicode" + except UnicodeDecodeError: + pass + + return value, value.__class__.__name__ def get_constant_reference(self, ref, value): diff -r 25e43e32ab7e -r acd579cd5189 inspector.py --- a/inspector.py Wed Dec 14 16:23:32 2016 +0100 +++ b/inspector.py Wed Dec 14 16:40:00 2016 +0100 @@ -294,8 +294,7 @@ # Constant usage. elif isinstance(n, compiler.ast.Const): - typename = n.value.__class__.__name__ - return self.get_literal_instance(n, get_builtin_type(typename)) + return self.get_literal_instance(n) elif isinstance(n, compiler.ast.Dict): return self.get_literal_instance(n, "dict") @@ -1383,30 +1382,32 @@ ref = self.get_builtin_class(name) return self.get_constant_reference(ref, value) - def get_literal_instance(self, n, name): + def get_literal_instance(self, n, name=None): - "For node 'n', return a reference to an instance of 'name'." + """ + For node 'n', return a reference to an instance of 'name', or if 'name' + is not specified, deduce the type from the value. + """ # Handle stray None constants (Sliceobj seems to produce them). if name == "NoneType": return self.process_name_node(compiler.ast.Name("None")) - # Get a reference to the built-in class. - - ref = self.get_builtin_class(name) - # Obtain the details of the literal itself. # An alias to the type is generated for sequences. if name in ("dict", "list", "tuple"): + ref = self.get_builtin_class(name) self.set_special_literal(name, ref) return self.process_literal_sequence_node(n, name, ref, LiteralSequenceRef) # Constant values are independently recorded. else: - value = self.get_constant_value(n.value) + value, typename = self.get_constant_value(n.value, n.literal) + name = get_builtin_type(typename) + ref = self.get_builtin_class(name) return self.get_constant_reference(ref, value) # Special names. diff -r 25e43e32ab7e -r acd579cd5189 tests/unicode.py --- a/tests/unicode.py Wed Dec 14 16:23:32 2016 +0100 +++ b/tests/unicode.py Wed Dec 14 16:40:00 2016 +0100 @@ -30,12 +30,12 @@ # Implicitly from string literals. -#u3 = "æøå" -#print u3 # æøå -#print u3.__class__ # __builtins__.unicode.utf8string -#print u3.encode("ISO-8859-1") # æøå -#print u3.encoding # ISO-8859-1 -#print len(u3) # 3 +u3 = "æøå" +print u3 # æøå +print u3.__class__ # __builtins__.unicode.utf8string +print u3.encode("ISO-8859-1") # æøå +print u3.encoding # ISO-8859-1 +print len(u3) # 3 # Combine bytes and text. # The text should be decoded. diff -r 25e43e32ab7e -r acd579cd5189 translator.py --- a/translator.py Wed Dec 14 16:23:32 2016 +0100 +++ b/translator.py Wed Dec 14 16:40:00 2016 +0100 @@ -415,23 +415,25 @@ # Constant referencing. - def get_literal_instance(self, n, name): + def get_literal_instance(self, n, name=None): """ - For node 'n', return a reference for the type of the given 'name'. + For node 'n', return a reference for the type of the given 'name', or if + 'name' is not specified, deduce the type from the value. """ # Handle stray None constants (Sliceobj seems to produce them). - if name == "NoneType": + if name is None and n.value is None: return self.process_name_node(compiler.ast.Name("None")) - ref = self.get_builtin_class(name) - if name in ("dict", "list", "tuple"): + ref = self.get_builtin_class(name) return self.process_literal_sequence_node(n, name, ref, TrLiteralSequenceRef) else: - value = self.get_constant_value(n.value) + value, typename = self.get_constant_value(n.value, n.literal) + name = get_builtin_type(typename) + ref = self.get_builtin_class(name) value_type = ref.get_origin() path = self.get_namespace_path() @@ -607,7 +609,7 @@ # Constant usage. elif isinstance(n, compiler.ast.Const): - return self.get_literal_instance(n, n.value.__class__.__name__) + return self.get_literal_instance(n) elif isinstance(n, compiler.ast.Dict): return self.get_literal_instance(n, "dict")