# HG changeset patch # User Paul Boddie # Date 1481653163 -3600 # Node ID 3130610deb6787ec0cafef5a39c0ffc8fa795aef # Parent 535f437d592e60a743d5c9f0470b81852fcb7868 Fixed "reversed operands" string addition, imposing constraints on the operands so that only byte strings can be combined with each other. Added Unicode binary operator support in order to combine Unicode objects with each other and with byte strings. Added an original encoding attribute to Unicode objects. Tested addition of the different kinds of strings. diff -r 535f437d592e -r 3130610deb67 lib/__builtins__/str.py --- a/lib/__builtins__/str.py Tue Dec 13 17:58:26 2016 +0100 +++ b/lib/__builtins__/str.py Tue Dec 13 19:19:23 2016 +0100 @@ -74,12 +74,31 @@ def _binary_op(self, op, other): - "Perform 'op' on this int and 'other' if appropriate." + "Perform 'op' on this object and 'other' if appropriate." + + # Refuse to operate on specialisations of this class. + + if self.__class__ is not other.__class__: + return NotImplemented + + # Otherwise, perform the operation on the operands' data. + + else: + return op(self.__data__, other.__data__) - if isinstance(other, basestring): - return op(self.__data__, other.__data__) + def _binary_op_rev(self, op, other): + + "Perform 'op' on 'other' and this object if appropriate." + + # Refuse to operate on specialisations of this class. + + if self.__class__ is not other.__class__: + return NotImplemented + + # Otherwise, perform the operation on the operands' data. + else: - return NotImplemented + return op(other.__data__, self.__data__) def __iadd__(self, other): @@ -87,7 +106,13 @@ return self._binary_op(str_add, other) - __add__ = __radd__ = __iadd__ + __add__ = __iadd__ + + def __radd__(self, other): + + "Return a string combining this string with 'other'." 
+ + return self._binary_op_rev(str_add, other) def __mul__(self, other): pass def __rmul__(self, other): pass diff -r 535f437d592e -r 3130610deb67 lib/__builtins__/unicode.py --- a/lib/__builtins__/unicode.py Tue Dec 13 17:58:26 2016 +0100 +++ b/lib/__builtins__/unicode.py Tue Dec 13 19:19:23 2016 +0100 @@ -21,14 +21,72 @@ from __builtins__.str import basestring from posix.iconv import Converter +from native import str_add, isinstance as _isinstance class utf8string(basestring): "A character string representation based on UTF-8." - def encode(self, encoding): + def __init__(self, other=None, encoding=None): + + """ + Initialise the string, perhaps from 'other', with any original + 'encoding' indicated. + """ + + get_using(basestring.__init__, self)(other) + self.encoding = encoding + + def _binary_op(self, op, other): + + "Perform 'op' on this object and 'other' if appropriate." + + # Reject non-strings. + + if not _isinstance(other, basestring): + return NotImplemented + + # Combining text with bytes. + + elif not _isinstance(other, utf8string): + s = self.encode() + return op(s.__data__, other.__data__) + + # Otherwise, perform the operation on the operands' data. + + else: + return op(self.__data__, other.__data__) - "Encode the string to the given 'encoding'." + def _binary_op_rev(self, op, other): + + "Perform 'op' on 'other' and this object if appropriate." + + # Reject non-strings. + + if not _isinstance(other, basestring): + return NotImplemented + + # Combining text with bytes. + + elif not _isinstance(other, utf8string): + s = self.encode() + return op(other.__data__, s.__data__) + + # Otherwise, perform the operation on the operands' data. + + else: + return op(other.__data__, self.__data__) + + def encode(self, encoding=None): + + """ + Encode the string to the given 'encoding' or any original encoding if + omitted. 
+ """ + + encoding = encoding or self.encoding + if not encoding: + return self from_utf8 = Converter("UTF-8", encoding) @@ -56,7 +114,7 @@ try: to_utf8.feed(s) - return utf8string(str(to_utf8)) + return utf8string(str(to_utf8), encoding) finally: to_utf8.close() diff -r 535f437d592e -r 3130610deb67 tests/unicode.py --- a/tests/unicode.py Tue Dec 13 17:58:26 2016 +0100 +++ b/tests/unicode.py Tue Dec 13 19:19:23 2016 +0100 @@ -4,14 +4,14 @@ # Print bytes. -s = b"æøå" -print s # æøå +s = b"ÆØÅ" +print s # ÆØÅ # Obtain text and print it. # Explicitly from bytes. -u = unicode(s, "ISO-8859-1") +u = unicode("æøå", "ISO-8859-1") print u # æøå print u.encode("ISO-8859-1") # æøå @@ -27,8 +27,22 @@ #print u3 # æøå #print u3.encode("ISO-8859-1") # æøå +# Combine bytes and text. +# The text should be encoded. + +su = s + u +print su # ÆØÅæøå + +# Combine text and bytes. +# The text should be encoded. + +us = u + s +print us # æøåÆØÅ + # Inspect and update the encoding of stdout. print sys.stdout.encoding # None sys.stdout.encoding = "ISO-8859-1" print u # æøå +print su # ÆØÅæøå +print us # æøåÆØÅ