1.1 --- a/lib/__builtins__/exception/unicode.py Thu Dec 15 01:40:31 2016 +0100
1.2 +++ b/lib/__builtins__/exception/unicode.py Thu Dec 15 16:09:01 2016 +0100
1.3 @@ -3,7 +3,7 @@
1.4 """
1.5 Unicode exception objects.
1.6
1.7 -Copyright (C) 2015 Paul Boddie <paul@boddie.org.uk>
1.8 +Copyright (C) 2015, 2016 Paul Boddie <paul@boddie.org.uk>
1.9
1.10 This program is free software; you can redistribute it and/or modify it under
1.11 the terms of the GNU General Public License as published by the Free Software
1.12 @@ -19,7 +19,22 @@
1.13 this program. If not, see <http://www.gnu.org/licenses/>.
1.14 """
1.15
1.16 -class UnicodeDecodeError(Exception): pass
1.17 +class UnicodeDecodeError(Exception):
1.18 +
1.19 + """
1.20 + An exception indicating a failure to interpret a byte sequence according to
1.21 + a character encoding. The offending bytes are held in the 'value' attribute.
1.22 + """
1.23 +
1.24 + def __init__(self, value):
1.25 +
1.26 + """
1.27 + Initialise the exception, storing as the 'value' attribute the illegal
1.28 + byte sequence responsible for the error.
1.29 + """
1.30 +
1.31 + self.value = value
1.32 +
1.33 class UnicodeEncodeError(Exception): pass
1.34 class UnicodeError(Exception): pass
1.35 class UnicodeTranslateError(Exception): pass
2.1 --- a/lib/__builtins__/unicode.py Thu Dec 15 01:40:31 2016 +0100
2.2 +++ b/lib/__builtins__/unicode.py Thu Dec 15 16:09:01 2016 +0100
2.3 @@ -172,7 +172,8 @@
2.4
2.5 s = s.__str__()
2.6
2.7 - # Convert the string to UTF-8.
2.8 + # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
2.9 + # needs to be validated.
2.10
2.11 to_utf8 = Converter(encoding, "UTF-8")
2.12
3.1 --- a/lib/posix/iconv.py Thu Dec 15 01:40:31 2016 +0100
3.2 +++ b/lib/posix/iconv.py Thu Dec 15 16:09:01 2016 +0100
3.3 @@ -63,7 +63,7 @@
3.4
3.5 def feed(self, s):
3.6
3.7 - "Feed 's' to the converter."
3.8 + "Feed 's' to the converter, converting its byte representation."
3.9
3.10 if self.__data__ is None:
3.11 raise ConverterError
3.12 @@ -90,6 +90,8 @@
3.13 if exc.value == EINVAL:
3.14 self.result.append(exc.arg)
3.15 return
3.16 + elif exc.value == EILSEQ:
3.17 + raise UnicodeDecodeError(exc.arg)
3.18 else:
3.19 raise
3.20
4.1 --- a/tests/iconv.py Thu Dec 15 01:40:31 2016 +0100
4.2 +++ b/tests/iconv.py Thu Dec 15 16:09:01 2016 +0100
4.3 @@ -1,7 +1,8 @@
4.4 # -*- coding: ISO-8859-1 -*-
4.5
4.6 -from posix.iconv import Converter, EILSEQ
4.7 +from posix.iconv import Converter
4.8
4.9 +only_utf8 = Converter("UTF-8", "UTF-8")
4.10 to_utf8 = Converter("ISO-8859-1", "UTF-8")
4.11 to_utf16 = Converter("ISO-8859-1", "UTF-16")
4.12 from_utf8 = Converter("UTF-8", "ISO-8859-1")
4.13 @@ -21,6 +22,12 @@
4.14 from_utf16.feed(utf16)
4.15 print str(from_utf16) # æøå
4.16
4.17 + # Convert UTF-8 to UTF-8.
4.18 +
4.19 + only_utf8.feed(utf8)
4.20 + utf8_2 = str(only_utf8)
4.21 + print utf8_2 # æøå
4.22 +
4.23 # Convert part of a UTF-16 sequence, then convert the remainder, then obtain
4.24 # the result.
4.25
4.26 @@ -52,11 +59,10 @@
4.27
4.28 try:
4.29 from_utf8.feed(iso) # should raise an exception
4.30 + except UnicodeDecodeError, exc:
4.31 + print "Not UTF-8 input:", exc.value
4.32 except OSError, exc:
4.33 - if exc.value == EILSEQ:
4.34 - print "Not UTF-8 input:", exc.arg
4.35 - else:
4.36 - print "OSError:", exc.value
4.37 + print "OSError:", exc.value
4.38
4.39 print str(from_utf8) #
4.40
4.41 @@ -70,11 +76,10 @@
4.42
4.43 try:
4.44 from_utf8.feed(utf8_2 + iso) # should raise an exception
4.45 + except UnicodeDecodeError, exc:
4.46 + print "Not UTF-8 input:", exc.value
4.47 except OSError, exc:
4.48 - if exc.value == EILSEQ:
4.49 - print "Not UTF-8 input:", exc.arg
4.50 - else:
4.51 - print "OSError:", exc.value
4.52 + print "OSError:", exc.value
4.53
4.54 print str(from_utf8) #
4.55
5.1 --- a/tests/unicode.py Thu Dec 15 01:40:31 2016 +0100
5.2 +++ b/tests/unicode.py Thu Dec 15 16:09:01 2016 +0100
5.3 @@ -37,6 +37,13 @@
5.4 print u3.encoding # ISO-8859-1
5.5 print len(u3) # 3
5.6
5.7 +# Test invalid sequences.
5.8 +
5.9 +try:
5.10 + u4 = unicode(s, "UTF-8")
5.11 +except UnicodeDecodeError, exc:
5.12 + print "Attempt to decode", s, "as UTF-8 failed."
5.13 +
5.14 # Combine bytes and text.
5.15 # The text should be decoded.
5.16