# HG changeset patch # User Paul Boddie # Date 1481563840 -3600 # Node ID ef11d754296983e796f281c209e5149b88e5d201 # Parent e11d55280718a1773403bb52bcf6a67a38250881 Support incremental conversion of strings, handling incomplete sequences of characters, employing the string representation method to obtain the result. Added a native function to reset the iconv context. Added tests of feeding incomplete sequences followed by the remaining bytes. diff -r e11d55280718 -r ef11d7542969 lib/native/__init__.py --- a/lib/native/__init__.py Mon Dec 12 18:00:17 2016 +0100 +++ b/lib/native/__init__.py Mon Dec 12 18:30:40 2016 +0100 @@ -29,7 +29,7 @@ from native.introspection import object_getattr, isinstance, issubclass -from native.iconv import iconv, iconv_close, iconv_open +from native.iconv import iconv, iconv_close, iconv_open, iconv_reset from native.io import fclose, fopen, fdopen, close, read, write, fread, fwrite diff -r e11d55280718 -r ef11d7542969 lib/native/iconv.py --- a/lib/native/iconv.py Mon Dec 12 18:00:17 2016 +0100 +++ b/lib/native/iconv.py Mon Dec 12 18:30:40 2016 +0100 @@ -24,13 +24,15 @@ this program. If not, see <http://www.gnu.org/licenses/>. """ -def iconv_open(tocode, fromcode): +def iconv(cd, state): OSError def iconv_close(cd): OSError -def iconv(cd, instr, state): +def iconv_open(tocode, fromcode): OSError +def iconv_reset(cd): pass + # vim: tabstop=4 expandtab shiftwidth=4 diff -r e11d55280718 -r ef11d7542969 lib/posix/iconv.py --- a/lib/posix/iconv.py Mon Dec 12 18:00:17 2016 +0100 +++ b/lib/posix/iconv.py Mon Dec 12 18:30:40 2016 +0100 @@ -20,7 +20,12 @@ """ from __builtins__.types import check_int, check_string -from native import iconv_close, iconv_open, iconv +from native import iconv, iconv_close, iconv_open, iconv_reset + +# Errors produced by iconv. + +EINVAL = 22 +EILSEQ = 84 class ConverterError(Exception): @@ -28,10 +33,6 @@ pass -E2BIG = 7 -EINVAL = 22 -EILSEQ = 84 - class Converter: "A character set converter."
@@ -43,6 +44,15 @@ check_string(from_encoding) check_string(to_encoding) self.__data__ = iconv_open(to_encoding, from_encoding) + self.reset() + + def reset(self): + + "Reset the state of the converter." + + self.state = ["", 0, 0] + self.result = [] + iconv_reset(self.__data__) def close(self): @@ -51,29 +61,53 @@ iconv_close(self.__data__) self.__data__ = None - def convert(self, s): + def feed(self, s): - "Convert 's' between the converter's encodings." + "Feed 's' to the converter." if self.__data__ is None: raise ConverterError check_string(s) - result = [] - state = [0, len(s)] + _s, start, remaining = self.state + + if _s: + self.state = [_s + s, start, remaining + len(s)] + else: + self.state = [s, 0, len(s)] while True: # Obtain converted text and update the state. - out = iconv(self.__data__, s, state) - result.append(out) + try: + out = iconv(self.__data__, self.state) + + # Incomplete input does not cause an exception. + + except OSError, exc: + if exc.value == EINVAL: + self.result.append(exc.arg) + return + else: + raise + + # Add any returned text to the result. + + self.result.append(out) # Test for the end of the conversion. - start, remaining = state + _s, start, remaining = self.state + if not remaining: - return "".join(result) + return + + def __str__(self): + + "Return the value of the converted string." 
+ + return "".join(self.result) # vim: tabstop=4 expandtab shiftwidth=4 diff -r e11d55280718 -r ef11d7542969 templates/native/iconv.c --- a/templates/native/iconv.c Mon Dec 12 18:00:17 2016 +0100 +++ b/templates/native/iconv.c Mon Dec 12 18:30:40 2016 +0100 @@ -35,19 +35,17 @@ __attr __fn_native_iconv_iconv(__attr __args[]) { __attr * const cd = &__args[1]; - __attr * const instr = &__args[2]; - __attr * const state = &__args[3]; + __attr * const state = &__args[2]; /* cd interpreted as iconv_t */ iconv_t c = (iconv_t) cd->datavalue; - /* instr.__data__ interpreted as string */ - char *inbuf = __load_via_object(instr->value, __pos___data__).strvalue; /* state.__data__ interpreted as list */ __fragment *f = __load_via_object(state->value, __pos___data__).seqvalue; - /* Obtain the start position from the state. */ + /* Obtain the string, start position, and remaining bytes from the state. */ - int start = __load_via_object(f->attrs[0].value, __pos___data__).intvalue; - int remaining = __load_via_object(f->attrs[1].value, __pos___data__).intvalue; + char *inbuf = __load_via_object(f->attrs[0].value, __pos___data__).strvalue; + int start = __load_via_object(f->attrs[1].value, __pos___data__).intvalue; + int remaining = __load_via_object(f->attrs[2].value, __pos___data__).intvalue; /* Allocate a string for the output buffer using the remaining input size as a guide. */ @@ -69,7 +67,7 @@ /* Return any string. */ - if ((result != -1) || (errno == E2BIG)) + if ((result != -1) || (errno == E2BIG) || (errno == EINVAL)) { outbytestotal = outbufsize - outbytesleft; resultbuf = __ALLOCATE(outbytestotal + 1, sizeof(char)); @@ -77,8 +75,14 @@ /* Mutate the state to indicate the next input buffer position. */ - f->attrs[0] = __new_int(start + remaining - inbytesleft); - f->attrs[1] = __new_int(inbytesleft); + f->attrs[1] = __new_int(start + remaining - inbytesleft); + f->attrs[2] = __new_int(inbytesleft); + + /* Incomplete sequence: raise the string in an OSError instead. 
*/ + + if (errno == EINVAL) + __raise_os_error(__new_int(errno), __new_str(resultbuf, outbytestotal)); + return __new_str(resultbuf, outbytestotal); } @@ -91,15 +95,6 @@ __raise_os_error(__new_int(errno), __new_str(resultbuf, inbytesleft)); } - /* Incomplete sequence. */ - - else if (errno == EINVAL) - { - resultbuf = __ALLOCATE(inbytesleft + 1, sizeof(char)); - memcpy(resultbuf, inbuf, inbytesleft); - __raise_os_error(__new_int(errno), __new_str(resultbuf, inbytesleft)); - } - /* General failure. */ else @@ -144,6 +139,16 @@ return attr; } +__attr __fn_native_iconv_iconv_reset(__attr __args[]) +{ + __attr * const cd = &__args[1]; + /* cd interpreted as iconv_t */ + iconv_t c = (iconv_t) cd->datavalue; + + iconv(c, NULL, NULL, NULL, NULL); + return __builtins___none_None; +} + /* Module initialisation. */ void __main_native_iconv() diff -r e11d55280718 -r ef11d7542969 templates/native/iconv.h --- a/templates/native/iconv.h Mon Dec 12 18:00:17 2016 +0100 +++ b/templates/native/iconv.h Mon Dec 12 18:30:40 2016 +0100 @@ -26,6 +26,7 @@ __attr __fn_native_iconv_iconv(__attr __args[]); __attr __fn_native_iconv_iconv_close(__attr __args[]); __attr __fn_native_iconv_iconv_open(__attr __args[]); +__attr __fn_native_iconv_iconv_reset(__attr __args[]); /* Module initialisation. */ diff -r e11d55280718 -r ef11d7542969 tests/iconv.py --- a/tests/iconv.py Mon Dec 12 18:00:17 2016 +0100 +++ b/tests/iconv.py Mon Dec 12 18:30:40 2016 +0100 @@ -8,22 +8,44 @@ from_utf16 = Converter("UTF-16", "ISO-8859-1") try: - try: - iso = "æøå" - print iso # æøå - utf = to_utf8.convert(iso) - print utf # æøå - print from_utf8.convert(utf) # æøå - utf = to_utf16.convert(iso) - print utf # ... 
- print from_utf16.convert(utf) # æøå - except OSError, exc: - if exc.value == EINVAL: - print "Incomplete input", exc.arg - elif exc.value == EILSEQ: - print "Invalid input", exc.arg - else: - print exc.value, exc.arg + iso = "æøå" + print iso # æøå + to_utf8.feed(iso) + utf8 = str(to_utf8) + print utf8 # æøå + from_utf8.feed(utf8) + print str(from_utf8) # æøå + to_utf16.feed(iso) + utf16 = str(to_utf16) + print utf16 # ... + from_utf16.feed(utf16) + print str(from_utf16) # æøå + + # Convert part of a UTF-16 sequence, then convert the remainder, then obtain + # the result. + + first = utf16[:3] + second = utf16[3:] + + from_utf16.reset() + print "first:", first # ... + from_utf16.feed(first) # should have handled an incomplete input + print "second:", second # ... + from_utf16.feed(second) # should have handled the complete input + print str(from_utf16) # æøå + + # Convert part of a UTF-8 sequence, then the remainder, then get the result. + + first = utf8[:3] + second = utf8[3:] + + from_utf8.reset() + print "first:", first # æà + from_utf8.feed(first) # should have handled an incomplete input + print "second:", second # ¸Ã¥ + from_utf8.feed(second) # should have handled the complete input + print str(from_utf8) # æøå + finally: to_utf8.close() to_utf16.close()