Added a native function that counts UTF-8 characters in order to support the __len__ method on Unicode objects. Also introduced a bytelength method on strings so that byte-level operations, such as conversion between encodings, can still work with Unicode objects, since a __len__ method returning the number of characters would be inappropriate for such purposes.

     1.1 --- a/lib/__builtins__/str.py	Wed Dec 14 00:00:21 2016 +0100
     1.2 +++ b/lib/__builtins__/str.py	Wed Dec 14 00:03:08 2016 +0100
     1.3 @@ -155,12 +155,14 @@
     1.4  
     1.5          return _negate(self.__eq__(other))
     1.6  
     1.7 -    def __len__(self):
     1.8 +    def bytelength(self):
     1.9  
    1.10 -        "Return the length of this string."
    1.11 +        "Return the number of bytes in this string."
    1.12  
    1.13          return str_len(self.__data__)
    1.14  
    1.15 +    __len__ = bytelength
    1.16 +
    1.17      def __str__(self):
    1.18  
    1.19          "Return a string representation."

     2.1 --- a/lib/__builtins__/unicode.py	Wed Dec 14 00:00:21 2016 +0100
     2.2 +++ b/lib/__builtins__/unicode.py	Wed Dec 14 00:03:08 2016 +0100
     2.3 @@ -21,7 +21,7 @@
     2.4  
     2.5  from __builtins__.str import basestring
     2.6  from posix.iconv import Converter
     2.7 -from native import str_add, isinstance as _isinstance
     2.8 +from native import str_add, unicode_len, isinstance as _isinstance
     2.9  
    2.10  class utf8string(basestring):
    2.11  
    2.12 @@ -36,6 +36,7 @@
    2.13  
    2.14          get_using(basestring.__init__, self)(other)
    2.15          self.encoding = encoding
    2.16 +        self.length = None
    2.17  
    2.18      def _binary_op(self, op, other):
    2.19  
    2.20 @@ -100,6 +101,15 @@
    2.21  
    2.22          return self._convert(self._binary_op_rev(str_add, other), other)
    2.23  
    2.24 +    def __len__(self):
    2.25 +
    2.26 +        "Return the length of this string in characters."
    2.27 +
    2.28 +        if self.length is None:
    2.29 +            self.length = unicode_len(self.__data__)
    2.30 +
    2.31 +        return self.length
    2.32 +
    2.33      def encode(self, encoding=None):
    2.34  
    2.35          """

     3.1 --- a/lib/native/__init__.py	Wed Dec 14 00:00:21 2016 +0100
     3.2 +++ b/lib/native/__init__.py	Wed Dec 14 00:03:08 2016 +0100
     3.3 @@ -47,4 +47,6 @@
     3.4  
     3.5  from native.system import exit, get_argv, get_path
     3.6  
     3.7 +from native.unicode import unicode_len
     3.8 +
     3.9  # vim: tabstop=4 expandtab shiftwidth=4

     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/lib/native/unicode.py	Wed Dec 14 00:03:08 2016 +0100
     4.3 @@ -0,0 +1,31 @@
     4.4 +#!/usr/bin/env python
     4.5 +
     4.6 +"""
     4.7 +Native library functions for Unicode objects.
     4.8 +
     4.9 +None of these are actually defined here. Instead, native implementations are
    4.10 +substituted when each program is built. It is, however, important to declare
    4.11 +non-core exceptions used by the native functions because they need to be
    4.12 +identified as being needed by the program.
    4.13 +
    4.14 +Copyright (C) 2016 Paul Boddie <paul@boddie.org.uk>
    4.15 +
    4.16 +This program is free software; you can redistribute it and/or modify it under
    4.17 +the terms of the GNU General Public License as published by the Free Software
    4.18 +Foundation; either version 3 of the License, or (at your option) any later
    4.19 +version.
    4.20 +
    4.21 +This program is distributed in the hope that it will be useful, but WITHOUT
    4.22 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    4.23 +FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
    4.24 +details.
    4.25 +
    4.26 +You should have received a copy of the GNU General Public License along with
    4.27 +this program.  If not, see <http://www.gnu.org/licenses/>.
    4.28 +"""
    4.29 +
    4.30 +# Unicode string operations.
    4.31 +
    4.32 +def unicode_len(data): pass
    4.33 +
    4.34 +# vim: tabstop=4 expandtab shiftwidth=4

     5.1 --- a/lib/posix/iconv.py	Wed Dec 14 00:00:21 2016 +0100
     5.2 +++ b/lib/posix/iconv.py	Wed Dec 14 00:03:08 2016 +0100
     5.3 @@ -73,9 +73,9 @@
     5.4          _s, start, remaining = self.state
     5.5  
     5.6          if _s:
     5.7 -            self.state = [_s + s, start, remaining + len(s)]
     5.8 +            self.state = [_s + s, start, remaining + s.bytelength()]
     5.9          else:
    5.10 -            self.state = [s, 0, len(s)]
    5.11 +            self.state = [s, 0, s.bytelength()]
    5.12  
    5.13          while True:
    5.14  

     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/templates/native/unicode.c	Wed Dec 14 00:03:08 2016 +0100
     6.3 @@ -0,0 +1,50 @@
     6.4 +/* Native functions for Unicode operations.
     6.5 +
     6.6 +Copyright (C) 2016 Paul Boddie <paul@boddie.org.uk>
     6.7 +
     6.8 +This program is free software; you can redistribute it and/or modify it under
     6.9 +the terms of the GNU General Public License as published by the Free Software
    6.10 +Foundation; either version 3 of the License, or (at your option) any later
    6.11 +version.
    6.12 +
    6.13 +This program is distributed in the hope that it will be useful, but WITHOUT
    6.14 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    6.15 +FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
    6.16 +details.
    6.17 +
    6.18 +You should have received a copy of the GNU General Public License along with
    6.19 +this program.  If not, see <http://www.gnu.org/licenses/>.
    6.20 +*/
    6.21 +
    6.22 +#include <string.h> /* strcmp, memcpy */
    6.23 +#include "native/common.h"
    6.24 +#include "types.h"
    6.25 +#include "exceptions.h"
    6.26 +#include "ops.h"
    6.27 +#include "progconsts.h"
    6.28 +#include "progops.h"
    6.29 +#include "progtypes.h"
    6.30 +#include "main.h"
    6.31 +
    6.32 +/* Unicode operations. */
    6.33 +
    6.34 +__attr __fn_native_unicode_unicode_len(__attr __args[])
    6.35 +{
    6.36 +    __attr * const _data = &__args[1];
    6.37 +    /* _data interpreted as string */
    6.38 +    char *s = _data->strvalue;
    6.39 +    int i, c = 0;
    6.40 +
    6.41 +    for (i = 0; i < _data->size; i++)
    6.42 +        if (((s[i] & 0xc0) == 0xc0) || !(s[i] & 0x80))
    6.43 +            c++;
    6.44 +
    6.45 +    /* Return the new integer. */
    6.46 +    return __new_int(c);
    6.47 +}
    6.48 +
    6.49 +/* Module initialisation. */
    6.50 +
    6.51 +void __main_native_unicode()
    6.52 +{
    6.53 +}

     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/templates/native/unicode.h	Wed Dec 14 00:03:08 2016 +0100
     7.3 @@ -0,0 +1,30 @@
     7.4 +/* Native functions for Unicode operations.
     7.5 +
     7.6 +Copyright (C) 2016 Paul Boddie <paul@boddie.org.uk>
     7.7 +
     7.8 +This program is free software; you can redistribute it and/or modify it under
     7.9 +the terms of the GNU General Public License as published by the Free Software
    7.10 +Foundation; either version 3 of the License, or (at your option) any later
    7.11 +version.
    7.12 +
    7.13 +This program is distributed in the hope that it will be useful, but WITHOUT
    7.14 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    7.15 +FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
    7.16 +details.
    7.17 +
    7.18 +You should have received a copy of the GNU General Public License along with
    7.19 +this program.  If not, see <http://www.gnu.org/licenses/>.
    7.20 +*/
    7.21 +
    7.22 +#ifndef __NATIVE_UNICODE_H__
    7.23 +#define __NATIVE_UNICODE_H__
    7.24 +
    7.25 +/* Unicode operations. */
    7.26 +
    7.27 +__attr __fn_native_unicode_unicode_len(__attr __args[]);
    7.28 +
    7.29 +/* Module initialisation. */
    7.30 +
    7.31 +void __main_native_unicode();
    7.32 +
    7.33 +#endif /* __NATIVE_UNICODE_H__ */

     8.1 --- a/tests/unicode.py	Wed Dec 14 00:00:21 2016 +0100
     8.2 +++ b/tests/unicode.py	Wed Dec 14 00:03:08 2016 +0100
     8.3 @@ -6,6 +6,7 @@
     8.4  
     8.5  s = b"���"
     8.6  print s                             # ���
     8.7 +print len(s)                        # 3
     8.8  
     8.9  # Obtain text and print it.
    8.10  
    8.11 @@ -13,22 +14,28 @@
    8.12  
    8.13  u = unicode("���", "ISO-8859-1")
    8.14  print u                             # æøå
    8.15 +print u.__class__                   # __builtins__.unicode.utf8string
    8.16  print u.encode("ISO-8859-1")        # ���
    8.17  print u.encoding                    # ISO-8859-1
    8.18 +print len(u)                        # 3
    8.19  
    8.20  # Explicitly from Unicode literals.
    8.21  
    8.22  u2 = u"���"
    8.23  print u2                            # æøå
    8.24 +print u2.__class__                  # __builtins__.unicode.utf8string
    8.25  print u2.encode("ISO-8859-1")       # ���
    8.26  print u2.encoding                   # ISO-8859-1
    8.27 +print len(u2)                       # 3
    8.28  
    8.29  # Implicitly from string literals.
    8.30  
    8.31  #u3 = "���"
    8.32  #print u3                            # æøå
    8.33 +#print u3.__class__                  # __builtins__.unicode.utf8string
    8.34  #print u3.encode("ISO-8859-1")       # ���
    8.35  #print u3.encoding                   # ISO-8859-1
    8.36 +#print len(u3)                       # 3
    8.37  
    8.38  # Combine bytes and text.
    8.39  # The text should be decoded.
    8.40 @@ -36,6 +43,7 @@
    8.41  su = s + u
    8.42  print su                            # ������
    8.43  print su.__class__                  # __builtins__.str.string
    8.44 +print len(su)                       # 6
    8.45  
    8.46  # Combine text and bytes.
    8.47  # The text should be decoded.
    8.48 @@ -43,6 +51,7 @@
    8.49  us = u + s
    8.50  print us                            # ������
    8.51  print us.__class__                  # __builtins__.str.string
    8.52 +print len(us)                       # 6
    8.53  
    8.54  # Combine text and text.
    8.55  
    8.56 @@ -50,6 +59,7 @@
    8.57  print uu2                           # æøå
    8.58  print uu2.__class__                 # __builtins__.unicode.utf8string
    8.59  print uu2.encoding                  # ISO-8859-1
    8.60 +print len(uu2)                      # 6
    8.61  
    8.62  # Inspect and update the encoding of stdout.
    8.63  # Note that su and us are byte strings and are not recoded.
2016-12-14	Paul Boddie	raw files shortlog changelog graph	Added a UTF-8 character counting native function to support the __len__ method on Unicode objects, introducing a bytelength method on strings so that byte-level operations, such as conversion between encodings, can still work with Unicode objects (since __len__ returning characters would be inappropriate for such purposes).
			lib/__builtins__/str.py (file) lib/__builtins__/unicode.py (file) lib/native/__init__.py (file) lib/native/unicode.py (file) lib/posix/iconv.py (file) templates/native/unicode.c (file) templates/native/unicode.h (file) tests/unicode.py (file)