# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1481670188 -3600
# Node ID 97f87e030e28d04a0f4c7493ede55fb73a3e1c16
# Parent  10fffa3651d9c4d7eb842478c5d3477ce8423f5c
Added a UTF-8 character counting native function to support the __len__ method
on Unicode objects, introducing a bytelength method on strings so that
byte-level operations, such as conversion between encodings, can still work
with Unicode objects (since __len__ returning characters would be inappropriate
for such purposes).

diff -r 10fffa3651d9 -r 97f87e030e28 lib/__builtins__/str.py
--- a/lib/__builtins__/str.py	Wed Dec 14 00:00:21 2016 +0100
+++ b/lib/__builtins__/str.py	Wed Dec 14 00:03:08 2016 +0100
@@ -155,12 +155,14 @@
 
         return _negate(self.__eq__(other))
 
-    def __len__(self):
+    def bytelength(self):
 
-        "Return the length of this string."
+        "Return the number of bytes in this string."
 
         return str_len(self.__data__)
 
+    __len__ = bytelength  # for plain strings the length is the byte count
+
     def __str__(self):
 
         "Return a string representation."
diff -r 10fffa3651d9 -r 97f87e030e28 lib/__builtins__/unicode.py
--- a/lib/__builtins__/unicode.py	Wed Dec 14 00:00:21 2016 +0100
+++ b/lib/__builtins__/unicode.py	Wed Dec 14 00:03:08 2016 +0100
@@ -21,7 +21,7 @@
 
 from __builtins__.str import basestring
 from posix.iconv import Converter
-from native import str_add, isinstance as _isinstance
+from native import str_add, unicode_len, isinstance as _isinstance
 
 class utf8string(basestring):
 
@@ -36,6 +36,7 @@
 
         get_using(basestring.__init__, self)(other)
         self.encoding = encoding
+        self.length = None
 
     def _binary_op(self, op, other):
 
@@ -100,6 +101,15 @@
 
         return self._convert(self._binary_op_rev(str_add, other), other)
 
+    def __len__(self):
+
+        "Return the length of this string in characters."
+
+        if self.length is None:  # not counted yet: count lazily on first request
+            self.length = unicode_len(self.__data__)  # cache the character count
+
+        return self.length
+
     def encode(self, encoding=None):
 
         """
diff -r 10fffa3651d9 -r 97f87e030e28 lib/native/__init__.py
--- a/lib/native/__init__.py	Wed Dec 14 00:00:21 2016 +0100
+++ b/lib/native/__init__.py	Wed Dec 14 00:03:08 2016 +0100
@@ -47,4 +47,6 @@
 
 from native.system import exit, get_argv, get_path
 
+from native.unicode import unicode_len
+
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 10fffa3651d9 -r 97f87e030e28 lib/native/unicode.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/native/unicode.py	Wed Dec 14 00:03:08 2016 +0100
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+"""
+Native library functions for Unicode objects.
+
+None of these are actually defined here. Instead, native implementations are
+substituted when each program is built. It is, however, important to declare
+non-core exceptions used by the native functions because they need to be
+identified as being needed by the program.
+
+Copyright (C) 2016 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+# Unicode string operations.
+
+def unicode_len(data): pass  # stub only: a native implementation is substituted at build time
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r 10fffa3651d9 -r 97f87e030e28 lib/posix/iconv.py
--- a/lib/posix/iconv.py	Wed Dec 14 00:00:21 2016 +0100
+++ b/lib/posix/iconv.py	Wed Dec 14 00:03:08 2016 +0100
@@ -73,9 +73,9 @@
         _s, start, remaining = self.state
 
         if _s:
-            self.state = [_s + s, start, remaining + len(s)]
+            self.state = [_s + s, start, remaining + s.bytelength()]
         else:
-            self.state = [s, 0, len(s)]
+            self.state = [s, 0, s.bytelength()]
 
         while True:
 
diff -r 10fffa3651d9 -r 97f87e030e28 templates/native/unicode.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/templates/native/unicode.c	Wed Dec 14 00:03:08 2016 +0100
@@ -0,0 +1,50 @@
+/* Native functions for Unicode operations.
+
+Copyright (C) 2016 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <string.h> /* strcmp, memcpy */
+#include "native/common.h"
+#include "types.h"
+#include "exceptions.h"
+#include "ops.h"
+#include "progconsts.h"
+#include "progops.h"
+#include "progtypes.h"
+#include "main.h"
+
+/* Unicode operations. */
+
+__attr __fn_native_unicode_unicode_len(__attr __args[])
+{
+    __attr * const _data = &__args[1];
+    /* _data interpreted as string: _data->size bytes, assumed to be UTF-8 text */
+    char *s = _data->strvalue;
+    int i, c = 0; /* i: byte index; c: characters counted so far */
+
+    for (i = 0; i < _data->size; i++)
+        if (((s[i] & 0xc0) == 0xc0) || !(s[i] & 0x80)) /* 11xxxxxx lead byte or 0xxxxxxx ASCII; 10xxxxxx bytes are not counted */
+            c++;
+
+    /* Return the character count as a new integer. */
+    return __new_int(c);
+}
+
+/* Module initialisation. */
+
+void __main_native_unicode() /* this module has nothing to initialise */
+{
+}
diff -r 10fffa3651d9 -r 97f87e030e28 templates/native/unicode.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/templates/native/unicode.h	Wed Dec 14 00:03:08 2016 +0100
@@ -0,0 +1,30 @@
+/* Native functions for Unicode operations.
+
+Copyright (C) 2016 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __NATIVE_UNICODE_H__
+#define __NATIVE_UNICODE_H__
+
+/* Unicode operations. */
+
+__attr __fn_native_unicode_unicode_len(__attr __args[]);
+
+/* Module initialisation. */
+
+void __main_native_unicode();
+
+#endif /* __NATIVE_UNICODE_H__ */
diff -r 10fffa3651d9 -r 97f87e030e28 tests/unicode.py
--- a/tests/unicode.py	Wed Dec 14 00:00:21 2016 +0100
+++ b/tests/unicode.py	Wed Dec 14 00:03:08 2016 +0100
@@ -6,6 +6,7 @@
 
 s = b"ЖШЕ"
 print s                             # ЖШЕ
+print len(s)                        # 3
 
 # Obtain text and print it.
 
@@ -13,22 +14,28 @@
 
 u = unicode("жше", "ISO-8859-1")
 print u                             # Г¦ГёГҐ
+print u.__class__                   # __builtins__.unicode.utf8string
 print u.encode("ISO-8859-1")        # жше
 print u.encoding                    # ISO-8859-1
+print len(u)                        # 3
 
 # Explicitly from Unicode literals.
 
 u2 = u"жше"
 print u2                            # Г¦ГёГҐ
+print u2.__class__                  # __builtins__.unicode.utf8string
 print u2.encode("ISO-8859-1")       # жше
 print u2.encoding                   # ISO-8859-1
+print len(u2)                       # 3
 
 # Implicitly from string literals.
 
 #u3 = "жше"
 #print u3                            # Г¦ГёГҐ
+#print u3.__class__                  # __builtins__.unicode.utf8string
 #print u3.encode("ISO-8859-1")       # жше
 #print u3.encoding                   # ISO-8859-1
+#print len(u3)                       # 3
 
 # Combine bytes and text.
 # The text should be decoded.
@@ -36,6 +43,7 @@
 su = s + u
 print su                            # ЖШЕжше
 print su.__class__                  # __builtins__.str.string
+print len(su)                       # 6
 
 # Combine text and bytes.
 # The text should be decoded.
@@ -43,6 +51,7 @@
 us = u + s
 print us                            # жшеЖШЕ
 print us.__class__                  # __builtins__.str.string
+print len(us)                       # 6
 
 # Combine text and text.
 
@@ -50,6 +59,7 @@
 print uu2                           # Г¦ГёГҐ
 print uu2.__class__                 # __builtins__.unicode.utf8string
 print uu2.encoding                  # ISO-8859-1
+print len(uu2)                      # 6
 
 # Inspect and update the encoding of stdout.
 # Note that su and us are byte strings and are not recoded.