# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1483737832 -3600
# Node ID e143933a630c10f5645d6108e29a09dfb3dc1ae6
# Parent  74fa20c041351fe41117cd8f550b3bcbd302772e
Added character access to Unicode objects, moving special item access methods
in basestring into the specific string class, providing separate versions in the
utf8string class.

diff -r 74fa20c04135 -r e143933a630c lib/__builtins__/str.py
--- a/lib/__builtins__/str.py	Mon Dec 19 00:26:49 2016 +0100
+++ b/lib/__builtins__/str.py	Fri Jan 06 22:23:52 2017 +0100
@@ -262,6 +262,10 @@
     def strip(self, chars=None): pass
     def upper(self): pass
 
+class string(basestring):
+
+    "A plain string of bytes."
+
     # Special implementation methods.
 
     def __get_single_item__(self, index):
@@ -290,12 +294,6 @@
 
         return str_substr(self.__data__, start, end, step)
 
-class string(basestring):
-
-    "A plain string of bytes."
-
-    pass
-
 def str(obj):
 
     "Return the string representation of 'obj'."
diff -r 74fa20c04135 -r e143933a630c lib/__builtins__/unicode.py
--- a/lib/__builtins__/unicode.py	Mon Dec 19 00:26:49 2016 +0100
+++ b/lib/__builtins__/unicode.py	Fri Jan 06 22:23:52 2017 +0100
@@ -20,8 +20,10 @@
 """
 
 from __builtins__.str import basestring
+from __builtins__.types import check_int
 from posix.iconv import Converter
-from native import str_add, unicode_len, isinstance as _isinstance
+from native import str_add, unicode_len, unicode_substr, \
+                   isinstance as _isinstance
 
 class utf8string(basestring):
 
@@ -161,6 +163,34 @@
             s.encoding = encoding
         return s
 
+    # Special implementation methods.
+
+    def __get_single_item__(self, index):
+    
+        "Return a utf8string, with this string's encoding, for the character at the normalised (positive) 'index'."
+    
+        self._check_index(index)
+        return utf8string(unicode_substr(self.__data__, index, index + 1, 1), self.encoding)
+
+    def __get_multiple_items__(self, start, end, step):
+
+        """
+        Return a utf8string of the items from 'start' until (but excluding)
+        'end', at 'step' intervals. Raise ValueError if 'step' is zero.
+        """
+
+        self._check_index(start)
+        self._check_end_index(end)
+        check_int(step)
+
+        if step == 0:
+            raise ValueError(step)
+
+        if start == end:
+            return utf8string("", self.encoding) # empty, but keep type and encoding
+
+        return utf8string(unicode_substr(self.__data__, start, end, step), self.encoding)
+
 def unicode(s, encoding):
 
     "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'."
diff -r 74fa20c04135 -r e143933a630c lib/native/__init__.py
--- a/lib/native/__init__.py	Mon Dec 19 00:26:49 2016 +0100
+++ b/lib/native/__init__.py	Fri Jan 06 22:23:52 2017 +0100
@@ -47,6 +47,6 @@
 
 from native.system import exit, get_argv, get_path
 
-from native.unicode import unicode_len
+from native.unicode import unicode_len, unicode_substr
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 74fa20c04135 -r e143933a630c lib/native/unicode.py
--- a/lib/native/unicode.py	Mon Dec 19 00:26:49 2016 +0100
+++ b/lib/native/unicode.py	Fri Jan 06 22:23:52 2017 +0100
@@ -27,5 +27,6 @@
 # Unicode string operations.
 
 def unicode_len(data): pass
+def unicode_substr(data, start, end, step): pass # native stub; presumably backed by templates/native/unicode.c
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 74fa20c04135 -r e143933a630c templates/native/unicode.c
--- a/templates/native/unicode.c	Mon Dec 19 00:26:49 2016 +0100
+++ b/templates/native/unicode.c	Fri Jan 06 22:23:52 2017 +0100
@@ -16,7 +16,6 @@
 this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-#include <string.h> /* strcmp, memcpy */
 #include "native/common.h"
 #include "types.h"
 #include "exceptions.h"
@@ -26,6 +25,39 @@
 #include "progtypes.h"
 #include "main.h"
 
+static inline int boundary(char c) /* non-zero if 'c' starts a character */
+{
+    return !(c & 0x80) || ((c & 0xc0) == 0xc0); /* top bit clear, or top two bits set */
+}
+
+/* Return the byte offset of the character start following 'bytestart' in 's'
+   (holding 'size' bytes), or 'size' when no further character begins. */
+static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart)
+{
+    unsigned int i = bytestart;
+
+    /* Check the bound before reading so that s[size] is never accessed. */
+    if (i < size)
+        for (i++; i < size && !boundary(s[i]); i++)
+            ;
+
+    return i;
+}
+
+/* Return the byte offset in 's' of the closest character start preceding
+   'bytestart'. The result is 0 when 'bytestart' is 0 or when no earlier
+   character start is found. */
+static unsigned int prevpos(char *s, unsigned int bytestart)
+{
+    unsigned int pos = bytestart;
+
+    /* Step back a byte at a time until a character start is reached. */
+    while (pos > 0 && !boundary(s[--pos]))
+        ;
+
+    return pos;
+}
+
 /* Unicode operations. */
 
 __attr __fn_native_unicode_unicode_len(__attr __args[])
@@ -33,16 +65,92 @@
     __attr * const _data = &__args[1];
     /* _data interpreted as string */
     char *s = _data->strvalue;
-    int i, c = 0;
+    unsigned int i, c = 0;
 
     for (i = 0; i < _data->size; i++)
-        if (((s[i] & 0xc0) == 0xc0) || !(s[i] & 0x80))
+        if (boundary(s[i]))
             c++;
 
     /* Return the new integer. */
     return __new_int(c);
 }
 
+/* Return a new string holding the characters of '_data' selected by character
+   (not byte) positions from 'start' up to but excluding 'end', at 'step'
+   intervals. 'step' is used as a divisor and so must be non-zero. */
+__attr __fn_native_unicode_unicode_substr(__attr __args[])
+{
+    __attr * const _data = &__args[1];
+    __attr * const start = &__args[2];
+    __attr * const end = &__args[3];
+    __attr * const step = &__args[4];
+    /* _data interpreted as string */
+    char *s = _data->strvalue, *sub;
+    /* start.__data__ interpreted as int */
+    int istart = __load_via_object(start->value, __pos___data__).intvalue;
+    /* end.__data__ interpreted as int */
+    int iend = __load_via_object(end->value, __pos___data__).intvalue;
+    /* step.__data__ interpreted as int */
+    int istep = __load_via_object(step->value, __pos___data__).intvalue;
+
+    /* Calculate the number of characters, treating a range that is empty for
+       the direction of travel as zero rather than a negative/wrapped count. */
+    int empty = istep > 0 ? istart >= iend : istart <= iend;
+    size_t nchar = empty ? 0 : ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1;
+    unsigned int indexes[nchar ? nchar : 1]; /* never a zero-length array */
+
+    unsigned int c, d, i, to, from, stop, lastbyte = 0;
+    size_t resultsize = 0;
+
+    /* Get the first byte position. */
+    for (c = 0; c < istart; c++)
+        lastbyte = nextpos(s, _data->size, lastbyte);
+
+    /* Find the indexes of the characters. */
+    if (istep > 0)
+    {
+        /* Get each subsequent byte position. */
+        for (c = istart, i = 0; i < nchar; c += istep, i++)
+        {
+            indexes[i] = lastbyte;
+
+            /* Add the character size to the result size. */
+            resultsize += nextpos(s, _data->size, lastbyte) - lastbyte;
+
+            for (d = c; d < c + istep; d++)
+                lastbyte = nextpos(s, _data->size, lastbyte);
+        }
+    }
+    else
+    {
+        /* Get each subsequent byte position. */
+        for (c = istart, i = 0; i < nchar; c += istep, i++)
+        {
+            indexes[i] = lastbyte;
+
+            /* Add the character size to the result size. */
+            resultsize += nextpos(s, _data->size, lastbyte) - lastbyte;
+
+            for (d = c; d > c + istep; d--)
+                lastbyte = prevpos(s, lastbyte);
+        }
+    }
+
+    /* Reserve space for a new string. */
+    sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char));
+
+    /* Does not null terminate but final byte should be zero. */
+    for (i = 0, to = 0; i < nchar; i++)
+    {
+        from = indexes[i];
+        stop = nextpos(s, _data->size, from); /* same extent as counted above */
+        while (from < stop)
+            sub[to++] = s[from++];
+    }
+
+    return __new_str(sub, resultsize);
+}
+
 /* Module initialisation. */
 
 void __main_native_unicode()
diff -r 74fa20c04135 -r e143933a630c templates/native/unicode.h
--- a/templates/native/unicode.h	Mon Dec 19 00:26:49 2016 +0100
+++ b/templates/native/unicode.h	Fri Jan 06 22:23:52 2017 +0100
@@ -22,6 +22,7 @@
 /* Unicode operations. */
 
 __attr __fn_native_unicode_unicode_len(__attr __args[]);
+__attr __fn_native_unicode_unicode_substr(__attr __args[]);
 
 /* Module initialisation. */
 
diff -r 74fa20c04135 -r e143933a630c tests/unicode.py
--- a/tests/unicode.py	Mon Dec 19 00:26:49 2016 +0100
+++ b/tests/unicode.py	Fri Jan 06 22:23:52 2017 +0100
@@ -85,3 +85,21 @@
 print u                             # Ã¦Ã¸Ã¥
 print su                            # ÆØÅæøå
 print us                            # æøåÆØÅ
+
+# Reset the output encoding to ISO-8859-1 for the tests below.
+
+sys.stdout.encoding = "ISO-8859-1"
+
+# Test character access: single items, negative indexes and slices.
+
+u0 = u[0]
+print u0.__class__                  # __builtins__.unicode.utf8string
+print u0.encoding                   # ISO-8859-1
+print u0                            # æ
+print u[-1]                         # å
+print len(u[0])                     # 1
+print len(u[-1])                    # 1
+print u[:2]                         # æø
+print len(u[:2])                    # 2
+print u[-1::-1]                     # åøæ
+print len(u[-1::-1])                # 3