Added character access to Unicode objects, moving special item access methods in basestring into the specific string class, providing separate versions in the utf8string class.

     1.1 --- a/lib/__builtins__/str.py	Mon Dec 19 00:26:49 2016 +0100
     1.2 +++ b/lib/__builtins__/str.py	Fri Jan 06 22:23:52 2017 +0100
     1.3 @@ -262,6 +262,10 @@
     1.4      def strip(self, chars=None): pass
     1.5      def upper(self): pass
     1.6  
     1.7 +class string(basestring):
     1.8 +
     1.9 +    "A plain string of bytes."
    1.10 +
    1.11      # Special implementation methods.
    1.12  
    1.13      def __get_single_item__(self, index):
    1.14 @@ -290,12 +294,6 @@
    1.15  
    1.16          return str_substr(self.__data__, start, end, step)
    1.17  
    1.18 -class string(basestring):
    1.19 -
    1.20 -    "A plain string of bytes."
    1.21 -
    1.22 -    pass
    1.23 -
    1.24  def str(obj):
    1.25  
    1.26      "Return the string representation of 'obj'."

     2.1 --- a/lib/__builtins__/unicode.py	Mon Dec 19 00:26:49 2016 +0100
     2.2 +++ b/lib/__builtins__/unicode.py	Fri Jan 06 22:23:52 2017 +0100
     2.3 @@ -20,8 +20,10 @@
     2.4  """
     2.5  
     2.6  from __builtins__.str import basestring
     2.7 +from __builtins__.types import check_int
     2.8  from posix.iconv import Converter
     2.9 -from native import str_add, unicode_len, isinstance as _isinstance
    2.10 +from native import str_add, unicode_len, unicode_substr, \
    2.11 +                   isinstance as _isinstance
    2.12  
    2.13  class utf8string(basestring):
    2.14  
    2.15 @@ -161,6 +163,34 @@
    2.16              s.encoding = encoding
    2.17          return s
    2.18  
    2.19 +    # Special implementation methods.
    2.20 +
    2.21 +    def __get_single_item__(self, index):
    2.22 +    
    2.23 +        "Return the item at the normalised (positive) 'index'."
    2.24 +    
    2.25 +        self._check_index(index)
    2.26 +        return utf8string(unicode_substr(self.__data__, index, index + 1, 1), self.encoding)
    2.27 +
    2.28 +    def __get_multiple_items__(self, start, end, step):
    2.29 +
    2.30 +        """
    2.31 +        Return items from 'start' until (but excluding) 'end', at 'step'
    2.32 +        intervals.
    2.33 +        """
    2.34 +
    2.35 +        self._check_index(start)
    2.36 +        self._check_end_index(end)
    2.37 +        check_int(step)
    2.38 +
    2.39 +        if step == 0:
    2.40 +            raise ValueError(step)
    2.41 +
    2.42 +        if start == end:
    2.43 +            return ""
    2.44 +
    2.45 +        return utf8string(unicode_substr(self.__data__, start, end, step), self.encoding)
    2.46 +
    2.47  def unicode(s, encoding):
    2.48  
    2.49      "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'."

     3.1 --- a/lib/native/__init__.py	Mon Dec 19 00:26:49 2016 +0100
     3.2 +++ b/lib/native/__init__.py	Fri Jan 06 22:23:52 2017 +0100
     3.3 @@ -47,6 +47,6 @@
     3.4  
     3.5  from native.system import exit, get_argv, get_path
     3.6  
     3.7 -from native.unicode import unicode_len
     3.8 +from native.unicode import unicode_len, unicode_substr
     3.9  
    3.10  # vim: tabstop=4 expandtab shiftwidth=4

     4.1 --- a/lib/native/unicode.py	Mon Dec 19 00:26:49 2016 +0100
     4.2 +++ b/lib/native/unicode.py	Fri Jan 06 22:23:52 2017 +0100
     4.3 @@ -27,5 +27,6 @@
     4.4  # Unicode string operations.
     4.5  
     4.6  def unicode_len(data): pass
     4.7 +def unicode_substr(data, start, end, step): pass
     4.8  
     4.9  # vim: tabstop=4 expandtab shiftwidth=4

     5.1 --- a/templates/native/unicode.c	Mon Dec 19 00:26:49 2016 +0100
     5.2 +++ b/templates/native/unicode.c	Fri Jan 06 22:23:52 2017 +0100
     5.3 @@ -16,7 +16,6 @@
     5.4  this program.  If not, see <http://www.gnu.org/licenses/>.
     5.5  */
     5.6  
     5.7 -#include <string.h> /* strcmp, memcpy */
     5.8  #include "native/common.h"
     5.9  #include "types.h"
    5.10  #include "exceptions.h"
    5.11 @@ -26,6 +25,39 @@
    5.12  #include "progtypes.h"
    5.13  #include "main.h"
    5.14  
    5.15 +static inline int boundary(char c)
    5.16 +{
    5.17 +    return ((c & 0xc0) == 0xc0) || !(c & 0x80);
    5.18 +}
    5.19 +
    5.20 +static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart)
    5.21 +{
    5.22 +    unsigned int i = bytestart;
    5.23 +
    5.24 +    while (i < size)
    5.25 +    {
    5.26 +        i++;
    5.27 +        if (boundary(s[i]))
    5.28 +            break;
    5.29 +    }
    5.30 +
    5.31 +    return i;
    5.32 +}
    5.33 +
    5.34 +static unsigned int prevpos(char *s, unsigned int bytestart)
    5.35 +{
    5.36 +    unsigned int i = bytestart;
    5.37 +
    5.38 +    while (i > 0)
    5.39 +    {
    5.40 +        i--;
    5.41 +        if (boundary(s[i]))
    5.42 +            break;
    5.43 +    }
    5.44 +
    5.45 +    return i;
    5.46 +}
    5.47 +
    5.48  /* Unicode operations. */
    5.49  
    5.50  __attr __fn_native_unicode_unicode_len(__attr __args[])
    5.51 @@ -33,16 +65,92 @@
    5.52      __attr * const _data = &__args[1];
    5.53      /* _data interpreted as string */
    5.54      char *s = _data->strvalue;
    5.55 -    int i, c = 0;
    5.56 +    unsigned int i, c = 0;
    5.57  
    5.58      for (i = 0; i < _data->size; i++)
    5.59 -        if (((s[i] & 0xc0) == 0xc0) || !(s[i] & 0x80))
    5.60 +        if (boundary(s[i]))
    5.61              c++;
    5.62  
    5.63      /* Return the new integer. */
    5.64      return __new_int(c);
    5.65  }
    5.66  
    5.67 +__attr __fn_native_unicode_unicode_substr(__attr __args[])
    5.68 +{
    5.69 +    __attr * const _data = &__args[1];
    5.70 +    __attr * const start = &__args[2];
    5.71 +    __attr * const end = &__args[3];
    5.72 +    __attr * const step = &__args[4];
    5.73 +    /* _data interpreted as string */
    5.74 +    char *s = _data->strvalue, *sub;
    5.75 +    /* start.__data__ interpreted as int */
    5.76 +    int istart = __load_via_object(start->value, __pos___data__).intvalue;
    5.77 +    /* end.__data__ interpreted as int */
    5.78 +    int iend = __load_via_object(end->value, __pos___data__).intvalue;
    5.79 +    /* step.__data__ interpreted as int */
    5.80 +    int istep = __load_via_object(step->value, __pos___data__).intvalue;
    5.81 +
    5.82 +    /* Calculate the number of characters. */
    5.83 +    size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1;
    5.84 +    unsigned int indexes[nchar];
    5.85 +
    5.86 +    unsigned int c, d, i, to, from, lastbyte = 0;
    5.87 +    size_t resultsize = 0;
    5.88 +
    5.89 +    /* Find the indexes of the characters. */
    5.90 +    if (istep > 0)
    5.91 +    {
    5.92 +        /* Get the first byte position. */
    5.93 +        for (c = 0; c < istart; c++)
    5.94 +            lastbyte = nextpos(s, _data->size, lastbyte);
    5.95 +
    5.96 +        /* Get each subsequent byte position. */
    5.97 +        for (c = istart, i = 0; i < nchar; c += istep, i++)
    5.98 +        {
    5.99 +            indexes[i] = lastbyte;
   5.100 +
   5.101 +            /* Add the character size to the result size. */
   5.102 +            resultsize += nextpos(s, _data->size, lastbyte) - lastbyte;
   5.103 +
   5.104 +            for (d = c; d < c + istep; d++)
   5.105 +                lastbyte = nextpos(s, _data->size, lastbyte);
   5.106 +        }
   5.107 +    }
   5.108 +    else
   5.109 +    {
   5.110 +        /* Get the first byte position. */
   5.111 +        for (c = 0; c < istart; c++)
   5.112 +            lastbyte = nextpos(s, _data->size, lastbyte);
   5.113 +
   5.114 +        /* Get each subsequent byte position. */
   5.115 +        for (c = istart, i = 0; i < nchar; c += istep, i++)
   5.116 +        {
   5.117 +            indexes[i] = lastbyte;
   5.118 +
   5.119 +            /* Add the character size to the result size. */
   5.120 +            resultsize += nextpos(s, _data->size, lastbyte) - lastbyte;
   5.121 +
   5.122 +            for (d = c; d > c + istep; d--)
   5.123 +                lastbyte = prevpos(s, lastbyte);
   5.124 +        }
   5.125 +    }
   5.126 +
   5.127 +    /* Reserve space for a new string. */
   5.128 +    sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char));
   5.129 +
   5.130 +    /* Does not null terminate but final byte should be zero. */
   5.131 +    for (i = 0, to = 0; i < nchar; i++)
   5.132 +    {
   5.133 +        from = indexes[i];
   5.134 +        do
   5.135 +        {
   5.136 +            sub[to++] = s[from++];
   5.137 +        } while (!boundary(s[from]));
   5.138 +    }
   5.139 +
   5.140 +    return __new_str(sub, resultsize);
   5.141 +}
   5.142 +
   5.143  /* Module initialisation. */
   5.144  
   5.145  void __main_native_unicode()

     6.1 --- a/templates/native/unicode.h	Mon Dec 19 00:26:49 2016 +0100
     6.2 +++ b/templates/native/unicode.h	Fri Jan 06 22:23:52 2017 +0100
     6.3 @@ -22,6 +22,7 @@
     6.4  /* Unicode operations. */
     6.5  
     6.6  __attr __fn_native_unicode_unicode_len(__attr __args[]);
     6.7 +__attr __fn_native_unicode_unicode_substr(__attr __args[]);
     6.8  
     6.9  /* Module initialisation. */
    6.10  

     7.1 --- a/tests/unicode.py	Mon Dec 19 00:26:49 2016 +0100
     7.2 +++ b/tests/unicode.py	Fri Jan 06 22:23:52 2017 +0100
     7.3 @@ -85,3 +85,21 @@
     7.4  print u                             # æøå
     7.5  print su                            # ������
     7.6  print us                            # ������
     7.7 +
     7.8 +# Reset the encoding.
     7.9 +
    7.10 +sys.stdout.encoding = "ISO-8859-1"
    7.11 +
    7.12 +# Test character access.
    7.13 +
    7.14 +u0 = u[0]
    7.15 +print u0.__class__                  # __builtins__.unicode.utf8string
    7.16 +print u0.encoding                   # ISO-8859-1
    7.17 +print u0                            # �
    7.18 +print u[-1]                         # �
    7.19 +print len(u[0])                     # 1
    7.20 +print len(u[-1])                    # 1
    7.21 +print u[:2]                         # ��
    7.22 +print len(u[:2])                    # 2
    7.23 +print u[-1::-1]                     # ���
    7.24 +print len(u[-1::-1])                # 3
2017-01-06	Paul Boddie	raw files shortlog changelog graph	Added character access to Unicode objects, moving special item access methods in basestring into the specific string class, providing separate versions in the utf8string class.
			lib/__builtins__/str.py (file) lib/__builtins__/unicode.py (file) lib/native/__init__.py (file) lib/native/unicode.py (file) templates/native/unicode.c (file) templates/native/unicode.h (file) tests/unicode.py (file)