1.1 --- a/lib/__builtins__/str.py Mon Dec 19 00:26:49 2016 +0100
1.2 +++ b/lib/__builtins__/str.py Fri Jan 06 22:23:52 2017 +0100
1.3 @@ -262,6 +262,10 @@
1.4 def strip(self, chars=None): pass
1.5 def upper(self): pass
1.6
1.7 +class string(basestring):
1.8 +
1.9 + "A plain string of bytes."
1.10 +
1.11 # Special implementation methods.
1.12
1.13 def __get_single_item__(self, index):
1.14 @@ -290,12 +294,6 @@
1.15
1.16 return str_substr(self.__data__, start, end, step)
1.17
1.18 -class string(basestring):
1.19 -
1.20 - "A plain string of bytes."
1.21 -
1.22 - pass
1.23 -
1.24 def str(obj):
1.25
1.26 "Return the string representation of 'obj'."
2.1 --- a/lib/__builtins__/unicode.py Mon Dec 19 00:26:49 2016 +0100
2.2 +++ b/lib/__builtins__/unicode.py Fri Jan 06 22:23:52 2017 +0100
2.3 @@ -20,8 +20,10 @@
2.4 """
2.5
2.6 from __builtins__.str import basestring
2.7 +from __builtins__.types import check_int
2.8 from posix.iconv import Converter
2.9 -from native import str_add, unicode_len, isinstance as _isinstance
2.10 +from native import str_add, unicode_len, unicode_substr, \
2.11 + isinstance as _isinstance
2.12
2.13 class utf8string(basestring):
2.14
2.15 @@ -161,6 +163,34 @@
2.16 s.encoding = encoding
2.17 return s
2.18
2.19 + # Special implementation methods.
2.20 +
def __get_single_item__(self, index):

    """
    Return the single character found at the normalised (positive) 'index'
    as a new utf8string sharing this string's encoding.
    """

    self._check_index(index)

    # A one-character selection: from 'index' up to the next position.

    following = index + 1
    character = unicode_substr(self.__data__, index, following, 1)
    return utf8string(character, self.encoding)
2.27 +
def __get_multiple_items__(self, start, end, step):

    """
    Return items from 'start' until (but excluding) 'end', at 'step'
    intervals, as a new utf8string sharing this string's encoding.

    Raise ValueError if 'step' is zero.
    """

    self._check_index(start)
    self._check_end_index(end)
    check_int(step)

    if step == 0:
        raise ValueError(step)

    # A selection with no characters, including a range that runs against
    # the direction of 'step', yields an empty string of the same type and
    # encoding as any other slice. It is answered here so that the native
    # routine is never asked to compute a non-positive character count.

    if step > 0 and start >= end or step < 0 and start <= end:
        return utf8string("", self.encoding)

    return utf8string(unicode_substr(self.__data__, start, end, step), self.encoding)
2.46 +
2.47 def unicode(s, encoding):
2.48
2.49 "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'."
3.1 --- a/lib/native/__init__.py Mon Dec 19 00:26:49 2016 +0100
3.2 +++ b/lib/native/__init__.py Fri Jan 06 22:23:52 2017 +0100
3.3 @@ -47,6 +47,6 @@
3.4
3.5 from native.system import exit, get_argv, get_path
3.6
3.7 -from native.unicode import unicode_len
3.8 +from native.unicode import unicode_len, unicode_substr
3.9
3.10 # vim: tabstop=4 expandtab shiftwidth=4
4.1 --- a/lib/native/unicode.py Mon Dec 19 00:26:49 2016 +0100
4.2 +++ b/lib/native/unicode.py Fri Jan 06 22:23:52 2017 +0100
4.3 @@ -27,5 +27,6 @@
4.4 # Unicode string operations.
4.5
4.6 def unicode_len(data): pass
def unicode_substr(data, start, end, step):

    # Signature-only placeholder with an intentionally empty body.
    # NOTE(review): presumably the behaviour is supplied by the C routine
    # __fn_native_unicode_unicode_substr, which reads the same four
    # arguments - confirm against the build templates.

    pass
4.8
4.9 # vim: tabstop=4 expandtab shiftwidth=4
5.1 --- a/templates/native/unicode.c Mon Dec 19 00:26:49 2016 +0100
5.2 +++ b/templates/native/unicode.c Fri Jan 06 22:23:52 2017 +0100
5.3 @@ -16,7 +16,6 @@
5.4 this program. If not, see <http://www.gnu.org/licenses/>.
5.5 */
5.6
5.7 -#include <string.h> /* strcmp, memcpy */
5.8 #include "native/common.h"
5.9 #include "types.h"
5.10 #include "exceptions.h"
5.11 @@ -26,6 +25,39 @@
5.12 #include "progtypes.h"
5.13 #include "main.h"
5.14
/* Return 1 if 'c' can start a UTF-8 character, 0 otherwise.

   A byte starts a character when its top bit is clear (single-byte value)
   or when its top two bits are both set (lead byte of a multi-byte
   sequence). The only remaining pattern, 10xxxxxx, is a continuation byte. */

static inline int boundary(char c)
{
    /* Everything except the 10xxxxxx pattern is a boundary. */
    return (c & 0xc0) != 0x80;
}
5.19 +
/* Return the byte position of the character boundary following 'bytestart'
   in the 'size' bytes at 's', or 'size' if no further boundary exists.
   A 'bytestart' at or beyond 'size' is returned unchanged.

   Only bytes below 'size' are examined, so the result does not depend on
   whatever byte happens to follow the string data. */

static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart)
{
    unsigned int i = bytestart;

    if (i >= size)
        return i;

    /* Always step past the current byte, then skip continuation bytes. */
    do
        i++;
    while (i < size && !boundary(s[i]));

    return i;
}
5.33 +
/* Return the byte position of the nearest character boundary before
   'bytestart' in 's', or 0 if the start of the data is reached first.
   A 'bytestart' of 0 is returned unchanged. */

static unsigned int prevpos(char *s, unsigned int bytestart)
{
    unsigned int pos;

    /* Step back one byte at a time until a boundary byte is found. */
    for (pos = bytestart; pos > 0; )
        if (boundary(s[--pos]))
            break;

    return pos;
}
5.47 +
5.48 /* Unicode operations. */
5.49
5.50 __attr __fn_native_unicode_unicode_len(__attr __args[])
5.51 @@ -33,16 +65,92 @@
5.52 __attr * const _data = &__args[1];
5.53 /* _data interpreted as string */
5.54 char *s = _data->strvalue;
5.55 - int i, c = 0;
5.56 + unsigned int i, c = 0;
5.57
5.58 for (i = 0; i < _data->size; i++)
5.59 - if (((s[i] & 0xc0) == 0xc0) || !(s[i] & 0x80))
5.60 + if (boundary(s[i]))
5.61 c++;
5.62
5.63 /* Return the new integer. */
5.64 return __new_int(c);
5.65 }
5.66
5.67 +__attr __fn_native_unicode_unicode_substr(__attr __args[])
5.68 +{
5.69 + __attr * const _data = &__args[1];
5.70 + __attr * const start = &__args[2];
5.71 + __attr * const end = &__args[3];
5.72 + __attr * const step = &__args[4];
5.73 + /* _data interpreted as string */
5.74 + char *s = _data->strvalue, *sub;
5.75 + /* start.__data__ interpreted as int */
5.76 + int istart = __load_via_object(start->value, __pos___data__).intvalue;
5.77 + /* end.__data__ interpreted as int */
5.78 + int iend = __load_via_object(end->value, __pos___data__).intvalue;
5.79 + /* step.__data__ interpreted as int */
5.80 + int istep = __load_via_object(step->value, __pos___data__).intvalue;
5.81 +
5.82 + /* Calculate the number of characters. */
5.83 + size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1;
5.84 + unsigned int indexes[nchar];
5.85 +
5.86 + unsigned int c, d, i, to, from, lastbyte = 0;
5.87 + size_t resultsize = 0;
5.88 +
5.89 + /* Find the indexes of the characters. */
5.90 + if (istep > 0)
5.91 + {
5.92 + /* Get the first byte position. */
5.93 + for (c = 0; c < istart; c++)
5.94 + lastbyte = nextpos(s, _data->size, lastbyte);
5.95 +
5.96 + /* Get each subsequent byte position. */
5.97 + for (c = istart, i = 0; i < nchar; c += istep, i++)
5.98 + {
5.99 + indexes[i] = lastbyte;
5.100 +
5.101 + /* Add the character size to the result size. */
5.102 + resultsize += nextpos(s, _data->size, lastbyte) - lastbyte;
5.103 +
5.104 + for (d = c; d < c + istep; d++)
5.105 + lastbyte = nextpos(s, _data->size, lastbyte);
5.106 + }
5.107 + }
5.108 + else
5.109 + {
5.110 + /* Get the first byte position. */
5.111 + for (c = 0; c < istart; c++)
5.112 + lastbyte = nextpos(s, _data->size, lastbyte);
5.113 +
5.114 + /* Get each subsequent byte position. */
5.115 + for (c = istart, i = 0; i < nchar; c += istep, i++)
5.116 + {
5.117 + indexes[i] = lastbyte;
5.118 +
5.119 + /* Add the character size to the result size. */
5.120 + resultsize += nextpos(s, _data->size, lastbyte) - lastbyte;
5.121 +
5.122 + for (d = c; d > c + istep; d--)
5.123 + lastbyte = prevpos(s, lastbyte);
5.124 + }
5.125 + }
5.126 +
5.127 + /* Reserve space for a new string. */
5.128 + sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char));
5.129 +
5.130 + /* Does not null terminate but final byte should be zero. */
5.131 + for (i = 0, to = 0; i < nchar; i++)
5.132 + {
5.133 + from = indexes[i];
5.134 + do
5.135 + {
5.136 + sub[to++] = s[from++];
5.137 + } while (!boundary(s[from]));
5.138 + }
5.139 +
5.140 + return __new_str(sub, resultsize);
5.141 +}
5.142 +
5.143 /* Module initialisation. */
5.144
5.145 void __main_native_unicode()
6.1 --- a/templates/native/unicode.h Mon Dec 19 00:26:49 2016 +0100
6.2 +++ b/templates/native/unicode.h Fri Jan 06 22:23:52 2017 +0100
6.3 @@ -22,6 +22,7 @@
6.4 /* Unicode operations. */
6.5
6.6 __attr __fn_native_unicode_unicode_len(__attr __args[]);
6.7 +__attr __fn_native_unicode_unicode_substr(__attr __args[]);
6.8
6.9 /* Module initialisation. */
6.10
7.1 --- a/tests/unicode.py Mon Dec 19 00:26:49 2016 +0100
7.2 +++ b/tests/unicode.py Fri Jan 06 22:23:52 2017 +0100
7.3 @@ -85,3 +85,21 @@
7.4 print u # æøå
7.5 print su # ÆØÅæøå
7.6 print us # æøåÆØÅ
7.7 +
7.8 +# Reset the encoding.
7.9 +
7.10 +sys.stdout.encoding = "ISO-8859-1"
7.11 +
7.12 +# Test character access.
7.13 +
7.14 +u0 = u[0]
7.15 +print u0.__class__ # __builtins__.unicode.utf8string
7.16 +print u0.encoding # ISO-8859-1
7.17 +print u0 # æ
7.18 +print u[-1] # å
7.19 +print len(u[0]) # 1
7.20 +print len(u[-1]) # 1
7.21 +print u[:2] # æø
7.22 +print len(u[:2]) # 2
7.23 +print u[-1::-1] # åøæ
7.24 +print len(u[-1::-1]) # 3