# HG changeset patch
# User Paul Boddie
# Date 1483737832 -3600
# Node ID e143933a630c10f5645d6108e29a09dfb3dc1ae6
# Parent  74fa20c041351fe41117cd8f550b3bcbd302772e
Added character access to Unicode objects, moving special item access methods in basestring into the specific string class, providing separate versions in the utf8string class.

diff -r 74fa20c04135 -r e143933a630c lib/__builtins__/str.py
--- a/lib/__builtins__/str.py	Mon Dec 19 00:26:49 2016 +0100
+++ b/lib/__builtins__/str.py	Fri Jan 06 22:23:52 2017 +0100
@@ -262,6 +262,10 @@
     def strip(self, chars=None): pass
     def upper(self): pass
 
+class string(basestring):
+
+    "A plain string of bytes."
+
     # Special implementation methods.
 
     def __get_single_item__(self, index):
@@ -290,12 +294,6 @@
 
         return str_substr(self.__data__, start, end, step)
 
-class string(basestring):
-
-    "A plain string of bytes."
-
-    pass
-
 def str(obj):
 
     "Return the string representation of 'obj'."
diff -r 74fa20c04135 -r e143933a630c lib/__builtins__/unicode.py
--- a/lib/__builtins__/unicode.py	Mon Dec 19 00:26:49 2016 +0100
+++ b/lib/__builtins__/unicode.py	Fri Jan 06 22:23:52 2017 +0100
@@ -20,8 +20,10 @@
 """
 
 from __builtins__.str import basestring
+from __builtins__.types import check_int
 from posix.iconv import Converter
-from native import str_add, unicode_len, isinstance as _isinstance
+from native import str_add, unicode_len, unicode_substr, \
+                   isinstance as _isinstance
 
 class utf8string(basestring):
 
@@ -161,6 +163,34 @@
         s.encoding = encoding
         return s
 
+    # Special implementation methods.
+
+    def __get_single_item__(self, index):
+
+        "Return the item at the normalised (positive) 'index'."
+
+        self._check_index(index)
+        return utf8string(unicode_substr(self.__data__, index, index + 1, 1), self.encoding)
+
+    def __get_multiple_items__(self, start, end, step):
+
+        """
+        Return items from 'start' until (but excluding) 'end', at 'step'
+        intervals.
+        """
+
+        self._check_index(start)
+        self._check_end_index(end)
+        check_int(step)
+
+        if step == 0:
+            raise ValueError(step)
+
+        if start == end:
+            return ""
+
+        return utf8string(unicode_substr(self.__data__, start, end, step), self.encoding)
+
 def unicode(s, encoding):
 
     "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'."
diff -r 74fa20c04135 -r e143933a630c lib/native/__init__.py
--- a/lib/native/__init__.py	Mon Dec 19 00:26:49 2016 +0100
+++ b/lib/native/__init__.py	Fri Jan 06 22:23:52 2017 +0100
@@ -47,6 +47,6 @@
 
 from native.system import exit, get_argv, get_path
 
-from native.unicode import unicode_len
+from native.unicode import unicode_len, unicode_substr
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 74fa20c04135 -r e143933a630c lib/native/unicode.py
--- a/lib/native/unicode.py	Mon Dec 19 00:26:49 2016 +0100
+++ b/lib/native/unicode.py	Fri Jan 06 22:23:52 2017 +0100
@@ -27,5 +27,6 @@
 # Unicode string operations.
 
 def unicode_len(data): pass
+def unicode_substr(data, start, end, step): pass
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 74fa20c04135 -r e143933a630c templates/native/unicode.c
--- a/templates/native/unicode.c	Mon Dec 19 00:26:49 2016 +0100
+++ b/templates/native/unicode.c	Fri Jan 06 22:23:52 2017 +0100
@@ -16,7 +16,6 @@
 this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-#include <string.h> /* strcmp, memcpy */
 #include "native/common.h"
 #include "types.h"
 #include "exceptions.h"
@@ -26,6 +25,39 @@
 #include "progtypes.h"
 #include "main.h"
 
+static inline int boundary(char c)
+{
+    return ((c & 0xc0) == 0xc0) || !(c & 0x80);
+}
+
+static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart)
+{
+    unsigned int i = bytestart;
+
+    while (i < size)
+    {
+        i++;
+        if (boundary(s[i]))
+            break;
+    }
+
+    return i;
+}
+
+static unsigned int prevpos(char *s, unsigned int bytestart)
+{
+    unsigned int i = bytestart;
+
+    while (i > 0)
+    {
+        i--;
+        if (boundary(s[i]))
+            break;
+    }
+
+    return i;
+}
+
 /* Unicode operations. */
 
 __attr __fn_native_unicode_unicode_len(__attr __args[])
@@ -33,16 +65,92 @@
     __attr * const _data = &__args[1];
     /* _data interpreted as string */
     char *s = _data->strvalue;
-    int i, c = 0;
+    unsigned int i, c = 0;
 
     for (i = 0; i < _data->size; i++)
-        if (((s[i] & 0xc0) == 0xc0) || !(s[i] & 0x80))
+        if (boundary(s[i]))
             c++;
 
     /* Return the new integer. */
     return __new_int(c);
 }
 
+__attr __fn_native_unicode_unicode_substr(__attr __args[])
+{
+    __attr * const _data = &__args[1];
+    __attr * const start = &__args[2];
+    __attr * const end = &__args[3];
+    __attr * const step = &__args[4];
+    /* _data interpreted as string */
+    char *s = _data->strvalue, *sub;
+    /* start.__data__ interpreted as int */
+    int istart = __load_via_object(start->value, __pos___data__).intvalue;
+    /* end.__data__ interpreted as int */
+    int iend = __load_via_object(end->value, __pos___data__).intvalue;
+    /* step.__data__ interpreted as int */
+    int istep = __load_via_object(step->value, __pos___data__).intvalue;
+
+    /* Calculate the number of characters. */
+    size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1;
+    unsigned int indexes[nchar];
+
+    unsigned int c, d, i, to, from, lastbyte = 0;
+    size_t resultsize = 0;
+
+    /* Find the indexes of the characters. */
+    if (istep > 0)
+    {
+        /* Get the first byte position. */
+        for (c = 0; c < istart; c++)
+            lastbyte = nextpos(s, _data->size, lastbyte);
+
+        /* Get each subsequent byte position. */
+        for (c = istart, i = 0; i < nchar; c += istep, i++)
+        {
+            indexes[i] = lastbyte;
+
+            /* Add the character size to the result size. */
+            resultsize += nextpos(s, _data->size, lastbyte) - lastbyte;
+
+            for (d = c; d < c + istep; d++)
+                lastbyte = nextpos(s, _data->size, lastbyte);
+        }
+    }
+    else
+    {
+        /* Get the first byte position. */
+        for (c = 0; c < istart; c++)
+            lastbyte = nextpos(s, _data->size, lastbyte);
+
+        /* Get each subsequent byte position. */
+        for (c = istart, i = 0; i < nchar; c += istep, i++)
+        {
+            indexes[i] = lastbyte;
+
+            /* Add the character size to the result size. */
+            resultsize += nextpos(s, _data->size, lastbyte) - lastbyte;
+
+            for (d = c; d > c + istep; d--)
+                lastbyte = prevpos(s, lastbyte);
+        }
+    }
+
+    /* Reserve space for a new string. */
+    sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char));
+
+    /* Does not null terminate but final byte should be zero. */
+    for (i = 0, to = 0; i < nchar; i++)
+    {
+        from = indexes[i];
+        do
+        {
+            sub[to++] = s[from++];
+        } while (!boundary(s[from]));
+    }
+
+    return __new_str(sub, resultsize);
+}
+
 /* Module initialisation. */
 
 void __main_native_unicode()
diff -r 74fa20c04135 -r e143933a630c templates/native/unicode.h
--- a/templates/native/unicode.h	Mon Dec 19 00:26:49 2016 +0100
+++ b/templates/native/unicode.h	Fri Jan 06 22:23:52 2017 +0100
@@ -22,6 +22,7 @@
 /* Unicode operations. */
 
 __attr __fn_native_unicode_unicode_len(__attr __args[]);
+__attr __fn_native_unicode_unicode_substr(__attr __args[]);
 
 /* Module initialisation. */
 
diff -r 74fa20c04135 -r e143933a630c tests/unicode.py
--- a/tests/unicode.py	Mon Dec 19 00:26:49 2016 +0100
+++ b/tests/unicode.py	Fri Jan 06 22:23:52 2017 +0100
@@ -85,3 +85,21 @@
 print u                             # æøå
 print su                            # ÆØÅæøå
 print us                            # æøåÆØÅ
+
+# Reset the encoding.
+
+sys.stdout.encoding = "ISO-8859-1"
+
+# Test character access.
+
+u0 = u[0]
+print u0.__class__                  # __builtins__.unicode.utf8string
+print u0.encoding                   # ISO-8859-1
+print u0                            # æ
+print u[-1]                         # å
+print len(u[0])                     # 1
+print len(u[-1])                    # 1
+print u[:2]                         # æø
+print len(u[:2])                    # 2
+print u[-1::-1]                     # åøæ
+print len(u[-1::-1])                # 3