1.1 --- a/lib/__builtins__/str.py Wed Dec 14 00:00:21 2016 +0100
1.2 +++ b/lib/__builtins__/str.py Wed Dec 14 00:03:08 2016 +0100
1.3 @@ -155,12 +155,14 @@
1.4
1.5 return _negate(self.__eq__(other))
1.6
1.7 - def __len__(self):
1.8 + def bytelength(self):
1.9
1.10 - "Return the length of this string."
1.11 + "Return the number of bytes in this string."
1.12
1.13 return str_len(self.__data__)
1.14
1.15 + __len__ = bytelength # keep len() available as an alias of bytelength
1.16 +
1.17 def __str__(self):
1.18
1.19 "Return a string representation."
2.1 --- a/lib/__builtins__/unicode.py Wed Dec 14 00:00:21 2016 +0100
2.2 +++ b/lib/__builtins__/unicode.py Wed Dec 14 00:03:08 2016 +0100
2.3 @@ -21,7 +21,7 @@
2.4
2.5 from __builtins__.str import basestring
2.6 from posix.iconv import Converter
2.7 -from native import str_add, isinstance as _isinstance
2.8 +from native import str_add, unicode_len, isinstance as _isinstance
2.9
2.10 class utf8string(basestring):
2.11
2.12 @@ -36,6 +36,7 @@
2.13
2.14 get_using(basestring.__init__, self)(other)
2.15 self.encoding = encoding
2.16 + self.length = None
2.17
2.18 def _binary_op(self, op, other):
2.19
2.20 @@ -100,6 +101,15 @@
2.21
2.22 return self._convert(self._binary_op_rev(str_add, other), other)
2.23
2.24 + def __len__(self):
2.25 +
2.26 + "Return the length of this string in characters, caching the count."
2.27 +
2.28 + if self.length is None: # not counted yet: count once and remember
2.29 + self.length = unicode_len(self.__data__)
2.30 +
2.31 + return self.length
2.32 +
2.33 def encode(self, encoding=None):
2.34
2.35 """
3.1 --- a/lib/native/__init__.py Wed Dec 14 00:00:21 2016 +0100
3.2 +++ b/lib/native/__init__.py Wed Dec 14 00:03:08 2016 +0100
3.3 @@ -47,4 +47,6 @@
3.4
3.5 from native.system import exit, get_argv, get_path
3.6
3.7 +from native.unicode import unicode_len
3.8 +
3.9 # vim: tabstop=4 expandtab shiftwidth=4
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/lib/native/unicode.py Wed Dec 14 00:03:08 2016 +0100
4.3 @@ -0,0 +1,31 @@
4.4 +#!/usr/bin/env python
4.5 +
4.6 +"""
4.7 +Native library functions for Unicode objects.
4.8 +
4.9 +None of these are actually defined here. Instead, native implementations are
4.10 +substituted when each program is built. It is, however, important to declare
4.11 +non-core exceptions used by the native functions because they need to be
4.12 +identified as being needed by the program.
4.13 +
4.14 +Copyright (C) 2016 Paul Boddie <paul@boddie.org.uk>
4.15 +
4.16 +This program is free software; you can redistribute it and/or modify it under
4.17 +the terms of the GNU General Public License as published by the Free Software
4.18 +Foundation; either version 3 of the License, or (at your option) any later
4.19 +version.
4.20 +
4.21 +This program is distributed in the hope that it will be useful, but WITHOUT
4.22 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
4.23 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
4.24 +details.
4.25 +
4.26 +You should have received a copy of the GNU General Public License along with
4.27 +this program. If not, see <http://www.gnu.org/licenses/>.
4.28 +"""
4.29 +
4.30 +# Unicode string operations (stubs: native implementations are substituted at build time).
4.31 +
4.32 +def unicode_len(data): pass
4.33 +
4.34 +# vim: tabstop=4 expandtab shiftwidth=4
5.1 --- a/lib/posix/iconv.py Wed Dec 14 00:00:21 2016 +0100
5.2 +++ b/lib/posix/iconv.py Wed Dec 14 00:03:08 2016 +0100
5.3 @@ -73,9 +73,9 @@
5.4 _s, start, remaining = self.state
5.5
5.6 if _s:
5.7 - self.state = [_s + s, start, remaining + len(s)]
5.8 + self.state = [_s + s, start, remaining + s.bytelength()]
5.9 else:
5.10 - self.state = [s, 0, len(s)]
5.11 + self.state = [s, 0, s.bytelength()]
5.12
5.13 while True:
5.14
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
6.2 +++ b/templates/native/unicode.c Wed Dec 14 00:03:08 2016 +0100
6.3 @@ -0,0 +1,50 @@
6.4 +/* Native functions for Unicode operations.
6.5 +
6.6 +Copyright (C) 2016 Paul Boddie <paul@boddie.org.uk>
6.7 +
6.8 +This program is free software; you can redistribute it and/or modify it under
6.9 +the terms of the GNU General Public License as published by the Free Software
6.10 +Foundation; either version 3 of the License, or (at your option) any later
6.11 +version.
6.12 +
6.13 +This program is distributed in the hope that it will be useful, but WITHOUT
6.14 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
6.15 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
6.16 +details.
6.17 +
6.18 +You should have received a copy of the GNU General Public License along with
6.19 +this program. If not, see <http://www.gnu.org/licenses/>.
6.20 +*/
6.21 +
6.22 +#include <string.h> /* strcmp, memcpy */
6.23 +#include "native/common.h"
6.24 +#include "types.h"
6.25 +#include "exceptions.h"
6.26 +#include "ops.h"
6.27 +#include "progconsts.h"
6.28 +#include "progops.h"
6.29 +#include "progtypes.h"
6.30 +#include "main.h"
6.31 +
6.32 +/* Unicode operations: unicode_len counts the characters in UTF-8 string data. */
6.33 +
6.34 +__attr __fn_native_unicode_unicode_len(__attr __args[])
6.35 +{
6.36 + __attr * const _data = &__args[1];
6.37 + /* _data interpreted as string: strvalue is scanned for size bytes */
6.38 + char *s = _data->strvalue;
6.39 + int i, c = 0;
6.40 +
6.41 + for (i = 0; i < _data->size; i++)
6.42 + if (((s[i] & 0xc0) == 0xc0) || !(s[i] & 0x80)) /* count lead (11xxxxxx) and ASCII (0xxxxxxx) bytes, not 10xxxxxx */
6.43 + c++;
6.44 +
6.45 + /* Return the character count as a new integer. */
6.46 + return __new_int(c);
6.47 +}
6.48 +
6.49 +/* Module initialisation: this module has nothing to set up, so the body is empty. */
6.50 +
6.51 +void __main_native_unicode()
6.52 +{
6.53 +}
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
7.2 +++ b/templates/native/unicode.h Wed Dec 14 00:03:08 2016 +0100
7.3 @@ -0,0 +1,30 @@
7.4 +/* Native functions for Unicode operations.
7.5 +
7.6 +Copyright (C) 2016 Paul Boddie <paul@boddie.org.uk>
7.7 +
7.8 +This program is free software; you can redistribute it and/or modify it under
7.9 +the terms of the GNU General Public License as published by the Free Software
7.10 +Foundation; either version 3 of the License, or (at your option) any later
7.11 +version.
7.12 +
7.13 +This program is distributed in the hope that it will be useful, but WITHOUT
7.14 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
7.15 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
7.16 +details.
7.17 +
7.18 +You should have received a copy of the GNU General Public License along with
7.19 +this program. If not, see <http://www.gnu.org/licenses/>.
7.20 +*/
7.21 +
7.22 +#ifndef __NATIVE_UNICODE_H__
7.23 +#define __NATIVE_UNICODE_H__
7.24 +
7.25 +/* Unicode operations: takes the program's argument array and returns an __attr result. */
7.26 +
7.27 +__attr __fn_native_unicode_unicode_len(__attr __args[]);
7.28 +
7.29 +/* Module initialisation entry point for the native unicode module. */
7.30 +
7.31 +void __main_native_unicode();
7.32 +
7.33 +#endif /* __NATIVE_UNICODE_H__ */
8.1 --- a/tests/unicode.py Wed Dec 14 00:00:21 2016 +0100
8.2 +++ b/tests/unicode.py Wed Dec 14 00:03:08 2016 +0100
8.3 @@ -6,6 +6,7 @@
8.4
8.5 s = b"ÆØÅ"
8.6 print s # ÆØÅ
8.7 +print len(s) # 3
8.8
8.9 # Obtain text and print it.
8.10
8.11 @@ -13,22 +14,28 @@
8.12
8.13 u = unicode("æøå", "ISO-8859-1")
8.14 print u # æøå
8.15 +print u.__class__ # __builtins__.unicode.utf8string
8.16 print u.encode("ISO-8859-1") # æøå
8.17 print u.encoding # ISO-8859-1
8.18 +print len(u) # 3
8.19
8.20 # Explicitly from Unicode literals.
8.21
8.22 u2 = u"æøå"
8.23 print u2 # æøå
8.24 +print u2.__class__ # __builtins__.unicode.utf8string
8.25 print u2.encode("ISO-8859-1") # æøå
8.26 print u2.encoding # ISO-8859-1
8.27 +print len(u2) # 3
8.28
8.29 # Implicitly from string literals.
8.30
8.31 #u3 = "æøå"
8.32 #print u3 # æøå
8.33 +#print u3.__class__ # __builtins__.unicode.utf8string
8.34 #print u3.encode("ISO-8859-1") # æøå
8.35 #print u3.encoding # ISO-8859-1
8.36 +#print len(u3) # 3
8.37
8.38 # Combine bytes and text.
8.39 # The text should be decoded.
8.40 @@ -36,6 +43,7 @@
8.41 su = s + u
8.42 print su # ÆØÅæøå
8.43 print su.__class__ # __builtins__.str.string
8.44 +print len(su) # 6
8.45
8.46 # Combine text and bytes.
8.47 # The text should be decoded.
8.48 @@ -43,6 +51,7 @@
8.49 us = u + s
8.50 print us # æøåÆØÅ
8.51 print us.__class__ # __builtins__.str.string
8.52 +print len(us) # 6
8.53
8.54 # Combine text and text.
8.55
8.56 @@ -50,6 +59,7 @@
8.57 print uu2 # æøå
8.58 print uu2.__class__ # __builtins__.unicode.utf8string
8.59 print uu2.encoding # ISO-8859-1
8.60 +print len(uu2) # 6
8.61
8.62 # Inspect and update the encoding of stdout.
8.63 # Note that su and us are byte strings and are not recoded.