# HG changeset patch # User Paul Boddie # Date 1481670188 -3600 # Node ID 97f87e030e28d04a0f4c7493ede55fb73a3e1c16 # Parent 10fffa3651d9c4d7eb842478c5d3477ce8423f5c Added a UTF-8 character counting native function to support the __len__ method on Unicode objects, introducing a bytelength method on strings so that byte-level operations, such as conversion between encodings, can still work with Unicode objects (since __len__ returning characters would be inappropriate for such purposes). diff -r 10fffa3651d9 -r 97f87e030e28 lib/__builtins__/str.py --- a/lib/__builtins__/str.py Wed Dec 14 00:00:21 2016 +0100 +++ b/lib/__builtins__/str.py Wed Dec 14 00:03:08 2016 +0100 @@ -155,12 +155,14 @@ return _negate(self.__eq__(other)) - def __len__(self): + def bytelength(self): - "Return the length of this string." + "Return the number of bytes in this string." return str_len(self.__data__) + __len__ = bytelength + def __str__(self): "Return a string representation." diff -r 10fffa3651d9 -r 97f87e030e28 lib/__builtins__/unicode.py --- a/lib/__builtins__/unicode.py Wed Dec 14 00:00:21 2016 +0100 +++ b/lib/__builtins__/unicode.py Wed Dec 14 00:03:08 2016 +0100 @@ -21,7 +21,7 @@ from __builtins__.str import basestring from posix.iconv import Converter -from native import str_add, isinstance as _isinstance +from native import str_add, unicode_len, isinstance as _isinstance class utf8string(basestring): @@ -36,6 +36,7 @@ get_using(basestring.__init__, self)(other) self.encoding = encoding + self.length = None def _binary_op(self, op, other): @@ -100,6 +101,15 @@ return self._convert(self._binary_op_rev(str_add, other), other) + def __len__(self): + + "Return the length of this string in characters." 
+ + if self.length is None: + self.length = unicode_len(self.__data__) + + return self.length + def encode(self, encoding=None): """ diff -r 10fffa3651d9 -r 97f87e030e28 lib/native/__init__.py --- a/lib/native/__init__.py Wed Dec 14 00:00:21 2016 +0100 +++ b/lib/native/__init__.py Wed Dec 14 00:03:08 2016 +0100 @@ -47,4 +47,6 @@ from native.system import exit, get_argv, get_path +from native.unicode import unicode_len + # vim: tabstop=4 expandtab shiftwidth=4 diff -r 10fffa3651d9 -r 97f87e030e28 lib/native/unicode.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/native/unicode.py Wed Dec 14 00:03:08 2016 +0100 @@ -0,0 +1,31 @@ +#!/usr/bin/env python + +""" +Native library functions for Unicode objects. + +None of these are actually defined here. Instead, native implementations are +substituted when each program is built. It is, however, important to declare +non-core exceptions used by the native functions because they need to be +identified as being needed by the program. + +Copyright (C) 2016 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see <http://www.gnu.org/licenses/>. +""" + +# Unicode string operations. 
+ +def unicode_len(data): pass + +# vim: tabstop=4 expandtab shiftwidth=4 diff -r 10fffa3651d9 -r 97f87e030e28 lib/posix/iconv.py --- a/lib/posix/iconv.py Wed Dec 14 00:00:21 2016 +0100 +++ b/lib/posix/iconv.py Wed Dec 14 00:03:08 2016 +0100 @@ -73,9 +73,9 @@ _s, start, remaining = self.state if _s: - self.state = [_s + s, start, remaining + len(s)] + self.state = [_s + s, start, remaining + s.bytelength()] else: - self.state = [s, 0, len(s)] + self.state = [s, 0, s.bytelength()] while True: diff -r 10fffa3651d9 -r 97f87e030e28 templates/native/unicode.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/templates/native/unicode.c Wed Dec 14 00:03:08 2016 +0100 @@ -0,0 +1,50 @@ +/* Native functions for Unicode operations. + +Copyright (C) 2016 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <string.h> /* strcmp, memcpy */ +#include "native/common.h" +#include "types.h" +#include "exceptions.h" +#include "ops.h" +#include "progconsts.h" +#include "progops.h" +#include "progtypes.h" +#include "main.h" + +/* Unicode operations. */ + +__attr __fn_native_unicode_unicode_len(__attr __args[]) +{ + __attr * const _data = &__args[1]; + /* _data interpreted as string */ + char *s = _data->strvalue; + int i, c = 0; + + for (i = 0; i < _data->size; i++) + if (((s[i] & 0xc0) == 0xc0) || !(s[i] & 0x80)) + c++; + + /* Return the new integer. */ + return __new_int(c); +} + +/* Module initialisation. 
*/ + +void __main_native_unicode() +{ +} diff -r 10fffa3651d9 -r 97f87e030e28 templates/native/unicode.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/templates/native/unicode.h Wed Dec 14 00:03:08 2016 +0100 @@ -0,0 +1,30 @@ +/* Native functions for Unicode operations. + +Copyright (C) 2016 Paul Boddie + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef __NATIVE_UNICODE_H__ +#define __NATIVE_UNICODE_H__ + +/* Unicode operations. */ + +__attr __fn_native_unicode_unicode_len(__attr __args[]); + +/* Module initialisation. */ + +void __main_native_unicode(); + +#endif /* __NATIVE_UNICODE_H__ */ diff -r 10fffa3651d9 -r 97f87e030e28 tests/unicode.py --- a/tests/unicode.py Wed Dec 14 00:00:21 2016 +0100 +++ b/tests/unicode.py Wed Dec 14 00:03:08 2016 +0100 @@ -6,6 +6,7 @@ s = b"ÆØÅ" print s # ÆØÅ +print len(s) # 3 # Obtain text and print it. @@ -13,22 +14,28 @@ u = unicode("æøå", "ISO-8859-1") print u # æøå +print u.__class__ # __builtins__.unicode.utf8string print u.encode("ISO-8859-1") # æøå print u.encoding # ISO-8859-1 +print len(u) # 3 # Explicitly from Unicode literals. u2 = u"æøå" print u2 # æøå +print u2.__class__ # __builtins__.unicode.utf8string print u2.encode("ISO-8859-1") # æøå print u2.encoding # ISO-8859-1 +print len(u2) # 3 # Implicitly from string literals. 
#u3 = "æøå" #print u3 # æøå +#print u3.__class__ # __builtins__.unicode.utf8string #print u3.encode("ISO-8859-1") # æøå #print u3.encoding # ISO-8859-1 +#print len(u3) # 3 # Combine bytes and text. # The text should be decoded. @@ -36,6 +43,7 @@ su = s + u print su # ÆØÅæøå print su.__class__ # __builtins__.str.string +print len(su) # 6 # Combine text and bytes. # The text should be decoded. @@ -43,6 +51,7 @@ us = u + s print us # æøåÆØÅ print us.__class__ # __builtins__.str.string +print len(us) # 6 # Combine text and text. @@ -50,6 +59,7 @@ print uu2 # æøå print uu2.__class__ # __builtins__.unicode.utf8string print uu2.encoding # ISO-8859-1 +print len(uu2) # 6 # Inspect and update the encoding of stdout. # Note that su and us are byte strings and are not recoded.