Lichen

Annotated templates/native/unicode.c

849:81587921b9b4
2018-07-11 Paul Boddie Use the __store_via_object operations to set attributes.
paul@403 1
/* Native functions for Unicode operations.
paul@403 2
paul@607 3
Copyright (C) 2016, 2017 Paul Boddie <paul@boddie.org.uk>
paul@403 4
paul@403 5
This program is free software; you can redistribute it and/or modify it under
paul@403 6
the terms of the GNU General Public License as published by the Free Software
paul@403 7
Foundation; either version 3 of the License, or (at your option) any later
paul@403 8
version.
paul@403 9
paul@403 10
This program is distributed in the hope that it will be useful, but WITHOUT
paul@403 11
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@403 12
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
paul@403 13
details.
paul@403 14
paul@403 15
You should have received a copy of the GNU General Public License along with
paul@403 16
this program.  If not, see <http://www.gnu.org/licenses/>.
paul@403 17
*/
paul@403 18
paul@403 19
#include "native/common.h"
paul@403 20
#include "types.h"
paul@403 21
#include "exceptions.h"
paul@403 22
#include "ops.h"
paul@403 23
#include "progconsts.h"
paul@403 24
#include "progops.h"
paul@403 25
#include "progtypes.h"
paul@403 26
#include "main.h"
paul@403 27
paul@431 28
/* Return 1 if c can start a character in UTF-8 text, being either a
   single-byte value (top bit clear) or a leading byte (top two bits set), and
   0 if c is a continuation byte of the form 10xxxxxx. */

static inline int boundary(char c)
{
    /* Only continuation bytes have exactly the bit pattern 10 in their top
       two bits. */

    return (c & 0xc0) != 0x80;
}
paul@431 32
paul@534 33
/* Return the payload bits carried by c when it is the first byte of a UTF-8
   sequence: the byte itself for single-byte values, or the low 5, 4 or 3 bits
   of a two-, three- or four-byte leading byte respectively. Continuation
   bytes and bytes from 0xf8 upwards yield 0.

   Note that 0 is also returned for leading bytes whose payload bits are all
   zero (such as 0xe0 and 0xf0) and for the zero byte itself, so a zero result
   does not by itself identify a continuation byte. */

static inline int boundary_value(char c)
{
    const unsigned char byte = (unsigned char) c;

    /* Single-byte (ASCII) value. */

    if (byte < 0x80)
        return byte;

    /* Not a valid leading byte. */

    if (byte >= 0xf8)
        return 0;

    /* 11110xxx: four-byte sequence. */

    if (byte >= 0xf0)
        return byte & 0x07;

    /* 1110xxxx: three-byte sequence. */

    if (byte >= 0xe0)
        return byte & 0x0f;

    /* 110xxxxx: two-byte sequence. */

    if (byte >= 0xc0)
        return byte & 0x1f;

    /* 10xxxxxx: continuation byte. */

    return 0;
}
paul@534 41
paul@431 42
/* Return the byte position of the character boundary following bytestart in
   the size bytes at s, or size if no further boundary is found first. If
   bytestart is not less than size, it is returned unchanged.

   Only bytes at positions below size are examined: reaching size ends the
   search regardless of what is stored there, so the byte following the data
   is never read. */

static unsigned int nextpos(const char *s, unsigned int size, unsigned int bytestart)
{
    unsigned int i = bytestart;

    if (i < size)
    {
        /* Always advance at least one byte, then skip continuation bytes. */

        do
        {
            i++;
        } while ((i < size) && !boundary(s[i]));
    }

    return i;
}
paul@431 55
paul@431 56
/* Return the byte position of the nearest character boundary before bytestart
   in s, or 0 if the start of the data is reached without finding one. A
   bytestart of 0 is returned unchanged. */

static unsigned int prevpos(char *s, unsigned int bytestart)
{
    unsigned int pos = bytestart;

    /* Step back one byte at a time, stopping at the first boundary byte or at
       the start of the data. */

    while ((pos > 0) && !boundary(s[--pos]))
        ;

    return pos;
}
paul@431 69
paul@403 70
/* Unicode operations. */
paul@403 71
paul@664 72
/* Return a new integer giving the number of characters in the given string
   data, found by counting the bytes that start a character. */

__attr __fn_native_unicode_unicode_len(__attr __self, __attr _data, __attr _size)
{
    /* _data interpreted as string.__data__ */
    char *bytes = _data.strvalue;
    /* _size interpreted as int */
    int nbytes = __TOINT(_size);
    unsigned int pos = 0, nchars = 0;

    /* Each boundary byte contributes one character to the total. */

    while (pos < nbytes)
    {
        if (boundary(bytes[pos]))
            nchars++;
        pos++;
    }

    /* Return the new integer. */
    return __new_int(nchars);
}
paul@403 87
paul@664 88
/* Return a new integer holding the value decoded from the first character of
   the given string data: the payload of the first byte combined with the low
   six bits of each following continuation byte. Decoding stops at the next
   character boundary or at the end of the data. Empty data yields zero. */

__attr __fn_native_unicode_unicode_ord(__attr __self, __attr _data, __attr _size)
{
    /* _data interpreted as string.__data__ */
    char *s = _data.strvalue;
    /* _size interpreted as int */
    int size = __TOINT(_size);
    unsigned int c = 0;
    int i;

    for (i = 0; i < size; i++)
    {
        /* Test for a boundary using the byte's form, not its payload value:
           leading bytes such as 0xe0 and 0xf0 carry a payload of zero and
           would otherwise be mistaken for continuation bytes, corrupting the
           result for characters such as U+0800 (e0 a0 80). */

        if (boundary(s[i]))
        {
            /* Boundary with characters read: stop reading. */

            if (i)
                break;

            /* Boundary: initialise with the extracted value. */

            c = boundary_value(s[i]);
        }

        /* Not a boundary: shift and combine with the continuation value. */

        else
            c = (c << 6) | (s[i] & 0x3f);
    }

    /* Return the new integer. */
    return __new_int(c);
}
paul@534 121
paul@664 122
/* Visit nchar characters of the size bytes at s, starting with the character
   at index istart and moving istep characters (forwards if positive,
   backwards if negative) between visits. Return the total number of bytes
   occupied by the visited characters. If out is not NULL, the bytes of each
   visited character are also appended to out, which must have room for the
   returned total.

   Positions are clamped to the data by nextpos and prevpos, and only the
   bytes between a character's position and the following boundary are copied,
   so neither s nor out is accessed out of bounds. */

static unsigned int substr_chars(char *s, unsigned int size, int istart, int istep, int nchar, char *out)
{
    unsigned int pos = 0, next, b, total = 0;
    int c, d;

    /* Get the first byte position. */

    for (c = 0; c < istart; c++)
        pos = nextpos(s, size, pos);

    /* Get each subsequent byte position. */

    for (c = 0; c < nchar; c++)
    {
        /* The character occupies the bytes up to the next boundary. */

        next = nextpos(s, size, pos);

        for (b = pos; b < next; b++, total++)
        {
            if (out)
                out[total] = s[b];
        }

        /* Move to the next selected character. */

        if (istep > 0)
        {
            pos = next;

            for (d = 1; d < istep; d++)
                pos = nextpos(s, size, pos);
        }
        else
        {
            for (d = 0; d > istep; d--)
                pos = prevpos(s, pos);
        }
    }

    return total;
}

/* Return a new string containing the characters of the given string data at
   the character indexes start, start + step, ... up to but excluding end.
   A negative step selects characters in reverse order.

   A zero step, or a range that selects nothing in the direction of step,
   produces an empty string.

   The data is traversed twice, once to measure the result and once to copy
   it, so that no temporary storage proportional to the number of characters
   is needed. */

__attr __fn_native_unicode_unicode_substr(__attr __self, __attr _data, __attr _size, __attr start, __attr end, __attr step)
{
    /* _data interpreted as string.__data__ */
    char *s = _data.strvalue, *sub;
    /* _size interpreted as int */
    int ss = __TOINT(_size);
    /* start interpreted as int */
    int istart = __TOINT(start);
    /* end interpreted as int */
    int iend = __TOINT(end);
    /* step interpreted as int */
    int istep = __TOINT(step);

    int nchar = 0;
    int resultsize;

    /* Calculate the number of characters, leaving it as zero where the step
       is zero (avoiding a division by zero) or the range is empty (where the
       truncating division would otherwise give a count of one). */

    if ((istep > 0) && (iend > istart))
        nchar = ((iend - istart - 1) / istep) + 1;
    else if ((istep < 0) && (iend < istart))
        nchar = ((iend - istart + 1) / istep) + 1;

    /* Measure the selected characters. */

    resultsize = (int) substr_chars(s, ss, istart, istep, nchar, NULL);

    /* Reserve space for a new string. */

    sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char));

    /* Does not null terminate but final byte should be zero. */

    substr_chars(s, ss, istart, istep, nchar, sub);

    return __new_str(sub, resultsize);
}
paul@431 195
paul@664 196
/* Return a new string holding the UTF-8 encoding of the given integer value,
   using one byte for values below 128, two below 2048, three below 65536 and
   four otherwise. The value is not otherwise validated. */

__attr __fn_native_unicode_unicode_unichr(__attr __self, __attr value)
{
    /* value interpreted as int */
    int codepoint = __TOINT(value);
    unsigned int nbytes;
    char *encoded;

    /* Choose the length of the encoded sequence. */

    if (codepoint < 0x80)
        nbytes = 1;
    else if (codepoint < 0x800)
        nbytes = 2;
    else if (codepoint < 0x10000)
        nbytes = 3;
    else
        nbytes = 4;

    /* Reserve space for a new string. */

    encoded = (char *) __ALLOCATE(nbytes + 1, sizeof(char));

    /* Populate the string: a leading byte whose marker bits indicate the
       sequence length, followed by continuation bytes (10xxxxxx) each
       carrying six bits of the value. */

    switch (nbytes)
    {
        case 1:
            encoded[0] = (char) codepoint;
            break;

        case 2:
            encoded[0] = 0xc0 | (codepoint >> 6);
            encoded[1] = 0x80 | (codepoint & 0x3f);
            break;

        case 3:
            encoded[0] = 0xe0 | (codepoint >> 12);
            encoded[1] = 0x80 | ((codepoint >> 6) & 0x3f);
            encoded[2] = 0x80 | (codepoint & 0x3f);
            break;

        default:
            encoded[0] = 0xf0 | (codepoint >> 18);
            encoded[1] = 0x80 | ((codepoint >> 12) & 0x3f);
            encoded[2] = 0x80 | ((codepoint >> 6) & 0x3f);
            encoded[3] = 0x80 | (codepoint & 0x3f);
            break;
    }

    return __new_str(encoded, nbytes);
}
paul@607 236
paul@403 237
/* Module initialisation. */
paul@403 238
paul@403 239
void __main_native_unicode()
{
    /* This module performs no work when initialised. */

    return;
}