Lichen

templates/native/unicode.c

940:6ddce984649b
2021-10-30 Paul Boddie Fixed expected result in comment.
     1 /* Native functions for Unicode operations.     2      3 Copyright (C) 2016, 2017 Paul Boddie <paul@boddie.org.uk>     4      5 This program is free software; you can redistribute it and/or modify it under     6 the terms of the GNU General Public License as published by the Free Software     7 Foundation; either version 3 of the License, or (at your option) any later     8 version.     9     10 This program is distributed in the hope that it will be useful, but WITHOUT    11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    12 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    13 details.    14     15 You should have received a copy of the GNU General Public License along with    16 this program.  If not, see <http://www.gnu.org/licenses/>.    17 */    18     19 #include "native/common.h"    20 #include "types.h"    21 #include "exceptions.h"    22 #include "ops.h"    23 #include "progconsts.h"    24 #include "progops.h"    25 #include "progtypes.h"    26 #include "main.h"    27     28 static inline int boundary(char c)    29 {    30     return ((c & 0xc0) == 0xc0) || !(c & 0x80);    31 }    32     33 static inline int boundary_value(char c)    34 {    35     if (!(c & 0x80)) return c;    36     else if ((c & 0xf8) == 0xf0) return c & 0x07;    37     else if ((c & 0xf0) == 0xe0) return c & 0x0f;    38     else if ((c & 0xe0) == 0xc0) return c & 0x1f;    39     else return 0;    40 }    41     42 static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart)    43 {    44     unsigned int i = bytestart;    45     46     while (i < size)    47     {    48         i++;    49         if (boundary(s[i]))    50             break;    51     }    52     53     return i;    54 }    55     56 static unsigned int prevpos(char *s, unsigned int bytestart)    57 {    58     unsigned int i = bytestart;    59     60     while (i > 0)    61     {    62         i--;    63         if (boundary(s[i]))    64             break;    65     }    66     67     return i;    68 }    69     70 /* Unicode operations. */    71     72 __attr __fn_native_unicode_unicode_len(__attr __self, __attr _data, __attr _size)    73 {    74     /* _data interpreted as string.__data__ */    75     char *s = _data.strvalue;    76     /* _size interpreted as int */    77     int size = __TOINT(_size);    78     unsigned int i, c = 0;    79     80     for (i = 0; i < size; i++)    81         if (boundary(s[i]))    82             c++;    83     84     /* Return the new integer. */    85     return __new_int(c);    86 }    87     88 __attr __fn_native_unicode_unicode_ord(__attr __self, __attr _data, __attr _size)    89 {    90     /* _data interpreted as string.__data__ */    91     char *s = _data.strvalue;    92     /* _size interpreted as int */    93     int size = __TOINT(_size);    94     unsigned int i, c = 0, v;    95     96     for (i = 0; i < size; i++)    97     {    98         /* Evaluate the current character as a boundary. */    99    100         v = boundary_value(s[i]);   101    102         /* Boundary with characters read: stop reading. */   103    104         if (v && i)   105             break;   106    107         /* Boundary: initialise with the extracted value. */   108    109         else if (v)   110             c = v;   111    112         /* Not a boundary: shift and combine with the continuation value. */   113    114         else   115             c = (c << 6) | (s[i] & 0x3f);   116     }   117    118     /* Return the new integer. */   119     return __new_int(c);   120 }   121    122 __attr __fn_native_unicode_unicode_substr(__attr __self, __attr _data, __attr _size, __attr start, __attr end, __attr step)   123 {   124     /* _data interpreted as string.__data__ */   125     char *s = _data.strvalue, *sub;   126     /* _size interpreted as int */   127     int ss = __TOINT(_size);   128     /* start interpreted as int */   129     int istart = __TOINT(start);   130     /* end interpreted as int */   131     int iend = __TOINT(end);   132     /* step interpreted as int */   133     int istep = __TOINT(step);   134    135     /* Calculate the number of characters. */   136     size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1;   137     unsigned int indexes[nchar];   138    139     unsigned int c, d, i, to, from, lastbyte = 0;   140     int resultsize = 0;   141    142     /* Find the indexes of the characters. */   143     if (istep > 0)   144     {   145         /* Get the first byte position. */   146         for (c = 0; c < istart; c++)   147             lastbyte = nextpos(s, ss, lastbyte);   148    149         /* Get each subsequent byte position. */   150         for (c = istart, i = 0; i < nchar; c += istep, i++)   151         {   152             indexes[i] = lastbyte;   153    154             /* Add the character size to the result size. */   155             resultsize += nextpos(s, ss, lastbyte) - lastbyte;   156    157             for (d = c; d < c + istep; d++)   158                 lastbyte = nextpos(s, ss, lastbyte);   159         }   160     }   161     else   162     {   163         /* Get the first byte position. */   164         for (c = 0; c < istart; c++)   165             lastbyte = nextpos(s, ss, lastbyte);   166    167         /* Get each subsequent byte position. */   168         for (c = istart, i = 0; i < nchar; c += istep, i++)   169         {   170             indexes[i] = lastbyte;   171    172             /* Add the character size to the result size. */   173             resultsize += nextpos(s, ss, lastbyte) - lastbyte;   174    175             for (d = c; d > c + istep; d--)   176                 lastbyte = prevpos(s, lastbyte);   177         }   178     }   179    180     /* Reserve space for a new string. */   181     sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char));   182    183     /* Does not null terminate but final byte should be zero. */   184     for (i = 0, to = 0; i < nchar; i++)   185     {   186         from = indexes[i];   187         do   188         {   189             sub[to++] = s[from++];   190         } while (!boundary(s[from]));   191     }   192    193     return __new_str(sub, resultsize);   194 }   195    196 __attr __fn_native_unicode_unicode_unichr(__attr __self, __attr value)   197 {   198     /* value interpreted as int */   199     int i = __TOINT(value);   200     unsigned int resultsize;   201     char *s;   202    203     if (i < 128) resultsize = 1;   204     else if (i < 2048) resultsize = 2;   205     else if (i < 65536) resultsize = 3;   206     else resultsize = 4;   207    208     /* Reserve space for a new string. */   209    210     s = (char *) __ALLOCATE(resultsize + 1, sizeof(char));   211    212     /* Populate the string. */   213    214     if (i < 128) s[0] = (char) i;   215     else if (i < 2048)   216     {   217         s[0] = 0b11000000 | (i >> 6);   218         s[1] = 0b10000000 | (i & 0b00111111);   219     }   220     else if (i < 65536)   221     {   222         s[0] = 0b11100000 | (i >> 12);   223         s[1] = 0b10000000 | ((i >> 6) & 0b00111111);   224         s[2] = 0b10000000 | (i & 0b00111111);   225     }   226     else   227     {   228         s[0] = 0b11110000 | (i >> 18);   229         s[1] = 0b10000000 | ((i >> 12) & 0b00111111);   230         s[2] = 0b10000000 | ((i >> 6) & 0b00111111);   231         s[3] = 0b10000000 | (i & 0b00111111);   232     }   233    234     return __new_str(s, resultsize);   235 }   236    237 /* Module initialisation. */   238    239 void __main_native_unicode()   240 {   241 }