Lichen

templates/native/unicode.c

934:2989aab1b4f7
10 months ago Paul Boddie Renamed the utf8string class to unicode, eliminating the unicode function. This means that the simple case of merely returning an object if it is already a Unicode object no longer occurs when using the unicode callable, but such behaviour might be better supported with more general customised instantiation functionality.
     1 /* Native functions for Unicode operations.     2      3 Copyright (C) 2016, 2017 Paul Boddie <paul@boddie.org.uk>     4      5 This program is free software; you can redistribute it and/or modify it under     6 the terms of the GNU General Public License as published by the Free Software     7 Foundation; either version 3 of the License, or (at your option) any later     8 version.     9     10 This program is distributed in the hope that it will be useful, but WITHOUT    11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    12 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    13 details.    14     15 You should have received a copy of the GNU General Public License along with    16 this program.  If not, see <http://www.gnu.org/licenses/>.    17 */    18     19 #include "native/common.h"    20 #include "types.h"    21 #include "exceptions.h"    22 #include "ops.h"    23 #include "progconsts.h"    24 #include "progops.h"    25 #include "progtypes.h"    26 #include "main.h"    27     28 static inline int boundary(char c)    29 {    30     return ((c & 0xc0) == 0xc0) || !(c & 0x80);    31 }    32     33 static inline int boundary_value(char c)    34 {    35     if (!(c & 0x80)) return c;    36     else if ((c & 0xf8) == 0xf0) return c & 0x07;    37     else if ((c & 0xf0) == 0xe0) return c & 0x0f;    38     else if ((c & 0xe0) == 0xc0) return c & 0x1f;    39     else return 0;    40 }    41     42 static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart)    43 {    44     unsigned int i = bytestart;    45     46     while (i < size)    47     {    48         i++;    49         if (boundary(s[i]))    50             break;    51     }    52     53     return i;    54 }    55     56 static unsigned int prevpos(char *s, unsigned int bytestart)    57 {    58     unsigned int i = bytestart;    59     60     while (i > 0)    61     {    62         i--;    63         if (boundary(s[i]))    64             break;    65     }    66     67     return i;    68 }    69     70 /* Unicode operations. */    71     72 __attr __fn_native_unicode_unicode_len(__attr __self, __attr _data, __attr _size)    73 {    74     /* _data interpreted as string.__data__ */    75     char *s = _data.strvalue;    76     /* _size interpreted as int */    77     int size = __TOINT(_size);    78     unsigned int i, c = 0;    79     80     for (i = 0; i < size; i++)    81         if (boundary(s[i]))    82             c++;    83     84     /* Return the new integer. */    85     return __new_int(c);    86 }    87     88 __attr __fn_native_unicode_unicode_ord(__attr __self, __attr _data, __attr _size)    89 {    90     /* _data interpreted as string.__data__ */    91     char *s = _data.strvalue;    92     /* _size interpreted as int */    93     int size = __TOINT(_size);    94     unsigned int i, c = 0, v;    95     96     for (i = 0; i < size; i++)    97     {    98         /* Evaluate the current character as a boundary. */    99    100         v = boundary_value(s[i]);   101    102         /* Boundary with characters read: stop reading. */   103    104         if (v && i)   105             break;   106    107         /* Boundary: initialise with the extracted value. */   108    109         else if (v)   110             c = v;   111    112         /* Not a boundary: shift and combine with the continuation value. */   113    114         else   115             c = (c << 6) | (s[i] & 0x3f);   116     }   117    118     /* Return the new integer. */   119     return __new_int(c);   120 }   121    122 __attr __fn_native_unicode_unicode_substr(__attr __self, __attr _data, __attr _size, __attr start, __attr end, __attr step)   123 {   124     /* _data interpreted as string.__data__ */   125     char *s = _data.strvalue, *sub;   126     /* _size interpreted as int */   127     int ss = __TOINT(_size);   128     /* start interpreted as int */   129     int istart = __TOINT(start);   130     /* end interpreted as int */   131     int iend = __TOINT(end);   132     /* step interpreted as int */   133     int istep = __TOINT(step);   134    135     /* Calculate the number of characters. */   136     size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1;   137     unsigned int indexes[nchar];   138    139     unsigned int c, d, i, to, from, lastbyte = 0;   140     int resultsize = 0;   141    142     /* Find the indexes of the characters. */   143     if (istep > 0)   144     {   145         /* Get the first byte position. */   146         for (c = 0; c < istart; c++)   147             lastbyte = nextpos(s, ss, lastbyte);   148    149         /* Get each subsequent byte position. */   150         for (c = istart, i = 0; i < nchar; c += istep, i++)   151         {   152             indexes[i] = lastbyte;   153    154             /* Add the character size to the result size. */   155             resultsize += nextpos(s, ss, lastbyte) - lastbyte;   156    157             for (d = c; d < c + istep; d++)   158                 lastbyte = nextpos(s, ss, lastbyte);   159         }   160     }   161     else   162     {   163         /* Get the first byte position. */   164         for (c = 0; c < istart; c++)   165             lastbyte = nextpos(s, ss, lastbyte);   166    167         /* Get each subsequent byte position. */   168         for (c = istart, i = 0; i < nchar; c += istep, i++)   169         {   170             indexes[i] = lastbyte;   171    172             /* Add the character size to the result size. */   173             resultsize += nextpos(s, ss, lastbyte) - lastbyte;   174    175             for (d = c; d > c + istep; d--)   176                 lastbyte = prevpos(s, lastbyte);   177         }   178     }   179    180     /* Reserve space for a new string. */   181     sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char));   182    183     /* Does not null terminate but final byte should be zero. */   184     for (i = 0, to = 0; i < nchar; i++)   185     {   186         from = indexes[i];   187         do   188         {   189             sub[to++] = s[from++];   190         } while (!boundary(s[from]));   191     }   192    193     return __new_str(sub, resultsize);   194 }   195    196 __attr __fn_native_unicode_unicode_unichr(__attr __self, __attr value)   197 {   198     /* value interpreted as int */   199     int i = __TOINT(value);   200     unsigned int resultsize;   201     char *s;   202    203     if (i < 128) resultsize = 1;   204     else if (i < 2048) resultsize = 2;   205     else if (i < 65536) resultsize = 3;   206     else resultsize = 4;   207    208     /* Reserve space for a new string. */   209    210     s = (char *) __ALLOCATE(resultsize + 1, sizeof(char));   211    212     /* Populate the string. */   213    214     if (i < 128) s[0] = (char) i;   215     else if (i < 2048)   216     {   217         s[0] = 0b11000000 | (i >> 6);   218         s[1] = 0b10000000 | (i & 0b00111111);   219     }   220     else if (i < 65536)   221     {   222         s[0] = 0b11100000 | (i >> 12);   223         s[1] = 0b10000000 | ((i >> 6) & 0b00111111);   224         s[2] = 0b10000000 | (i & 0b00111111);   225     }   226     else   227     {   228         s[0] = 0b11110000 | (i >> 18);   229         s[1] = 0b10000000 | ((i >> 12) & 0b00111111);   230         s[2] = 0b10000000 | ((i >> 6) & 0b00111111);   231         s[3] = 0b10000000 | (i & 0b00111111);   232     }   233    234     return __new_str(s, resultsize);   235 }   236    237 /* Module initialisation. */   238    239 void __main_native_unicode()   240 {   241 }