Lichen

templates/native/unicode.c

627:05ad7964265c
2017-02-27 Paul Boddie Merged convenience macro changes.
     1 /* Native functions for Unicode operations.     2      3 Copyright (C) 2016, 2017 Paul Boddie <paul@boddie.org.uk>     4      5 This program is free software; you can redistribute it and/or modify it under     6 the terms of the GNU General Public License as published by the Free Software     7 Foundation; either version 3 of the License, or (at your option) any later     8 version.     9     10 This program is distributed in the hope that it will be useful, but WITHOUT    11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    12 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    13 details.    14     15 You should have received a copy of the GNU General Public License along with    16 this program.  If not, see <http://www.gnu.org/licenses/>.    17 */    18     19 #include "native/common.h"    20 #include "types.h"    21 #include "exceptions.h"    22 #include "ops.h"    23 #include "progconsts.h"    24 #include "progops.h"    25 #include "progtypes.h"    26 #include "main.h"    27     28 static inline int boundary(char c)    29 {    30     return ((c & 0xc0) == 0xc0) || !(c & 0x80);    31 }    32     33 static inline int boundary_value(char c)    34 {    35     if (!(c & 0x80)) return c;    36     else if ((c & 0xf8) == 0xf0) return c & 0x07;    37     else if ((c & 0xf0) == 0xe0) return c & 0x0f;    38     else if ((c & 0xe0) == 0xc0) return c & 0x1f;    39     else return 0;    40 }    41     42 static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart)    43 {    44     unsigned int i = bytestart;    45     46     while (i < size)    47     {    48         i++;    49         if (boundary(s[i]))    50             break;    51     }    52     53     return i;    54 }    55     56 static unsigned int prevpos(char *s, unsigned int bytestart)    57 {    58     unsigned int i = bytestart;    59     60     while (i > 0)    61     {    62         i--;    63         if (boundary(s[i]))    64             break;    65     }    66     67     return i;    68 }    69     70 /* Unicode operations. */    71     72 __attr __fn_native_unicode_unicode_len(__attr __args[])    73 {    74     __attr * const _data = &__args[1];    75     __attr * const _size = &__args[2];    76     /* _data interpreted as string */    77     char *s = _data->strvalue;    78     /* _size interpreted as int */    79     int size = _size->intvalue;    80     unsigned int i, c = 0;    81     82     for (i = 0; i < size; i++)    83         if (boundary(s[i]))    84             c++;    85     86     /* Return the new integer. */    87     return __new_int(c);    88 }    89     90 __attr __fn_native_unicode_unicode_ord(__attr __args[])    91 {    92     __attr * const _data = &__args[1];    93     __attr * const _size = &__args[2];    94     /* _data interpreted as string */    95     char *s = _data->strvalue;    96     /* _size interpreted as int */    97     int size = _size->intvalue;    98     unsigned int i, c = 0, v;    99    100     for (i = 0; i < size; i++)   101     {   102         /* Evaluate the current character as a boundary. */   103    104         v = boundary_value(s[i]);   105    106         /* Boundary with characters read: stop reading. */   107    108         if (v && i)   109             break;   110    111         /* Boundary: initialise with the extracted value. */   112    113         else if (v)   114             c = v;   115    116         /* Not a boundary: shift and combine with the continuation value. */   117    118         else   119             c = (c << 6) | (s[i] & 0x3f);   120     }   121    122     /* Return the new integer. */   123     return __new_int(c);   124 }   125    126 __attr __fn_native_unicode_unicode_substr(__attr __args[])   127 {   128     __attr * const _data = &__args[1];   129     __attr * const _size = &__args[2];   130     __attr * const start = &__args[3];   131     __attr * const end = &__args[4];   132     __attr * const step = &__args[5];   133     /* _data interpreted as string */   134     char *s = _data->strvalue, *sub;   135     /* _size interpreted as int */   136     int ss = _size->intvalue;   137     /* start.__data__ interpreted as int */   138     int istart = __load_via_object(start->value, __data__).intvalue;   139     /* end.__data__ interpreted as int */   140     int iend = __load_via_object(end->value, __data__).intvalue;   141     /* step.__data__ interpreted as int */   142     int istep = __load_via_object(step->value, __data__).intvalue;   143    144     /* Calculate the number of characters. */   145     size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1;   146     unsigned int indexes[nchar];   147    148     unsigned int c, d, i, to, from, lastbyte = 0;   149     int resultsize = 0;   150    151     /* Find the indexes of the characters. */   152     if (istep > 0)   153     {   154         /* Get the first byte position. */   155         for (c = 0; c < istart; c++)   156             lastbyte = nextpos(s, ss, lastbyte);   157    158         /* Get each subsequent byte position. */   159         for (c = istart, i = 0; i < nchar; c += istep, i++)   160         {   161             indexes[i] = lastbyte;   162    163             /* Add the character size to the result size. */   164             resultsize += nextpos(s, ss, lastbyte) - lastbyte;   165    166             for (d = c; d < c + istep; d++)   167                 lastbyte = nextpos(s, ss, lastbyte);   168         }   169     }   170     else   171     {   172         /* Get the first byte position. */   173         for (c = 0; c < istart; c++)   174             lastbyte = nextpos(s, ss, lastbyte);   175    176         /* Get each subsequent byte position. */   177         for (c = istart, i = 0; i < nchar; c += istep, i++)   178         {   179             indexes[i] = lastbyte;   180    181             /* Add the character size to the result size. */   182             resultsize += nextpos(s, ss, lastbyte) - lastbyte;   183    184             for (d = c; d > c + istep; d--)   185                 lastbyte = prevpos(s, lastbyte);   186         }   187     }   188    189     /* Reserve space for a new string. */   190     sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char));   191    192     /* Does not null terminate but final byte should be zero. */   193     for (i = 0, to = 0; i < nchar; i++)   194     {   195         from = indexes[i];   196         do   197         {   198             sub[to++] = s[from++];   199         } while (!boundary(s[from]));   200     }   201    202     return __new_str(sub, resultsize);   203 }   204    205 __attr __fn_native_unicode_unicode_unichr(__attr __args[])   206 {   207     __attr * const value = &__args[1];   208     /* value interpreted as int */   209     int i = value->intvalue;   210     unsigned int resultsize;   211     char *s;   212    213     if (i < 128) resultsize = 1;   214     else if (i < 2048) resultsize = 2;   215     else if (i < 65536) resultsize = 3;   216     else resultsize = 4;   217    218     /* Reserve space for a new string. */   219    220     s = (char *) __ALLOCATE(resultsize + 1, sizeof(char));   221    222     /* Populate the string. */   223    224     if (i < 128) s[0] = (char) i;   225     else if (i < 2048)   226     {   227         s[0] = 0b11000000 | (i >> 6);   228         s[1] = 0b10000000 | (i & 0b00111111);   229     }   230     else if (i < 65536)   231     {   232         s[0] = 0b11100000 | (i >> 12);   233         s[1] = 0b10000000 | ((i >> 6) & 0b00111111);   234         s[2] = 0b10000000 | (i & 0b00111111);   235     }   236     else   237     {   238         s[0] = 0b11110000 | (i >> 18);   239         s[1] = 0b10000000 | ((i >> 12) & 0b00111111);   240         s[2] = 0b10000000 | ((i >> 6) & 0b00111111);   241         s[3] = 0b10000000 | (i & 0b00111111);   242     }   243    244     return __new_str(s, resultsize);   245 }   246    247 /* Module initialisation. */   248    249 void __main_native_unicode()   250 {   251 }