1 /* Native functions for Unicode operations. 2 3 Copyright (C) 2016 Paul Boddie <paul@boddie.org.uk> 4 5 This program is free software; you can redistribute it and/or modify it under 6 the terms of the GNU General Public License as published by the Free Software 7 Foundation; either version 3 of the License, or (at your option) any later 8 version. 9 10 This program is distributed in the hope that it will be useful, but WITHOUT 11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 13 details. 14 15 You should have received a copy of the GNU General Public License along with 16 this program. If not, see <http://www.gnu.org/licenses/>. 17 */ 18 19 #include "native/common.h" 20 #include "types.h" 21 #include "exceptions.h" 22 #include "ops.h" 23 #include "progconsts.h" 24 #include "progops.h" 25 #include "progtypes.h" 26 #include "main.h" 27 28 static inline int boundary(char c) 29 { 30 return ((c & 0xc0) == 0xc0) || !(c & 0x80); 31 } 32 33 static inline int boundary_value(char c) 34 { 35 if (!(c & 0x80)) return c; 36 else if ((c & 0xf8) == 0xf0) return c & 0x07; 37 else if ((c & 0xf0) == 0xe0) return c & 0x0f; 38 else if ((c & 0xe0) == 0xc0) return c & 0x1f; 39 else return 0; 40 } 41 42 static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart) 43 { 44 unsigned int i = bytestart; 45 46 while (i < size) 47 { 48 i++; 49 if (boundary(s[i])) 50 break; 51 } 52 53 return i; 54 } 55 56 static unsigned int prevpos(char *s, unsigned int bytestart) 57 { 58 unsigned int i = bytestart; 59 60 while (i > 0) 61 { 62 i--; 63 if (boundary(s[i])) 64 break; 65 } 66 67 return i; 68 } 69 70 /* Unicode operations. */ 71 72 __attr __fn_native_unicode_unicode_len(__attr __args[]) 73 { 74 __attr * const _data = &__args[1]; 75 /* _data interpreted as string */ 76 char *s = _data->strvalue; 77 unsigned int i, c = 0; 78 79 for (i = 0; s[i] != 0; i++) 80 if (boundary(s[i])) 81 c++; 82 83 /* Return the new integer. */ 84 return __new_int(c); 85 } 86 87 __attr __fn_native_unicode_unicode_ord(__attr __args[]) 88 { 89 __attr * const _data = &__args[1]; 90 /* _data interpreted as string */ 91 char *s = _data->strvalue; 92 unsigned int i, c = 0, v; 93 94 for (i = 0; s[i] != 0; i++) 95 { 96 /* Evaluate the current character as a boundary. */ 97 98 v = boundary_value(s[i]); 99 100 /* Boundary with characters read: stop reading. */ 101 102 if (v && i) 103 break; 104 105 /* Boundary: initialise with the extracted value. */ 106 107 else if (v) 108 c = v; 109 110 /* Not a boundary: shift and combine with the continuation value. */ 111 112 else 113 c = (c << 6) | (s[i] & 0x3f); 114 } 115 116 /* Return the new integer. */ 117 return __new_int(c); 118 } 119 120 __attr __fn_native_unicode_unicode_substr(__attr __args[]) 121 { 122 __attr * const _data = &__args[1]; 123 __attr * const start = &__args[2]; 124 __attr * const end = &__args[3]; 125 __attr * const step = &__args[4]; 126 /* _data interpreted as string */ 127 char *s = _data->strvalue, *sub; 128 /* start.__data__ interpreted as int */ 129 int istart = __load_via_object(start->value, __pos___data__).intvalue; 130 /* end.__data__ interpreted as int */ 131 int iend = __load_via_object(end->value, __pos___data__).intvalue; 132 /* step.__data__ interpreted as int */ 133 int istep = __load_via_object(step->value, __pos___data__).intvalue; 134 135 /* Calculate the number of characters. */ 136 size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1; 137 unsigned int indexes[nchar]; 138 139 unsigned int c, d, i, to, from, lastbyte = 0; 140 size_t resultsize = 0, ss = strlen(_data->strvalue); 141 142 /* Find the indexes of the characters. */ 143 if (istep > 0) 144 { 145 /* Get the first byte position. */ 146 for (c = 0; c < istart; c++) 147 lastbyte = nextpos(s, ss, lastbyte); 148 149 /* Get each subsequent byte position. */ 150 for (c = istart, i = 0; i < nchar; c += istep, i++) 151 { 152 indexes[i] = lastbyte; 153 154 /* Add the character size to the result size. */ 155 resultsize += nextpos(s, ss, lastbyte) - lastbyte; 156 157 for (d = c; d < c + istep; d++) 158 lastbyte = nextpos(s, ss, lastbyte); 159 } 160 } 161 else 162 { 163 /* Get the first byte position. */ 164 for (c = 0; c < istart; c++) 165 lastbyte = nextpos(s, ss, lastbyte); 166 167 /* Get each subsequent byte position. */ 168 for (c = istart, i = 0; i < nchar; c += istep, i++) 169 { 170 indexes[i] = lastbyte; 171 172 /* Add the character size to the result size. */ 173 resultsize += nextpos(s, ss, lastbyte) - lastbyte; 174 175 for (d = c; d > c + istep; d--) 176 lastbyte = prevpos(s, lastbyte); 177 } 178 } 179 180 /* Reserve space for a new string. */ 181 sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char)); 182 183 /* Does not null terminate but final byte should be zero. */ 184 for (i = 0, to = 0; i < nchar; i++) 185 { 186 from = indexes[i]; 187 do 188 { 189 sub[to++] = s[from++]; 190 } while (!boundary(s[from])); 191 } 192 193 return __new_str(sub); 194 } 195 196 /* Module initialisation. */ 197 198 void __main_native_unicode() 199 { 200 }