1 /* Native functions for Unicode operations. 2 3 Copyright (C) 2016, 2017 Paul Boddie <paul@boddie.org.uk> 4 5 This program is free software; you can redistribute it and/or modify it under 6 the terms of the GNU General Public License as published by the Free Software 7 Foundation; either version 3 of the License, or (at your option) any later 8 version. 9 10 This program is distributed in the hope that it will be useful, but WITHOUT 11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 13 details. 14 15 You should have received a copy of the GNU General Public License along with 16 this program. If not, see <http://www.gnu.org/licenses/>. 17 */ 18 19 #include "native/common.h" 20 #include "types.h" 21 #include "exceptions.h" 22 #include "ops.h" 23 #include "progconsts.h" 24 #include "progops.h" 25 #include "progtypes.h" 26 #include "main.h" 27 28 static inline int boundary(char c) 29 { 30 return ((c & 0xc0) == 0xc0) || !(c & 0x80); 31 } 32 33 static inline int boundary_value(char c) 34 { 35 if (!(c & 0x80)) return c; 36 else if ((c & 0xf8) == 0xf0) return c & 0x07; 37 else if ((c & 0xf0) == 0xe0) return c & 0x0f; 38 else if ((c & 0xe0) == 0xc0) return c & 0x1f; 39 else return 0; 40 } 41 42 static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart) 43 { 44 unsigned int i = bytestart; 45 46 while (i < size) 47 { 48 i++; 49 if (boundary(s[i])) 50 break; 51 } 52 53 return i; 54 } 55 56 static unsigned int prevpos(char *s, unsigned int bytestart) 57 { 58 unsigned int i = bytestart; 59 60 while (i > 0) 61 { 62 i--; 63 if (boundary(s[i])) 64 break; 65 } 66 67 return i; 68 } 69 70 /* Unicode operations. */ 71 72 __attr __fn_native_unicode_unicode_len(__attr __args[]) 73 { 74 __attr * const _data = &__args[1]; 75 __attr * const _size = &__args[2]; 76 /* _data interpreted as string */ 77 char *s = _data->strvalue; 78 /* _size interpreted as int */ 79 int size = _size->intvalue; 80 unsigned int i, c = 0; 81 82 for (i = 0; i < size; i++) 83 if (boundary(s[i])) 84 c++; 85 86 /* Return the new integer. */ 87 return __new_int(c); 88 } 89 90 __attr __fn_native_unicode_unicode_ord(__attr __args[]) 91 { 92 __attr * const _data = &__args[1]; 93 __attr * const _size = &__args[2]; 94 /* _data interpreted as string */ 95 char *s = _data->strvalue; 96 /* _size interpreted as int */ 97 int size = _size->intvalue; 98 unsigned int i, c = 0, v; 99 100 for (i = 0; i < size; i++) 101 { 102 /* Evaluate the current character as a boundary. */ 103 104 v = boundary_value(s[i]); 105 106 /* Boundary with characters read: stop reading. */ 107 108 if (v && i) 109 break; 110 111 /* Boundary: initialise with the extracted value. */ 112 113 else if (v) 114 c = v; 115 116 /* Not a boundary: shift and combine with the continuation value. */ 117 118 else 119 c = (c << 6) | (s[i] & 0x3f); 120 } 121 122 /* Return the new integer. */ 123 return __new_int(c); 124 } 125 126 __attr __fn_native_unicode_unicode_substr(__attr __args[]) 127 { 128 __attr * const _data = &__args[1]; 129 __attr * const _size = &__args[2]; 130 __attr * const start = &__args[3]; 131 __attr * const end = &__args[4]; 132 __attr * const step = &__args[5]; 133 /* _data interpreted as string */ 134 char *s = _data->strvalue, *sub; 135 /* _size interpreted as int */ 136 int ss = _size->intvalue; 137 /* start.__data__ interpreted as int */ 138 int istart = __load_via_object(start->value, __data__).intvalue; 139 /* end.__data__ interpreted as int */ 140 int iend = __load_via_object(end->value, __data__).intvalue; 141 /* step.__data__ interpreted as int */ 142 int istep = __load_via_object(step->value, __data__).intvalue; 143 144 /* Calculate the number of characters. */ 145 size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1; 146 unsigned int indexes[nchar]; 147 148 unsigned int c, d, i, to, from, lastbyte = 0; 149 int resultsize = 0; 150 151 /* Find the indexes of the characters. */ 152 if (istep > 0) 153 { 154 /* Get the first byte position. */ 155 for (c = 0; c < istart; c++) 156 lastbyte = nextpos(s, ss, lastbyte); 157 158 /* Get each subsequent byte position. */ 159 for (c = istart, i = 0; i < nchar; c += istep, i++) 160 { 161 indexes[i] = lastbyte; 162 163 /* Add the character size to the result size. */ 164 resultsize += nextpos(s, ss, lastbyte) - lastbyte; 165 166 for (d = c; d < c + istep; d++) 167 lastbyte = nextpos(s, ss, lastbyte); 168 } 169 } 170 else 171 { 172 /* Get the first byte position. */ 173 for (c = 0; c < istart; c++) 174 lastbyte = nextpos(s, ss, lastbyte); 175 176 /* Get each subsequent byte position. */ 177 for (c = istart, i = 0; i < nchar; c += istep, i++) 178 { 179 indexes[i] = lastbyte; 180 181 /* Add the character size to the result size. */ 182 resultsize += nextpos(s, ss, lastbyte) - lastbyte; 183 184 for (d = c; d > c + istep; d--) 185 lastbyte = prevpos(s, lastbyte); 186 } 187 } 188 189 /* Reserve space for a new string. */ 190 sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char)); 191 192 /* Does not null terminate but final byte should be zero. */ 193 for (i = 0, to = 0; i < nchar; i++) 194 { 195 from = indexes[i]; 196 do 197 { 198 sub[to++] = s[from++]; 199 } while (!boundary(s[from])); 200 } 201 202 return __new_str(sub, resultsize); 203 } 204 205 __attr __fn_native_unicode_unicode_unichr(__attr __args[]) 206 { 207 __attr * const value = &__args[1]; 208 /* value interpreted as int */ 209 int i = value->intvalue; 210 unsigned int resultsize; 211 char *s; 212 213 if (i < 128) resultsize = 1; 214 else if (i < 2048) resultsize = 2; 215 else if (i < 65536) resultsize = 3; 216 else resultsize = 4; 217 218 /* Reserve space for a new string. */ 219 220 s = (char *) __ALLOCATE(resultsize + 1, sizeof(char)); 221 222 /* Populate the string. */ 223 224 if (i < 128) s[0] = (char) i; 225 else if (i < 2048) 226 { 227 s[0] = 0b11000000 | (i >> 6); 228 s[1] = 0b10000000 | (i & 0b00111111); 229 } 230 else if (i < 65536) 231 { 232 s[0] = 0b11100000 | (i >> 12); 233 s[1] = 0b10000000 | ((i >> 6) & 0b00111111); 234 s[2] = 0b10000000 | (i & 0b00111111); 235 } 236 else 237 { 238 s[0] = 0b11110000 | (i >> 18); 239 s[1] = 0b10000000 | ((i >> 12) & 0b00111111); 240 s[2] = 0b10000000 | ((i >> 6) & 0b00111111); 241 s[3] = 0b10000000 | (i & 0b00111111); 242 } 243 244 return __new_str(s, resultsize); 245 } 246 247 /* Module initialisation. */ 248 249 void __main_native_unicode() 250 { 251 }