1 /* Native functions for Unicode operations. 2 3 Copyright (C) 2016, 2017 Paul Boddie <paul@boddie.org.uk> 4 5 This program is free software; you can redistribute it and/or modify it under 6 the terms of the GNU General Public License as published by the Free Software 7 Foundation; either version 3 of the License, or (at your option) any later 8 version. 9 10 This program is distributed in the hope that it will be useful, but WITHOUT 11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 13 details. 14 15 You should have received a copy of the GNU General Public License along with 16 this program. If not, see <http://www.gnu.org/licenses/>. 17 */ 18 19 #include "native/common.h" 20 #include "types.h" 21 #include "exceptions.h" 22 #include "ops.h" 23 #include "progconsts.h" 24 #include "progops.h" 25 #include "progtypes.h" 26 #include "main.h" 27 28 static inline int boundary(char c) 29 { 30 return ((c & 0xc0) == 0xc0) || !(c & 0x80); 31 } 32 33 static inline int boundary_value(char c) 34 { 35 if (!(c & 0x80)) return c; 36 else if ((c & 0xf8) == 0xf0) return c & 0x07; 37 else if ((c & 0xf0) == 0xe0) return c & 0x0f; 38 else if ((c & 0xe0) == 0xc0) return c & 0x1f; 39 else return 0; 40 } 41 42 static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart) 43 { 44 unsigned int i = bytestart; 45 46 while (i < size) 47 { 48 i++; 49 if (boundary(s[i])) 50 break; 51 } 52 53 return i; 54 } 55 56 static unsigned int prevpos(char *s, unsigned int bytestart) 57 { 58 unsigned int i = bytestart; 59 60 while (i > 0) 61 { 62 i--; 63 if (boundary(s[i])) 64 break; 65 } 66 67 return i; 68 } 69 70 /* Unicode operations. */ 71 72 __attr __fn_native_unicode_unicode_len(__attr __self, __attr _data, __attr _size) 73 { 74 /* _data interpreted as string.__data__ */ 75 char *s = _data.strvalue; 76 /* _size interpreted as int */ 77 int size = __TOINT(_size); 78 unsigned int i, c = 0; 79 80 for (i = 0; i < size; i++) 81 if (boundary(s[i])) 82 c++; 83 84 /* Return the new integer. */ 85 return __new_int(c); 86 } 87 88 __attr __fn_native_unicode_unicode_ord(__attr __self, __attr _data, __attr _size) 89 { 90 /* _data interpreted as string.__data__ */ 91 char *s = _data.strvalue; 92 /* _size interpreted as int */ 93 int size = __TOINT(_size); 94 unsigned int i, c = 0, v; 95 96 for (i = 0; i < size; i++) 97 { 98 /* Evaluate the current character as a boundary. */ 99 100 v = boundary_value(s[i]); 101 102 /* Boundary with characters read: stop reading. */ 103 104 if (v && i) 105 break; 106 107 /* Boundary: initialise with the extracted value. */ 108 109 else if (v) 110 c = v; 111 112 /* Not a boundary: shift and combine with the continuation value. */ 113 114 else 115 c = (c << 6) | (s[i] & 0x3f); 116 } 117 118 /* Return the new integer. */ 119 return __new_int(c); 120 } 121 122 __attr __fn_native_unicode_unicode_substr(__attr __self, __attr _data, __attr _size, __attr start, __attr end, __attr step) 123 { 124 /* _data interpreted as string.__data__ */ 125 char *s = _data.strvalue, *sub; 126 /* _size interpreted as int */ 127 int ss = __TOINT(_size); 128 /* start interpreted as int */ 129 int istart = __TOINT(start); 130 /* end interpreted as int */ 131 int iend = __TOINT(end); 132 /* step interpreted as int */ 133 int istep = __TOINT(step); 134 135 /* Calculate the number of characters. */ 136 size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1; 137 unsigned int indexes[nchar]; 138 139 unsigned int c, d, i, to, from, lastbyte = 0; 140 int resultsize = 0; 141 142 /* Find the indexes of the characters. */ 143 if (istep > 0) 144 { 145 /* Get the first byte position. */ 146 for (c = 0; c < istart; c++) 147 lastbyte = nextpos(s, ss, lastbyte); 148 149 /* Get each subsequent byte position. */ 150 for (c = istart, i = 0; i < nchar; c += istep, i++) 151 { 152 indexes[i] = lastbyte; 153 154 /* Add the character size to the result size. */ 155 resultsize += nextpos(s, ss, lastbyte) - lastbyte; 156 157 for (d = c; d < c + istep; d++) 158 lastbyte = nextpos(s, ss, lastbyte); 159 } 160 } 161 else 162 { 163 /* Get the first byte position. */ 164 for (c = 0; c < istart; c++) 165 lastbyte = nextpos(s, ss, lastbyte); 166 167 /* Get each subsequent byte position. */ 168 for (c = istart, i = 0; i < nchar; c += istep, i++) 169 { 170 indexes[i] = lastbyte; 171 172 /* Add the character size to the result size. */ 173 resultsize += nextpos(s, ss, lastbyte) - lastbyte; 174 175 for (d = c; d > c + istep; d--) 176 lastbyte = prevpos(s, lastbyte); 177 } 178 } 179 180 /* Reserve space for a new string. */ 181 sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char)); 182 183 /* Does not null terminate but final byte should be zero. */ 184 for (i = 0, to = 0; i < nchar; i++) 185 { 186 from = indexes[i]; 187 do 188 { 189 sub[to++] = s[from++]; 190 } while (!boundary(s[from])); 191 } 192 193 return __new_str(sub, resultsize); 194 } 195 196 __attr __fn_native_unicode_unicode_unichr(__attr __self, __attr value) 197 { 198 /* value interpreted as int */ 199 int i = __TOINT(value); 200 unsigned int resultsize; 201 char *s; 202 203 if (i < 128) resultsize = 1; 204 else if (i < 2048) resultsize = 2; 205 else if (i < 65536) resultsize = 3; 206 else resultsize = 4; 207 208 /* Reserve space for a new string. */ 209 210 s = (char *) __ALLOCATE(resultsize + 1, sizeof(char)); 211 212 /* Populate the string. */ 213 214 if (i < 128) s[0] = (char) i; 215 else if (i < 2048) 216 { 217 s[0] = 0b11000000 | (i >> 6); 218 s[1] = 0b10000000 | (i & 0b00111111); 219 } 220 else if (i < 65536) 221 { 222 s[0] = 0b11100000 | (i >> 12); 223 s[1] = 0b10000000 | ((i >> 6) & 0b00111111); 224 s[2] = 0b10000000 | (i & 0b00111111); 225 } 226 else 227 { 228 s[0] = 0b11110000 | (i >> 18); 229 s[1] = 0b10000000 | ((i >> 12) & 0b00111111); 230 s[2] = 0b10000000 | ((i >> 6) & 0b00111111); 231 s[3] = 0b10000000 | (i & 0b00111111); 232 } 233 234 return __new_str(s, resultsize); 235 } 236 237 /* Module initialisation. */ 238 239 void __main_native_unicode() 240 { 241 }