paul@403 | 1 | /* Native functions for Unicode operations. |
paul@403 | 2 | |
paul@607 | 3 | Copyright (C) 2016, 2017 Paul Boddie <paul@boddie.org.uk> |
paul@403 | 4 | |
paul@403 | 5 | This program is free software; you can redistribute it and/or modify it under |
paul@403 | 6 | the terms of the GNU General Public License as published by the Free Software |
paul@403 | 7 | Foundation; either version 3 of the License, or (at your option) any later |
paul@403 | 8 | version. |
paul@403 | 9 | |
paul@403 | 10 | This program is distributed in the hope that it will be useful, but WITHOUT |
paul@403 | 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
paul@403 | 12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
paul@403 | 13 | details. |
paul@403 | 14 | |
paul@403 | 15 | You should have received a copy of the GNU General Public License along with |
paul@403 | 16 | this program. If not, see <http://www.gnu.org/licenses/>. |
paul@403 | 17 | */ |
paul@403 | 18 | |
paul@403 | 19 | #include "native/common.h" |
paul@403 | 20 | #include "types.h" |
paul@403 | 21 | #include "exceptions.h" |
paul@403 | 22 | #include "ops.h" |
paul@403 | 23 | #include "progconsts.h" |
paul@403 | 24 | #include "progops.h" |
paul@403 | 25 | #include "progtypes.h" |
paul@403 | 26 | #include "main.h" |
paul@403 | 27 | |
/* Return 1 if the byte c can start a character, 0 if it is a UTF-8
   continuation byte (bit pattern 10xxxxxx). Single-byte (0xxxxxxx) and lead
   (11xxxxxx) bytes are both boundaries. */

static inline int boundary(char c)
{
    /* Only the top two bits matter; everything except 10xxxxxx qualifies. */

    return (c & 0xc0) != 0x80;
}
paul@431 | 32 | |
/* Return the payload bits carried by a byte that starts a character:
   the byte itself for 0xxxxxxx, the low 5 bits for 110xxxxx, the low 4 bits
   for 1110xxxx and the low 3 bits for 11110xxx. Any other byte yields 0.

   Note that 0 is also a legitimate payload (for example, for the bytes 0x00,
   0xe0 and 0xf0), so a zero result does not by itself identify a
   continuation byte. */

static inline int boundary_value(char c)
{
    /* Work with the byte as a value in the range 0..255 regardless of the
       signedness of char. */

    int byte = c & 0xff;

    if (byte < 0x80) return byte;           /* 0xxxxxxx */
    if (byte < 0xc0) return 0;              /* 10xxxxxx */
    if (byte < 0xe0) return byte & 0x1f;    /* 110xxxxx */
    if (byte < 0xf0) return byte & 0x0f;    /* 1110xxxx */
    if (byte < 0xf8) return byte & 0x07;    /* 11110xxx */
    return 0;                               /* 11111xxx */
}
paul@534 | 41 | |
/* Return the byte offset at which the character after the one at bytestart
   begins, or size if no further character starts within the first size
   bytes of s. If bytestart is already at or beyond size, it is returned
   unchanged.

   The byte at s[size] is never inspected: reaching size ends the search, so
   the result does not depend on what follows the string data. */

static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart)
{
    unsigned int i = bytestart;

    if (i >= size)
        return i;

    /* Step over the current byte, then over any continuation bytes, checking
       the bound before looking at each byte. */

    do
    {
        i++;
    }
    while (i < size && !boundary(s[i]));

    return i;
}
paul@431 | 55 | |
/* Return the byte offset at which the character before bytestart begins:
   the nearest offset below bytestart holding a boundary byte, or 0 if the
   start of s is reached first. A bytestart of 0 yields 0. */

static unsigned int prevpos(char *s, unsigned int bytestart)
{
    unsigned int pos = bytestart;

    if (pos == 0)
        return 0;

    /* Move back one byte at a time until a boundary byte or offset 0. */

    do
    {
        pos--;
    }
    while (pos > 0 && !boundary(s[pos]));

    return pos;
}
paul@431 | 69 | |
paul@403 | 70 | /* Unicode operations. */ |
paul@403 | 71 | |
paul@664 | 72 | __attr __fn_native_unicode_unicode_len(__attr __self, __attr _data, __attr _size) |
paul@403 | 73 | { |
paul@664 | 74 | /* _data interpreted as string.__data__ */ |
paul@664 | 75 | char *s = _data.strvalue; |
paul@583 | 76 | /* _size interpreted as int */ |
paul@763 | 77 | int size = __TOINT(_size); |
paul@431 | 78 | unsigned int i, c = 0; |
paul@403 | 79 | |
paul@583 | 80 | for (i = 0; i < size; i++) |
paul@431 | 81 | if (boundary(s[i])) |
paul@403 | 82 | c++; |
paul@403 | 83 | |
paul@403 | 84 | /* Return the new integer. */ |
paul@403 | 85 | return __new_int(c); |
paul@403 | 86 | } |
paul@403 | 87 | |
paul@664 | 88 | __attr __fn_native_unicode_unicode_ord(__attr __self, __attr _data, __attr _size) |
paul@534 | 89 | { |
paul@664 | 90 | /* _data interpreted as string.__data__ */ |
paul@664 | 91 | char *s = _data.strvalue; |
paul@583 | 92 | /* _size interpreted as int */ |
paul@763 | 93 | int size = __TOINT(_size); |
paul@534 | 94 | unsigned int i, c = 0, v; |
paul@534 | 95 | |
paul@583 | 96 | for (i = 0; i < size; i++) |
paul@534 | 97 | { |
paul@534 | 98 | /* Evaluate the current character as a boundary. */ |
paul@534 | 99 | |
paul@534 | 100 | v = boundary_value(s[i]); |
paul@534 | 101 | |
paul@534 | 102 | /* Boundary with characters read: stop reading. */ |
paul@534 | 103 | |
paul@534 | 104 | if (v && i) |
paul@534 | 105 | break; |
paul@534 | 106 | |
paul@534 | 107 | /* Boundary: initialise with the extracted value. */ |
paul@534 | 108 | |
paul@534 | 109 | else if (v) |
paul@534 | 110 | c = v; |
paul@534 | 111 | |
paul@534 | 112 | /* Not a boundary: shift and combine with the continuation value. */ |
paul@534 | 113 | |
paul@534 | 114 | else |
paul@534 | 115 | c = (c << 6) | (s[i] & 0x3f); |
paul@534 | 116 | } |
paul@534 | 117 | |
paul@534 | 118 | /* Return the new integer. */ |
paul@534 | 119 | return __new_int(c); |
paul@534 | 120 | } |
paul@534 | 121 | |
paul@664 | 122 | __attr __fn_native_unicode_unicode_substr(__attr __self, __attr _data, __attr _size, __attr start, __attr end, __attr step) |
paul@431 | 123 | { |
paul@664 | 124 | /* _data interpreted as string.__data__ */ |
paul@664 | 125 | char *s = _data.strvalue, *sub; |
paul@583 | 126 | /* _size interpreted as int */ |
paul@763 | 127 | int ss = __TOINT(_size); |
paul@758 | 128 | /* start interpreted as int */ |
paul@763 | 129 | int istart = __TOINT(start); |
paul@758 | 130 | /* end interpreted as int */ |
paul@763 | 131 | int iend = __TOINT(end); |
paul@758 | 132 | /* step interpreted as int */ |
paul@763 | 133 | int istep = __TOINT(step); |
paul@431 | 134 | |
paul@431 | 135 | /* Calculate the number of characters. */ |
paul@431 | 136 | size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1; |
paul@431 | 137 | unsigned int indexes[nchar]; |
paul@431 | 138 | |
paul@431 | 139 | unsigned int c, d, i, to, from, lastbyte = 0; |
paul@583 | 140 | int resultsize = 0; |
paul@431 | 141 | |
paul@431 | 142 | /* Find the indexes of the characters. */ |
paul@431 | 143 | if (istep > 0) |
paul@431 | 144 | { |
paul@431 | 145 | /* Get the first byte position. */ |
paul@431 | 146 | for (c = 0; c < istart; c++) |
paul@569 | 147 | lastbyte = nextpos(s, ss, lastbyte); |
paul@431 | 148 | |
paul@431 | 149 | /* Get each subsequent byte position. */ |
paul@431 | 150 | for (c = istart, i = 0; i < nchar; c += istep, i++) |
paul@431 | 151 | { |
paul@431 | 152 | indexes[i] = lastbyte; |
paul@431 | 153 | |
paul@431 | 154 | /* Add the character size to the result size. */ |
paul@569 | 155 | resultsize += nextpos(s, ss, lastbyte) - lastbyte; |
paul@431 | 156 | |
paul@431 | 157 | for (d = c; d < c + istep; d++) |
paul@569 | 158 | lastbyte = nextpos(s, ss, lastbyte); |
paul@431 | 159 | } |
paul@431 | 160 | } |
paul@431 | 161 | else |
paul@431 | 162 | { |
paul@431 | 163 | /* Get the first byte position. */ |
paul@431 | 164 | for (c = 0; c < istart; c++) |
paul@569 | 165 | lastbyte = nextpos(s, ss, lastbyte); |
paul@431 | 166 | |
paul@431 | 167 | /* Get each subsequent byte position. */ |
paul@431 | 168 | for (c = istart, i = 0; i < nchar; c += istep, i++) |
paul@431 | 169 | { |
paul@431 | 170 | indexes[i] = lastbyte; |
paul@431 | 171 | |
paul@431 | 172 | /* Add the character size to the result size. */ |
paul@569 | 173 | resultsize += nextpos(s, ss, lastbyte) - lastbyte; |
paul@431 | 174 | |
paul@431 | 175 | for (d = c; d > c + istep; d--) |
paul@431 | 176 | lastbyte = prevpos(s, lastbyte); |
paul@431 | 177 | } |
paul@431 | 178 | } |
paul@431 | 179 | |
paul@431 | 180 | /* Reserve space for a new string. */ |
paul@431 | 181 | sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char)); |
paul@431 | 182 | |
paul@431 | 183 | /* Does not null terminate but final byte should be zero. */ |
paul@431 | 184 | for (i = 0, to = 0; i < nchar; i++) |
paul@431 | 185 | { |
paul@431 | 186 | from = indexes[i]; |
paul@431 | 187 | do |
paul@431 | 188 | { |
paul@431 | 189 | sub[to++] = s[from++]; |
paul@431 | 190 | } while (!boundary(s[from])); |
paul@431 | 191 | } |
paul@431 | 192 | |
paul@431 | 193 | return __new_str(sub, resultsize); |
paul@431 | 194 | } |
paul@431 | 195 | |
paul@664 | 196 | __attr __fn_native_unicode_unicode_unichr(__attr __self, __attr value) |
paul@607 | 197 | { |
paul@607 | 198 | /* value interpreted as int */ |
paul@763 | 199 | int i = __TOINT(value); |
paul@607 | 200 | unsigned int resultsize; |
paul@607 | 201 | char *s; |
paul@607 | 202 | |
paul@607 | 203 | if (i < 128) resultsize = 1; |
paul@607 | 204 | else if (i < 2048) resultsize = 2; |
paul@607 | 205 | else if (i < 65536) resultsize = 3; |
paul@607 | 206 | else resultsize = 4; |
paul@607 | 207 | |
paul@607 | 208 | /* Reserve space for a new string. */ |
paul@607 | 209 | |
paul@607 | 210 | s = (char *) __ALLOCATE(resultsize + 1, sizeof(char)); |
paul@607 | 211 | |
paul@607 | 212 | /* Populate the string. */ |
paul@607 | 213 | |
paul@607 | 214 | if (i < 128) s[0] = (char) i; |
paul@607 | 215 | else if (i < 2048) |
paul@607 | 216 | { |
paul@607 | 217 | s[0] = 0b11000000 | (i >> 6); |
paul@607 | 218 | s[1] = 0b10000000 | (i & 0b00111111); |
paul@607 | 219 | } |
paul@607 | 220 | else if (i < 65536) |
paul@607 | 221 | { |
paul@607 | 222 | s[0] = 0b11100000 | (i >> 12); |
paul@607 | 223 | s[1] = 0b10000000 | ((i >> 6) & 0b00111111); |
paul@607 | 224 | s[2] = 0b10000000 | (i & 0b00111111); |
paul@607 | 225 | } |
paul@607 | 226 | else |
paul@607 | 227 | { |
paul@607 | 228 | s[0] = 0b11110000 | (i >> 18); |
paul@607 | 229 | s[1] = 0b10000000 | ((i >> 12) & 0b00111111); |
paul@607 | 230 | s[2] = 0b10000000 | ((i >> 6) & 0b00111111); |
paul@607 | 231 | s[3] = 0b10000000 | (i & 0b00111111); |
paul@607 | 232 | } |
paul@607 | 233 | |
paul@607 | 234 | return __new_str(s, resultsize); |
paul@607 | 235 | } |
paul@607 | 236 | |
paul@403 | 237 | /* Module initialisation. */ |
paul@403 | 238 | |
void __main_native_unicode()
{
    /* Nothing to initialise for this module. */
}