1 /* 2 * Copyright (c) 2014 SGI. 3 * Copyright (c) 2018 Collabora Ltd. 4 * All rights reserved. 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License as 8 * published by the Free Software Foundation. 9 * 10 * This program is distributed in the hope that it would be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 */ 16 17 /* 18 * This code is adapted from the Linux Kernel. We have a 19 * userspace version here such that the hashes will match that 20 * implementation. 21 */ 22 23 #include "config.h" 24 #include <stdint.h> 25 #include <unistd.h> 26 #include <string.h> 27 #include <limits.h> 28 #include <errno.h> 29 30 #include "ext2_fs.h" 31 #include "ext2fs.h" 32 #include "ext2fsP.h" 33 34 /* Encoding a unicode version number as a single unsigned int. */ 35 #define UNICODE_MAJ_SHIFT (16) 36 #define UNICODE_MIN_SHIFT (8) 37 38 #define UNICODE_AGE(MAJ, MIN, REV) \ 39 (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ 40 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ 41 ((unsigned int)(REV))) 42 43 /* Needed in struct utf8cursor below. */ 44 #define UTF8HANGULLEAF (12) 45 46 /* 47 * Cursor structure used by the normalizer. 48 */ 49 struct utf8cursor { 50 const struct utf8data *data; 51 const char *s; 52 const char *p; 53 const char *ss; 54 const char *sp; 55 unsigned int len; 56 unsigned int slen; 57 short int ccc; 58 short int nccc; 59 unsigned char hangul[UTF8HANGULLEAF]; 60 }; 61 62 /* 63 * Initialize a utf8cursor to normalize a string. 64 * Returns 0 on success. 65 * Returns -1 on failure. 66 */ 67 // extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, 68 // const char *s); 69 // extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, 70 // const char *s, size_t len); 71 72 /* 73 * Get the next byte in the normalization. 74 * Returns a value > 0 && < 256 on success. 75 * Returns 0 when the end of the normalization is reached. 76 * Returns -1 if the string being normalized is not valid UTF-8. 77 */ 78 // extern int utf8byte(struct utf8cursor *u8c); 79 80 81 struct utf8data { 82 unsigned int maxage; 83 unsigned int offset; 84 }; 85 86 #define __INCLUDED_FROM_UTF8NORM_C__ 87 #include "utf8data.h" 88 #undef __INCLUDED_FROM_UTF8NORM_C__ 89 90 #define ARRAY_SIZE(array) \ 91 (sizeof(array) / sizeof(array[0])) 92 93 #if 0 94 /* Highest unicode version supported by the data tables. */ 95 static int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev) 96 { 97 int i = ARRAY_SIZE(utf8agetab) - 1; 98 unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev); 99 100 while (i >= 0 && utf8agetab[i] != 0) { 101 if (sb_utf8version == utf8agetab[i]) 102 return 1; 103 i--; 104 } 105 return 0; 106 } 107 #endif 108 109 #if 0 110 static int utf8version_latest(void) 111 { 112 return utf8vers; 113 } 114 #endif 115 116 /* 117 * UTF-8 valid ranges. 118 * 119 * The UTF-8 encoding spreads the bits of a 32bit word over several 120 * bytes. This table gives the ranges that can be held and how they'd 121 * be represented. 122 * 123 * 0x00000000 0x0000007F: 0xxxxxxx 124 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx 125 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 126 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 127 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 128 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 129 * 130 * There is an additional requirement on UTF-8, in that only the 131 * shortest representation of a 32bit value is to be used. A decoder 132 * must not decode sequences that do not satisfy this requirement. 133 * Thus the allowed ranges have a lower bound. 134 * 135 * 0x00000000 0x0000007F: 0xxxxxxx 136 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx 137 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 138 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 139 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 140 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 141 * 142 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, 143 * 17 planes of 65536 values. This limits the sequences actually seen 144 * even more, to just the following. 145 * 146 * 0 - 0x7F: 0 - 0x7F 147 * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF 148 * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF 149 * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF 150 * 151 * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed. 152 * 153 * Note that the longest sequence seen with valid usage is 4 bytes, 154 * the same a single UTF-32 character. This makes the UTF-8 155 * representation of Unicode strictly smaller than UTF-32. 156 * 157 * The shortest sequence requirement was introduced by: 158 * Corrigendum #1: UTF-8 Shortest Form 159 * It can be found here: 160 * http://www.unicode.org/versions/corrigendum1.html 161 * 162 */ 163 164 /* 165 * Return the number of bytes used by the current UTF-8 sequence. 166 * Assumes the input points to the first byte of a valid UTF-8 167 * sequence. 168 */ 169 static inline int utf8clen(const char *s) 170 { 171 unsigned char c = *s; 172 173 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); 174 } 175 176 /* 177 * Decode a 3-byte UTF-8 sequence. 178 */ 179 static unsigned int 180 utf8decode3(const char *str) 181 { 182 unsigned int uc; 183 184 uc = *str++ & 0x0F; 185 uc <<= 6; 186 uc |= *str++ & 0x3F; 187 uc <<= 6; 188 uc |= *str++ & 0x3F; 189 190 return uc; 191 } 192 193 /* 194 * Encode a 3-byte UTF-8 sequence. 195 */ 196 static int 197 utf8encode3(char *str, unsigned int val) 198 { 199 str[2] = (val & 0x3F) | 0x80; 200 val >>= 6; 201 str[1] = (val & 0x3F) | 0x80; 202 val >>= 6; 203 str[0] = val | 0xE0; 204 205 return 3; 206 } 207 208 /* 209 * utf8trie_t 210 * 211 * A compact binary tree, used to decode UTF-8 characters. 212 * 213 * Internal nodes are one byte for the node itself, and up to three 214 * bytes for an offset into the tree. The first byte contains the 215 * following information: 216 * NEXTBYTE - flag - advance to next byte if set 217 * BITNUM - 3 bit field - the bit number to tested 218 * OFFLEN - 2 bit field - number of bytes in the offset 219 * if offlen == 0 (non-branching node) 220 * RIGHTPATH - 1 bit field - set if the following node is for the 221 * right-hand path (tested bit is set) 222 * TRIENODE - 1 bit field - set if the following node is an internal 223 * node, otherwise it is a leaf node 224 * if offlen != 0 (branching node) 225 * LEFTNODE - 1 bit field - set if the left-hand node is internal 226 * RIGHTNODE - 1 bit field - set if the right-hand node is internal 227 * 228 * Due to the way utf8 works, there cannot be branching nodes with 229 * NEXTBYTE set, and moreover those nodes always have a righthand 230 * descendant. 231 */ 232 typedef const unsigned char utf8trie_t; 233 #define BITNUM 0x07 234 #define NEXTBYTE 0x08 235 #define OFFLEN 0x30 236 #define OFFLEN_SHIFT 4 237 #define RIGHTPATH 0x40 238 #define TRIENODE 0x80 239 #define RIGHTNODE 0x40 240 #define LEFTNODE 0x80 241 242 /* 243 * utf8leaf_t 244 * 245 * The leaves of the trie are embedded in the trie, and so the same 246 * underlying datatype: unsigned char. 247 * 248 * leaf[0]: The unicode version, stored as a generation number that is 249 * an index into utf8agetab[]. With this we can filter code 250 * points based on the unicode version in which they were 251 * defined. The CCC of a non-defined code point is 0. 252 * leaf[1]: Canonical Combining Class. During normalization, we need 253 * to do a stable sort into ascending order of all characters 254 * with a non-zero CCC that occur between two characters with 255 * a CCC of 0, or at the begin or end of a string. 256 * The unicode standard guarantees that all CCC values are 257 * between 0 and 254 inclusive, which leaves 255 available as 258 * a special value. 259 * Code points with CCC 0 are known as stoppers. 260 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the 261 * start of a NUL-terminated string that is the decomposition 262 * of the character. 263 * The CCC of a decomposable character is the same as the CCC 264 * of the first character of its decomposition. 265 * Some characters decompose as the empty string: these are 266 * characters with the Default_Ignorable_Code_Point property. 267 * These do affect normalization, as they all have CCC 0. 268 * 269 * The decompositions in the trie have been fully expanded, with the 270 * exception of Hangul syllables, which are decomposed algorithmically. 271 * 272 * Casefolding, if applicable, is also done using decompositions. 273 * 274 * The trie is constructed in such a way that leaves exist for all 275 * UTF-8 sequences that match the criteria from the "UTF-8 valid 276 * ranges" comment above, and only for those sequences. Therefore a 277 * lookup in the trie can be used to validate the UTF-8 input. 278 */ 279 typedef const unsigned char utf8leaf_t; 280 281 #define LEAF_GEN(LEAF) ((LEAF)[0]) 282 #define LEAF_CCC(LEAF) ((LEAF)[1]) 283 #define LEAF_STR(LEAF) ((const char *)((LEAF) + 2)) 284 285 #define MINCCC (0) 286 #define MAXCCC (254) 287 #define STOPPER (0) 288 #define DECOMPOSE (255) 289 290 /* Marker for hangul syllable decomposition. */ 291 #define HANGUL ((char)(255)) 292 /* Size of the synthesized leaf used for Hangul syllable decomposition. */ 293 #define UTF8HANGULLEAF (12) 294 295 /* 296 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 297 * 298 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 299 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 300 * 301 * SBase = 0xAC00 302 * LBase = 0x1100 303 * VBase = 0x1161 304 * TBase = 0x11A7 305 * LCount = 19 306 * VCount = 21 307 * TCount = 28 308 * NCount = 588 (VCount * TCount) 309 * SCount = 11172 (LCount * NCount) 310 * 311 * Decomposition: 312 * SIndex = s - SBase 313 * 314 * LV (Canonical/Full) 315 * LIndex = SIndex / NCount 316 * VIndex = (Sindex % NCount) / TCount 317 * LPart = LBase + LIndex 318 * VPart = VBase + VIndex 319 * 320 * LVT (Canonical) 321 * LVIndex = (SIndex / TCount) * TCount 322 * TIndex = (Sindex % TCount) 323 * LVPart = SBase + LVIndex 324 * TPart = TBase + TIndex 325 * 326 * LVT (Full) 327 * LIndex = SIndex / NCount 328 * VIndex = (Sindex % NCount) / TCount 329 * TIndex = (Sindex % TCount) 330 * LPart = LBase + LIndex 331 * VPart = VBase + VIndex 332 * if (TIndex == 0) { 333 * d = <LPart, VPart> 334 * } else { 335 * TPart = TBase + TIndex 336 * d = <LPart, TPart, VPart> 337 * } 338 */ 339 340 /* Constants */ 341 #define SB (0xAC00) 342 #define LB (0x1100) 343 #define VB (0x1161) 344 #define TB (0x11A7) 345 #define LC (19) 346 #define VC (21) 347 #define TC (28) 348 #define NC (VC * TC) 349 #define SC (LC * NC) 350 351 /* Algorithmic decomposition of hangul syllable. */ 352 static utf8leaf_t * 353 utf8hangul(const char *str, unsigned char *hangul) 354 { 355 unsigned int si; 356 unsigned int li; 357 unsigned int vi; 358 unsigned int ti; 359 unsigned char *h; 360 361 /* Calculate the SI, LI, VI, and TI values. */ 362 si = utf8decode3(str) - SB; 363 li = si / NC; 364 vi = (si % NC) / TC; 365 ti = si % TC; 366 367 /* Fill in base of leaf. */ 368 h = hangul; 369 LEAF_GEN(h) = 2; 370 LEAF_CCC(h) = DECOMPOSE; 371 h += 2; 372 373 /* Add LPart, a 3-byte UTF-8 sequence. */ 374 h += utf8encode3((char *)h, li + LB); 375 376 /* Add VPart, a 3-byte UTF-8 sequence. */ 377 h += utf8encode3((char *)h, vi + VB); 378 379 /* Add TPart if required, also a 3-byte UTF-8 sequence. */ 380 if (ti) 381 h += utf8encode3((char *)h, ti + TB); 382 383 /* Terminate string. */ 384 h[0] = '\0'; 385 386 return hangul; 387 } 388 389 /* 390 * Use trie to scan s, touching at most len bytes. 391 * Returns the leaf if one exists, NULL otherwise. 392 * 393 * A non-NULL return guarantees that the UTF-8 sequence starting at s 394 * is well-formed and corresponds to a known unicode code point. The 395 * shorthand for this will be "is valid UTF-8 unicode". 396 */ 397 static utf8leaf_t *utf8nlookup(const struct utf8data *data, 398 unsigned char *hangul, const char *s, size_t len) 399 { 400 utf8trie_t *trie; 401 int offlen; 402 int offset; 403 int mask; 404 int node; 405 406 if (!data) 407 return NULL; 408 if (len == 0) 409 return NULL; 410 411 trie = utf8data + data->offset; 412 node = 1; 413 while (node) { 414 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; 415 if (*trie & NEXTBYTE) { 416 if (--len == 0) 417 return NULL; 418 s++; 419 } 420 mask = 1 << (*trie & BITNUM); 421 if (*s & mask) { 422 /* Right leg */ 423 if (offlen) { 424 /* Right node at offset of trie */ 425 node = (*trie & RIGHTNODE); 426 offset = trie[offlen]; 427 while (--offlen) { 428 offset <<= 8; 429 offset |= trie[offlen]; 430 } 431 trie += offset; 432 } else if (*trie & RIGHTPATH) { 433 /* Right node after this node */ 434 node = (*trie & TRIENODE); 435 trie++; 436 } else { 437 /* No right node. */ 438 return NULL; 439 } 440 } else { 441 /* Left leg */ 442 if (offlen) { 443 /* Left node after this node. */ 444 node = (*trie & LEFTNODE); 445 trie += offlen + 1; 446 } else if (*trie & RIGHTPATH) { 447 /* No left node. */ 448 return NULL; 449 } else { 450 /* Left node after this node */ 451 node = (*trie & TRIENODE); 452 trie++; 453 } 454 } 455 } 456 /* 457 * Hangul decomposition is done algorithmically. These are the 458 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is 459 * always 3 bytes long, so s has been advanced twice, and the 460 * start of the sequence is at s-2. 461 */ 462 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) 463 trie = utf8hangul(s - 2, hangul); 464 return trie; 465 } 466 467 /* 468 * Use trie to scan s. 469 * Returns the leaf if one exists, NULL otherwise. 470 * 471 * Forwards to utf8nlookup(). 472 */ 473 static utf8leaf_t *utf8lookup(const struct utf8data *data, 474 unsigned char *hangul, const char *s) 475 { 476 return utf8nlookup(data, hangul, s, (size_t)-1); 477 } 478 479 #if 0 480 /* 481 * Maximum age of any character in s. 482 * Return -1 if s is not valid UTF-8 unicode. 483 * Return 0 if only non-assigned code points are used. 484 */ 485 static int utf8agemax(const struct utf8data *data, const char *s) 486 { 487 utf8leaf_t *leaf; 488 int age = 0; 489 int leaf_age; 490 unsigned char hangul[UTF8HANGULLEAF]; 491 492 if (!data) 493 return -1; 494 495 while (*s) { 496 leaf = utf8lookup(data, hangul, s); 497 if (!leaf) 498 return -1; 499 500 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 501 if (leaf_age <= data->maxage && leaf_age > age) 502 age = leaf_age; 503 s += utf8clen(s); 504 } 505 return age; 506 } 507 #endif 508 509 #if 0 510 /* 511 * Minimum age of any character in s. 512 * Return -1 if s is not valid UTF-8 unicode. 513 * Return 0 if non-assigned code points are used. 514 */ 515 static int utf8agemin(const struct utf8data *data, const char *s) 516 { 517 utf8leaf_t *leaf; 518 int age; 519 int leaf_age; 520 unsigned char hangul[UTF8HANGULLEAF]; 521 522 if (!data) 523 return -1; 524 age = data->maxage; 525 while (*s) { 526 leaf = utf8lookup(data, hangul, s); 527 if (!leaf) 528 return -1; 529 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 530 if (leaf_age <= data->maxage && leaf_age < age) 531 age = leaf_age; 532 s += utf8clen(s); 533 } 534 return age; 535 } 536 #endif 537 538 #if 0 539 /* 540 * Maximum age of any character in s, touch at most len bytes. 541 * Return -1 if s is not valid UTF-8 unicode. 542 */ 543 static int utf8nagemax(const struct utf8data *data, const char *s, size_t len) 544 { 545 utf8leaf_t *leaf; 546 int age = 0; 547 int leaf_age; 548 unsigned char hangul[UTF8HANGULLEAF]; 549 550 if (!data) 551 return -1; 552 553 while (len && *s) { 554 leaf = utf8nlookup(data, hangul, s, len); 555 if (!leaf) 556 return -1; 557 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 558 if (leaf_age <= data->maxage && leaf_age > age) 559 age = leaf_age; 560 len -= utf8clen(s); 561 s += utf8clen(s); 562 } 563 return age; 564 } 565 #endif 566 567 #if 0 568 /* 569 * Maximum age of any character in s, touch at most len bytes. 570 * Return -1 if s is not valid UTF-8 unicode. 571 */ 572 static int utf8nagemin(const struct utf8data *data, const char *s, size_t len) 573 { 574 utf8leaf_t *leaf; 575 int leaf_age; 576 int age; 577 unsigned char hangul[UTF8HANGULLEAF]; 578 579 if (!data) 580 return -1; 581 age = data->maxage; 582 while (len && *s) { 583 leaf = utf8nlookup(data, hangul, s, len); 584 if (!leaf) 585 return -1; 586 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 587 if (leaf_age <= data->maxage && leaf_age < age) 588 age = leaf_age; 589 len -= utf8clen(s); 590 s += utf8clen(s); 591 } 592 return age; 593 } 594 #endif 595 596 #if 0 597 /* 598 * Length of the normalization of s. 599 * Return -1 if s is not valid UTF-8 unicode. 600 * 601 * A string of Default_Ignorable_Code_Point has length 0. 602 */ 603 static ssize_t utf8len(const struct utf8data *data, const char *s) 604 { 605 utf8leaf_t *leaf; 606 size_t ret = 0; 607 unsigned char hangul[UTF8HANGULLEAF]; 608 609 if (!data) 610 return -1; 611 while (*s) { 612 leaf = utf8lookup(data, hangul, s); 613 if (!leaf) 614 return -1; 615 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) 616 ret += utf8clen(s); 617 else if (LEAF_CCC(leaf) == DECOMPOSE) 618 ret += strlen(LEAF_STR(leaf)); 619 else 620 ret += utf8clen(s); 621 s += utf8clen(s); 622 } 623 return ret; 624 } 625 #endif 626 627 #if 0 628 /* 629 * Length of the normalization of s, touch at most len bytes. 630 * Return -1 if s is not valid UTF-8 unicode. 631 */ 632 static ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) 633 { 634 utf8leaf_t *leaf; 635 size_t ret = 0; 636 unsigned char hangul[UTF8HANGULLEAF]; 637 638 if (!data) 639 return -1; 640 while (len && *s) { 641 leaf = utf8nlookup(data, hangul, s, len); 642 if (!leaf) 643 return -1; 644 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) 645 ret += utf8clen(s); 646 else if (LEAF_CCC(leaf) == DECOMPOSE) 647 ret += strlen(LEAF_STR(leaf)); 648 else 649 ret += utf8clen(s); 650 len -= utf8clen(s); 651 s += utf8clen(s); 652 } 653 return ret; 654 } 655 #endif 656 657 /* 658 * Set up an utf8cursor for use by utf8byte(). 659 * 660 * u8c : pointer to cursor. 661 * data : const struct utf8data to use for normalization. 662 * s : string. 663 * len : length of s. 664 * 665 * Returns -1 on error, 0 on success. 666 */ 667 static int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, 668 const char *s, size_t len) 669 { 670 if (!data) 671 return -1; 672 if (!s) 673 return -1; 674 u8c->data = data; 675 u8c->s = s; 676 u8c->p = NULL; 677 u8c->ss = NULL; 678 u8c->sp = NULL; 679 u8c->len = len; 680 u8c->slen = 0; 681 u8c->ccc = STOPPER; 682 u8c->nccc = STOPPER; 683 /* Check we didn't clobber the maximum length. */ 684 if (u8c->len != len) 685 return -1; 686 /* The first byte of s may not be an utf8 continuation. */ 687 if (len > 0 && (*s & 0xC0) == 0x80) 688 return -1; 689 return 0; 690 } 691 692 #if 0 693 /* 694 * Set up an utf8cursor for use by utf8byte(). 695 * 696 * u8c : pointer to cursor. 697 * data : const struct utf8data to use for normalization. 698 * s : NUL-terminated string. 699 * 700 * Returns -1 on error, 0 on success. 701 */ 702 static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, 703 const char *s) 704 { 705 return utf8ncursor(u8c, data, s, (unsigned int)-1); 706 } 707 #endif 708 709 /* 710 * Get one byte from the normalized form of the string described by u8c. 711 * 712 * Returns the byte cast to an unsigned char on succes, and -1 on failure. 713 * 714 * The cursor keeps track of the location in the string in u8c->s. 715 * When a character is decomposed, the current location is stored in 716 * u8c->p, and u8c->s is set to the start of the decomposition. Note 717 * that bytes from a decomposition do not count against u8c->len. 718 * 719 * Characters are emitted if they match the current CCC in u8c->ccc. 720 * Hitting end-of-string while u8c->ccc == STOPPER means we're done, 721 * and the function returns 0 in that case. 722 * 723 * Sorting by CCC is done by repeatedly scanning the string. The 724 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at 725 * the start of the scan. The first pass finds the lowest CCC to be 726 * emitted and stores it in u8c->nccc, the second pass emits the 727 * characters with this CCC and finds the next lowest CCC. This limits 728 * the number of passes to 1 + the number of different CCCs in the 729 * sequence being scanned. 730 * 731 * Therefore: 732 * u8c->p != NULL -> a decomposition is being scanned. 733 * u8c->ss != NULL -> this is a repeating scan. 734 * u8c->ccc == -1 -> this is the first scan of a repeating scan. 735 */ 736 static int utf8byte(struct utf8cursor *u8c) 737 { 738 utf8leaf_t *leaf; 739 int ccc; 740 741 for (;;) { 742 /* Check for the end of a decomposed character. */ 743 if (u8c->p && *u8c->s == '\0') { 744 u8c->s = u8c->p; 745 u8c->p = NULL; 746 } 747 748 /* Check for end-of-string. */ 749 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { 750 /* There is no next byte. */ 751 if (u8c->ccc == STOPPER) 752 return 0; 753 /* End-of-string during a scan counts as a stopper. */ 754 ccc = STOPPER; 755 goto ccc_mismatch; 756 } else if ((*u8c->s & 0xC0) == 0x80) { 757 /* This is a continuation of the current character. */ 758 if (!u8c->p) 759 u8c->len--; 760 return (unsigned char)*u8c->s++; 761 } 762 763 /* Look up the data for the current character. */ 764 if (u8c->p) { 765 leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); 766 } else { 767 leaf = utf8nlookup(u8c->data, u8c->hangul, 768 u8c->s, u8c->len); 769 } 770 771 /* No leaf found implies that the input is a binary blob. */ 772 if (!leaf) 773 return -1; 774 775 ccc = LEAF_CCC(leaf); 776 /* Characters that are too new have CCC 0. */ 777 if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) { 778 ccc = STOPPER; 779 } else if (ccc == DECOMPOSE) { 780 u8c->len -= utf8clen(u8c->s); 781 u8c->p = u8c->s + utf8clen(u8c->s); 782 u8c->s = LEAF_STR(leaf); 783 /* Empty decomposition implies CCC 0. */ 784 if (*u8c->s == '\0') { 785 if (u8c->ccc == STOPPER) 786 continue; 787 ccc = STOPPER; 788 goto ccc_mismatch; 789 } 790 791 leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); 792 if (!leaf) 793 return -1; 794 ccc = LEAF_CCC(leaf); 795 } 796 797 /* 798 * If this is not a stopper, then see if it updates 799 * the next canonical class to be emitted. 800 */ 801 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) 802 u8c->nccc = ccc; 803 804 /* 805 * Return the current byte if this is the current 806 * combining class. 807 */ 808 if (ccc == u8c->ccc) { 809 if (!u8c->p) 810 u8c->len--; 811 return (unsigned char)*u8c->s++; 812 } 813 814 /* Current combining class mismatch. */ 815 ccc_mismatch: 816 if (u8c->nccc == STOPPER) { 817 /* 818 * Scan forward for the first canonical class 819 * to be emitted. Save the position from 820 * which to restart. 821 */ 822 u8c->ccc = MINCCC - 1; 823 u8c->nccc = ccc; 824 u8c->sp = u8c->p; 825 u8c->ss = u8c->s; 826 u8c->slen = u8c->len; 827 if (!u8c->p) 828 u8c->len -= utf8clen(u8c->s); 829 u8c->s += utf8clen(u8c->s); 830 } else if (ccc != STOPPER) { 831 /* Not a stopper, and not the ccc we're emitting. */ 832 if (!u8c->p) 833 u8c->len -= utf8clen(u8c->s); 834 u8c->s += utf8clen(u8c->s); 835 } else if (u8c->nccc != MAXCCC + 1) { 836 /* At a stopper, restart for next ccc. */ 837 u8c->ccc = u8c->nccc; 838 u8c->nccc = MAXCCC + 1; 839 u8c->s = u8c->ss; 840 u8c->p = u8c->sp; 841 u8c->len = u8c->slen; 842 } else { 843 /* All done, proceed from here. */ 844 u8c->ccc = STOPPER; 845 u8c->nccc = STOPPER; 846 u8c->sp = NULL; 847 u8c->ss = NULL; 848 u8c->slen = 0; 849 } 850 } 851 } 852 853 #if 0 854 /* 855 * Look for the correct const struct utf8data for a unicode version. 856 * Returns NULL if the version requested is too new. 857 * 858 * Two normalization forms are supported: nfdi and nfdicf. 859 * 860 * nfdi: 861 * - Apply unicode normalization form NFD. 862 * - Remove any Default_Ignorable_Code_Point. 863 * 864 * nfdicf: 865 * - Apply unicode normalization form NFD. 866 * - Remove any Default_Ignorable_Code_Point. 867 * - Apply a full casefold (C + F). 868 */ 869 static const struct utf8data *utf8nfdi(unsigned int maxage) 870 { 871 int i = ARRAY_SIZE(utf8nfdidata) - 1; 872 873 while (maxage < utf8nfdidata[i].maxage) 874 i--; 875 if (maxage > utf8nfdidata[i].maxage) 876 return NULL; 877 return &utf8nfdidata[i]; 878 } 879 #endif 880 881 static const struct utf8data *utf8nfdicf(unsigned int maxage) 882 { 883 int i = ARRAY_SIZE(utf8nfdicfdata) - 1; 884 885 while (maxage < utf8nfdicfdata[i].maxage) 886 i--; 887 if (maxage > utf8nfdicfdata[i].maxage) 888 return NULL; 889 return &utf8nfdicfdata[i]; 890 } 891 892 static int utf8_casefold(const struct ext2fs_nls_table *table, 893 const unsigned char *str, size_t len, 894 unsigned char *dest, size_t dlen) 895 { 896 const struct utf8data *data = utf8nfdicf(table->version); 897 struct utf8cursor cur; 898 size_t nlen = 0; 899 900 if (utf8ncursor(&cur, data, (const char *) str, len) < 0) 901 goto invalid_seq; 902 903 for (nlen = 0; nlen < dlen; nlen++) { 904 int c = utf8byte(&cur); 905 906 dest[nlen] = c; 907 if (!c) 908 return nlen; 909 if (c == -1) 910 break; 911 } 912 913 return -ENAMETOOLONG; 914 915 invalid_seq: 916 if (dlen < len) 917 return -ENAMETOOLONG; 918 919 /* Signal invalid sequence */ 920 return -EINVAL; 921 } 922 923 static int utf8_validate(const struct ext2fs_nls_table *table, 924 char *s, size_t len, char **pos) 925 { 926 const struct utf8data *data = utf8nfdicf(table->version); 927 utf8leaf_t *leaf; 928 unsigned char hangul[UTF8HANGULLEAF]; 929 930 if (!data) 931 return -1; 932 while (len && *s) { 933 leaf = utf8nlookup(data, hangul, s, len); 934 if (!leaf) { 935 *pos = s; 936 return 1; 937 } 938 len -= utf8clen(s); 939 s += utf8clen(s); 940 } 941 return 0; 942 } 943 944 static int utf8_casefold_cmp(const struct ext2fs_nls_table *table, 945 const unsigned char *str1, size_t len1, 946 const unsigned char *str2, size_t len2) 947 { 948 const struct utf8data *data = utf8nfdicf(table->version); 949 int c1, c2; 950 struct utf8cursor cur1, cur2; 951 952 if (utf8ncursor(&cur1, data, (const char *) str1, len1) < 0) 953 return -1; 954 if (utf8ncursor(&cur2, data, (const char *) str2, len2) < 0) 955 return -1; 956 957 do { 958 c1 = utf8byte(&cur1); 959 c2 = utf8byte(&cur2); 960 961 if (c1 < 0 || c2 < 0) 962 return -1; 963 if (c1 != c2) 964 return c1 - c2; 965 } while (c1); 966 967 return 0; 968 } 969 970 static const struct ext2fs_nls_ops utf8_ops = { 971 .casefold = utf8_casefold, 972 .validate = utf8_validate, 973 .casefold_cmp = utf8_casefold_cmp, 974 }; 975 976 static const struct ext2fs_nls_table nls_utf8 = { 977 .ops = &utf8_ops, 978 .version = UNICODE_AGE(12, 1, 0), 979 }; 980 981 const struct ext2fs_nls_table *ext2fs_load_nls_table(int encoding) 982 { 983 if (encoding == EXT4_ENC_UTF8_12_1) 984 return &nls_utf8; 985 986 return NULL; 987 } 988 989 int ext2fs_check_encoded_name(const struct ext2fs_nls_table *table, 990 char *name, size_t len, char **pos) 991 { 992 return table->ops->validate(table, name, len, pos); 993 } 994 995 int ext2fs_casefold_cmp(const struct ext2fs_nls_table *table, 996 const unsigned char *str1, size_t len1, 997 const unsigned char *str2, size_t len2) 998 { 999 return table->ops->casefold_cmp(table, str1, len1, str2, len2); 1000 }