L4Re/departure (file libext2fs/lib/libext2fs/nls

     1 /*     2  * Copyright (c) 2014 SGI.     3  * Copyright (c) 2018 Collabora Ltd.     4  * All rights reserved.     5  *     6  * This program is free software; you can redistribute it and/or     7  * modify it under the terms of the GNU General Public License as     8  * published by the Free Software Foundation.     9  *    10  * This program is distributed in the hope that it would be useful,    11  * but WITHOUT ANY WARRANTY; without even the implied warranty of    12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    13  * GNU General Public License for more details.    14  *    15  */    16     17 /*    18  * This code is adapted from the Linux Kernel.  We have a    19  * userspace version here such that the hashes will match that    20  * implementation.    21  */    22     23 #include "config.h"    24 #include <stdint.h>    25 #include <unistd.h>    26 #include <string.h>    27 #include <limits.h>    28 #include <errno.h>    29     30 #include "ext2_fs.h"    31 #include "ext2fs.h"    32 #include "ext2fsP.h"    33     34 /* Encoding a unicode version number as a single unsigned int. */    35 #define UNICODE_MAJ_SHIFT		(16)    36 #define UNICODE_MIN_SHIFT		(8)    37     38 #define UNICODE_AGE(MAJ, MIN, REV)			\    39 	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\    40 	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\    41 	 ((unsigned int)(REV)))    42     43 /* Needed in struct utf8cursor below. */    44 #define UTF8HANGULLEAF	(12)    45     46 /*    47  * Cursor structure used by the normalizer.    48  */    49 struct utf8cursor {    50 	const struct utf8data	*data;    51 	const char	*s;    52 	const char	*p;    53 	const char	*ss;    54 	const char	*sp;    55 	unsigned int	len;    56 	unsigned int	slen;    57 	short int	ccc;    58 	short int	nccc;    59 	unsigned char	hangul[UTF8HANGULLEAF];    60 };    61     62 /*    63  * Initialize a utf8cursor to normalize a string.    64  * Returns 0 on success.    65  * Returns -1 on failure.    66  */    67 // extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,    68 //		      const char *s);    69 // extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,    70 //		       const char *s, size_t len);    71     72 /*    73  * Get the next byte in the normalization.    74  * Returns a value > 0 && < 256 on success.    75  * Returns 0 when the end of the normalization is reached.    76  * Returns -1 if the string being normalized is not valid UTF-8.    77  */    78 // extern int utf8byte(struct utf8cursor *u8c);    79     80     81 struct utf8data {    82 	unsigned int maxage;    83 	unsigned int offset;    84 };    85     86 #define __INCLUDED_FROM_UTF8NORM_C__    87 #include "utf8data.h"    88 #undef __INCLUDED_FROM_UTF8NORM_C__    89     90 #define ARRAY_SIZE(array)			\    91         (sizeof(array) / sizeof(array[0]))    92     93 #if 0    94 /* Highest unicode version supported by the data tables. */    95 static int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev)    96 {    97 	int i = ARRAY_SIZE(utf8agetab) - 1;    98 	unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);    99    100 	while (i >= 0 && utf8agetab[i] != 0) {   101 		if (sb_utf8version == utf8agetab[i])   102 			return 1;   103 		i--;   104 	}   105 	return 0;   106 }   107 #endif   108    109 #if 0   110 static int utf8version_latest(void)   111 {   112 	return utf8vers;   113 }   114 #endif   115    116 /*   117  * UTF-8 valid ranges.   118  *   119  * The UTF-8 encoding spreads the bits of a 32bit word over several   120  * bytes. This table gives the ranges that can be held and how they'd   121  * be represented.   122  *   123  * 0x00000000 0x0000007F: 0xxxxxxx   124  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx   125  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx   126  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx   127  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx   128  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx   129  *   130  * There is an additional requirement on UTF-8, in that only the   131  * shortest representation of a 32bit value is to be used.  A decoder   132  * must not decode sequences that do not satisfy this requirement.   133  * Thus the allowed ranges have a lower bound.   134  *   135  * 0x00000000 0x0000007F: 0xxxxxxx   136  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx   137  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx   138  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx   139  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx   140  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx   141  *   142  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,   143  * 17 planes of 65536 values.  This limits the sequences actually seen   144  * even more, to just the following.   145  *   146  *          0 -     0x7F: 0                   - 0x7F   147  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF   148  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF   149  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF   150  *   151  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.   152  *   153  * Note that the longest sequence seen with valid usage is 4 bytes,   154  * the same a single UTF-32 character.  This makes the UTF-8   155  * representation of Unicode strictly smaller than UTF-32.   156  *   157  * The shortest sequence requirement was introduced by:   158  *    Corrigendum #1: UTF-8 Shortest Form   159  * It can be found here:   160  *    http://www.unicode.org/versions/corrigendum1.html   161  *   162  */   163    164 /*   165  * Return the number of bytes used by the current UTF-8 sequence.   166  * Assumes the input points to the first byte of a valid UTF-8   167  * sequence.   168  */   169 static inline int utf8clen(const char *s)   170 {   171 	unsigned char c = *s;   172    173 	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);   174 }   175    176 /*   177  * Decode a 3-byte UTF-8 sequence.   178  */   179 static unsigned int   180 utf8decode3(const char *str)   181 {   182 	unsigned int		uc;   183    184 	uc = *str++ & 0x0F;   185 	uc <<= 6;   186 	uc |= *str++ & 0x3F;   187 	uc <<= 6;   188 	uc |= *str++ & 0x3F;   189    190 	return uc;   191 }   192    193 /*   194  * Encode a 3-byte UTF-8 sequence.   195  */   196 static int   197 utf8encode3(char *str, unsigned int val)   198 {   199 	str[2] = (val & 0x3F) | 0x80;   200 	val >>= 6;   201 	str[1] = (val & 0x3F) | 0x80;   202 	val >>= 6;   203 	str[0] = val | 0xE0;   204    205 	return 3;   206 }   207    208 /*   209  * utf8trie_t   210  *   211  * A compact binary tree, used to decode UTF-8 characters.   212  *   213  * Internal nodes are one byte for the node itself, and up to three   214  * bytes for an offset into the tree.  The first byte contains the   215  * following information:   216  *  NEXTBYTE  - flag        - advance to next byte if set   217  *  BITNUM    - 3 bit field - the bit number to tested   218  *  OFFLEN    - 2 bit field - number of bytes in the offset   219  * if offlen == 0 (non-branching node)   220  *  RIGHTPATH - 1 bit field - set if the following node is for the   221  *                            right-hand path (tested bit is set)   222  *  TRIENODE  - 1 bit field - set if the following node is an internal   223  *                            node, otherwise it is a leaf node   224  * if offlen != 0 (branching node)   225  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal   226  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal   227  *   228  * Due to the way utf8 works, there cannot be branching nodes with   229  * NEXTBYTE set, and moreover those nodes always have a righthand   230  * descendant.   231  */   232 typedef const unsigned char utf8trie_t;   233 #define BITNUM		0x07   234 #define NEXTBYTE	0x08   235 #define OFFLEN		0x30   236 #define OFFLEN_SHIFT	4   237 #define RIGHTPATH	0x40   238 #define TRIENODE	0x80   239 #define RIGHTNODE	0x40   240 #define LEFTNODE	0x80   241    242 /*   243  * utf8leaf_t   244  *   245  * The leaves of the trie are embedded in the trie, and so the same   246  * underlying datatype: unsigned char.   247  *   248  * leaf[0]: The unicode version, stored as a generation number that is   249  *          an index into utf8agetab[].  With this we can filter code   250  *          points based on the unicode version in which they were   251  *          defined.  The CCC of a non-defined code point is 0.   252  * leaf[1]: Canonical Combining Class. During normalization, we need   253  *          to do a stable sort into ascending order of all characters   254  *          with a non-zero CCC that occur between two characters with   255  *          a CCC of 0, or at the begin or end of a string.   256  *          The unicode standard guarantees that all CCC values are   257  *          between 0 and 254 inclusive, which leaves 255 available as   258  *          a special value.   259  *          Code points with CCC 0 are known as stoppers.   260  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the   261  *          start of a NUL-terminated string that is the decomposition   262  *          of the character.   263  *          The CCC of a decomposable character is the same as the CCC   264  *          of the first character of its decomposition.   265  *          Some characters decompose as the empty string: these are   266  *          characters with the Default_Ignorable_Code_Point property.   267  *          These do affect normalization, as they all have CCC 0.   268  *   269  * The decompositions in the trie have been fully expanded, with the   270  * exception of Hangul syllables, which are decomposed algorithmically.   271  *   272  * Casefolding, if applicable, is also done using decompositions.   273  *   274  * The trie is constructed in such a way that leaves exist for all   275  * UTF-8 sequences that match the criteria from the "UTF-8 valid   276  * ranges" comment above, and only for those sequences.  Therefore a   277  * lookup in the trie can be used to validate the UTF-8 input.   278  */   279 typedef const unsigned char utf8leaf_t;   280    281 #define LEAF_GEN(LEAF)	((LEAF)[0])   282 #define LEAF_CCC(LEAF)	((LEAF)[1])   283 #define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))   284    285 #define MINCCC		(0)   286 #define MAXCCC		(254)   287 #define STOPPER		(0)   288 #define	DECOMPOSE	(255)   289    290 /* Marker for hangul syllable decomposition. */   291 #define HANGUL		((char)(255))   292 /* Size of the synthesized leaf used for Hangul syllable decomposition. */   293 #define UTF8HANGULLEAF	(12)   294    295 /*   296  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)   297  *   298  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;   299  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;   300  *   301  * SBase = 0xAC00   302  * LBase = 0x1100   303  * VBase = 0x1161   304  * TBase = 0x11A7   305  * LCount = 19   306  * VCount = 21   307  * TCount = 28   308  * NCount = 588 (VCount * TCount)   309  * SCount = 11172 (LCount * NCount)   310  *   311  * Decomposition:   312  *   SIndex = s - SBase   313  *   314  * LV (Canonical/Full)   315  *   LIndex = SIndex / NCount   316  *   VIndex = (Sindex % NCount) / TCount   317  *   LPart = LBase + LIndex   318  *   VPart = VBase + VIndex   319  *   320  * LVT (Canonical)   321  *   LVIndex = (SIndex / TCount) * TCount   322  *   TIndex = (Sindex % TCount)   323  *   LVPart = SBase + LVIndex   324  *   TPart = TBase + TIndex   325  *   326  * LVT (Full)   327  *   LIndex = SIndex / NCount   328  *   VIndex = (Sindex % NCount) / TCount   329  *   TIndex = (Sindex % TCount)   330  *   LPart = LBase + LIndex   331  *   VPart = VBase + VIndex   332  *   if (TIndex == 0) {   333  *          d = <LPart, VPart>   334  *   } else {   335  *          TPart = TBase + TIndex   336  *          d = <LPart, TPart, VPart>   337  *   }   338  */   339    340 /* Constants */   341 #define SB	(0xAC00)   342 #define LB	(0x1100)   343 #define VB	(0x1161)   344 #define TB	(0x11A7)   345 #define LC	(19)   346 #define VC	(21)   347 #define TC	(28)   348 #define NC	(VC * TC)   349 #define SC	(LC * NC)   350    351 /* Algorithmic decomposition of hangul syllable. */   352 static utf8leaf_t *   353 utf8hangul(const char *str, unsigned char *hangul)   354 {   355 	unsigned int	si;   356 	unsigned int	li;   357 	unsigned int	vi;   358 	unsigned int	ti;   359 	unsigned char	*h;   360    361 	/* Calculate the SI, LI, VI, and TI values. */   362 	si = utf8decode3(str) - SB;   363 	li = si / NC;   364 	vi = (si % NC) / TC;   365 	ti = si % TC;   366    367 	/* Fill in base of leaf. */   368 	h = hangul;   369 	LEAF_GEN(h) = 2;   370 	LEAF_CCC(h) = DECOMPOSE;   371 	h += 2;   372    373 	/* Add LPart, a 3-byte UTF-8 sequence. */   374 	h += utf8encode3((char *)h, li + LB);   375    376 	/* Add VPart, a 3-byte UTF-8 sequence. */   377 	h += utf8encode3((char *)h, vi + VB);   378    379 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */   380 	if (ti)   381 		h += utf8encode3((char *)h, ti + TB);   382    383 	/* Terminate string. */   384 	h[0] = '\0';   385    386 	return hangul;   387 }   388    389 /*   390  * Use trie to scan s, touching at most len bytes.   391  * Returns the leaf if one exists, NULL otherwise.   392  *   393  * A non-NULL return guarantees that the UTF-8 sequence starting at s   394  * is well-formed and corresponds to a known unicode code point.  The   395  * shorthand for this will be "is valid UTF-8 unicode".   396  */   397 static utf8leaf_t *utf8nlookup(const struct utf8data *data,   398 			       unsigned char *hangul, const char *s, size_t len)   399 {   400 	utf8trie_t	*trie;   401 	int		offlen;   402 	int		offset;   403 	int		mask;   404 	int		node;   405    406 	if (!data)   407 		return NULL;   408 	if (len == 0)   409 		return NULL;   410    411 	trie = utf8data + data->offset;   412 	node = 1;   413 	while (node) {   414 		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;   415 		if (*trie & NEXTBYTE) {   416 			if (--len == 0)   417 				return NULL;   418 			s++;   419 		}   420 		mask = 1 << (*trie & BITNUM);   421 		if (*s & mask) {   422 			/* Right leg */   423 			if (offlen) {   424 				/* Right node at offset of trie */   425 				node = (*trie & RIGHTNODE);   426 				offset = trie[offlen];   427 				while (--offlen) {   428 					offset <<= 8;   429 					offset |= trie[offlen];   430 				}   431 				trie += offset;   432 			} else if (*trie & RIGHTPATH) {   433 				/* Right node after this node */   434 				node = (*trie & TRIENODE);   435 				trie++;   436 			} else {   437 				/* No right node. */   438 				return NULL;   439 			}   440 		} else {   441 			/* Left leg */   442 			if (offlen) {   443 				/* Left node after this node. */   444 				node = (*trie & LEFTNODE);   445 				trie += offlen + 1;   446 			} else if (*trie & RIGHTPATH) {   447 				/* No left node. */   448 				return NULL;   449 			} else {   450 				/* Left node after this node */   451 				node = (*trie & TRIENODE);   452 				trie++;   453 			}   454 		}   455 	}   456 	/*   457 	 * Hangul decomposition is done algorithmically. These are the   458 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is   459 	 * always 3 bytes long, so s has been advanced twice, and the   460 	 * start of the sequence is at s-2.   461 	 */   462 	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)   463 		trie = utf8hangul(s - 2, hangul);   464 	return trie;   465 }   466    467 /*   468  * Use trie to scan s.   469  * Returns the leaf if one exists, NULL otherwise.   470  *   471  * Forwards to utf8nlookup().   472  */   473 static utf8leaf_t *utf8lookup(const struct utf8data *data,   474 			      unsigned char *hangul, const char *s)   475 {   476 	return utf8nlookup(data, hangul, s, (size_t)-1);   477 }   478    479 #if 0   480 /*   481  * Maximum age of any character in s.   482  * Return -1 if s is not valid UTF-8 unicode.   483  * Return 0 if only non-assigned code points are used.   484  */   485 static int utf8agemax(const struct utf8data *data, const char *s)   486 {   487 	utf8leaf_t	*leaf;   488 	int		age = 0;   489 	int		leaf_age;   490 	unsigned char	hangul[UTF8HANGULLEAF];   491    492 	if (!data)   493 		return -1;   494    495 	while (*s) {   496 		leaf = utf8lookup(data, hangul, s);   497 		if (!leaf)   498 			return -1;   499    500 		leaf_age = utf8agetab[LEAF_GEN(leaf)];   501 		if (leaf_age <= data->maxage && leaf_age > age)   502 			age = leaf_age;   503 		s += utf8clen(s);   504 	}   505 	return age;   506 }   507 #endif   508    509 #if 0   510 /*   511  * Minimum age of any character in s.   512  * Return -1 if s is not valid UTF-8 unicode.   513  * Return 0 if non-assigned code points are used.   514  */   515 static int utf8agemin(const struct utf8data *data, const char *s)   516 {   517 	utf8leaf_t	*leaf;   518 	int		age;   519 	int		leaf_age;   520 	unsigned char	hangul[UTF8HANGULLEAF];   521    522 	if (!data)   523 		return -1;   524 	age = data->maxage;   525 	while (*s) {   526 		leaf = utf8lookup(data, hangul, s);   527 		if (!leaf)   528 			return -1;   529 		leaf_age = utf8agetab[LEAF_GEN(leaf)];   530 		if (leaf_age <= data->maxage && leaf_age < age)   531 			age = leaf_age;   532 		s += utf8clen(s);   533 	}   534 	return age;   535 }   536 #endif   537    538 #if 0   539 /*   540  * Maximum age of any character in s, touch at most len bytes.   541  * Return -1 if s is not valid UTF-8 unicode.   542  */   543 static int utf8nagemax(const struct utf8data *data, const char *s, size_t len)   544 {   545 	utf8leaf_t	*leaf;   546 	int		age = 0;   547 	int		leaf_age;   548 	unsigned char	hangul[UTF8HANGULLEAF];   549    550 	if (!data)   551 		return -1;   552    553 	while (len && *s) {   554 		leaf = utf8nlookup(data, hangul, s, len);   555 		if (!leaf)   556 			return -1;   557 		leaf_age = utf8agetab[LEAF_GEN(leaf)];   558 		if (leaf_age <= data->maxage && leaf_age > age)   559 			age = leaf_age;   560 		len -= utf8clen(s);   561 		s += utf8clen(s);   562 	}   563 	return age;   564 }   565 #endif   566    567 #if 0   568 /*   569  * Maximum age of any character in s, touch at most len bytes.   570  * Return -1 if s is not valid UTF-8 unicode.   571  */   572 static int utf8nagemin(const struct utf8data *data, const char *s, size_t len)   573 {   574 	utf8leaf_t	*leaf;   575 	int		leaf_age;   576 	int		age;   577 	unsigned char	hangul[UTF8HANGULLEAF];   578    579 	if (!data)   580 		return -1;   581 	age = data->maxage;   582 	while (len && *s) {   583 		leaf = utf8nlookup(data, hangul, s, len);   584 		if (!leaf)   585 			return -1;   586 		leaf_age = utf8agetab[LEAF_GEN(leaf)];   587 		if (leaf_age <= data->maxage && leaf_age < age)   588 			age = leaf_age;   589 		len -= utf8clen(s);   590 		s += utf8clen(s);   591 	}   592 	return age;   593 }   594 #endif   595    596 #if 0   597 /*   598  * Length of the normalization of s.   599  * Return -1 if s is not valid UTF-8 unicode.   600  *   601  * A string of Default_Ignorable_Code_Point has length 0.   602  */   603 static ssize_t utf8len(const struct utf8data *data, const char *s)   604 {   605 	utf8leaf_t	*leaf;   606 	size_t		ret = 0;   607 	unsigned char	hangul[UTF8HANGULLEAF];   608    609 	if (!data)   610 		return -1;   611 	while (*s) {   612 		leaf = utf8lookup(data, hangul, s);   613 		if (!leaf)   614 			return -1;   615 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)   616 			ret += utf8clen(s);   617 		else if (LEAF_CCC(leaf) == DECOMPOSE)   618 			ret += strlen(LEAF_STR(leaf));   619 		else   620 			ret += utf8clen(s);   621 		s += utf8clen(s);   622 	}   623 	return ret;   624 }   625 #endif   626    627 #if 0   628 /*   629  * Length of the normalization of s, touch at most len bytes.   630  * Return -1 if s is not valid UTF-8 unicode.   631  */   632 static ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)   633 {   634 	utf8leaf_t	*leaf;   635 	size_t		ret = 0;   636 	unsigned char	hangul[UTF8HANGULLEAF];   637    638 	if (!data)   639 		return -1;   640 	while (len && *s) {   641 		leaf = utf8nlookup(data, hangul, s, len);   642 		if (!leaf)   643 			return -1;   644 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)   645 			ret += utf8clen(s);   646 		else if (LEAF_CCC(leaf) == DECOMPOSE)   647 			ret += strlen(LEAF_STR(leaf));   648 		else   649 			ret += utf8clen(s);   650 		len -= utf8clen(s);   651 		s += utf8clen(s);   652 	}   653 	return ret;   654 }   655 #endif   656    657 /*   658  * Set up an utf8cursor for use by utf8byte().   659  *   660  *   u8c    : pointer to cursor.   661  *   data   : const struct utf8data to use for normalization.   662  *   s      : string.   663  *   len    : length of s.   664  *   665  * Returns -1 on error, 0 on success.   666  */   667 static int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,   668 		const char *s, size_t len)   669 {   670 	if (!data)   671 		return -1;   672 	if (!s)   673 		return -1;   674 	u8c->data = data;   675 	u8c->s = s;   676 	u8c->p = NULL;   677 	u8c->ss = NULL;   678 	u8c->sp = NULL;   679 	u8c->len = len;   680 	u8c->slen = 0;   681 	u8c->ccc = STOPPER;   682 	u8c->nccc = STOPPER;   683 	/* Check we didn't clobber the maximum length. */   684 	if (u8c->len != len)   685 		return -1;   686 	/* The first byte of s may not be an utf8 continuation. */   687 	if (len > 0 && (*s & 0xC0) == 0x80)   688 		return -1;   689 	return 0;   690 }   691    692 #if 0   693 /*   694  * Set up an utf8cursor for use by utf8byte().   695  *   696  *   u8c    : pointer to cursor.   697  *   data   : const struct utf8data to use for normalization.   698  *   s      : NUL-terminated string.   699  *   700  * Returns -1 on error, 0 on success.   701  */   702 static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,   703 	       const char *s)   704 {   705 	return utf8ncursor(u8c, data, s, (unsigned int)-1);   706 }   707 #endif   708    709 /*   710  * Get one byte from the normalized form of the string described by u8c.   711  *   712  * Returns the byte cast to an unsigned char on succes, and -1 on failure.   713  *   714  * The cursor keeps track of the location in the string in u8c->s.   715  * When a character is decomposed, the current location is stored in   716  * u8c->p, and u8c->s is set to the start of the decomposition. Note   717  * that bytes from a decomposition do not count against u8c->len.   718  *   719  * Characters are emitted if they match the current CCC in u8c->ccc.   720  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,   721  * and the function returns 0 in that case.   722  *   723  * Sorting by CCC is done by repeatedly scanning the string.  The   724  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at   725  * the start of the scan.  The first pass finds the lowest CCC to be   726  * emitted and stores it in u8c->nccc, the second pass emits the   727  * characters with this CCC and finds the next lowest CCC. This limits   728  * the number of passes to 1 + the number of different CCCs in the   729  * sequence being scanned.   730  *   731  * Therefore:   732  *  u8c->p  != NULL -> a decomposition is being scanned.   733  *  u8c->ss != NULL -> this is a repeating scan.   734  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.   735  */   736 static int utf8byte(struct utf8cursor *u8c)   737 {   738 	utf8leaf_t *leaf;   739 	int ccc;   740    741 	for (;;) {   742 		/* Check for the end of a decomposed character. */   743 		if (u8c->p && *u8c->s == '\0') {   744 			u8c->s = u8c->p;   745 			u8c->p = NULL;   746 		}   747    748 		/* Check for end-of-string. */   749 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {   750 			/* There is no next byte. */   751 			if (u8c->ccc == STOPPER)   752 				return 0;   753 			/* End-of-string during a scan counts as a stopper. */   754 			ccc = STOPPER;   755 			goto ccc_mismatch;   756 		} else if ((*u8c->s & 0xC0) == 0x80) {   757 			/* This is a continuation of the current character. */   758 			if (!u8c->p)   759 				u8c->len--;   760 			return (unsigned char)*u8c->s++;   761 		}   762    763 		/* Look up the data for the current character. */   764 		if (u8c->p) {   765 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);   766 		} else {   767 			leaf = utf8nlookup(u8c->data, u8c->hangul,   768 					   u8c->s, u8c->len);   769 		}   770    771 		/* No leaf found implies that the input is a binary blob. */   772 		if (!leaf)   773 			return -1;   774    775 		ccc = LEAF_CCC(leaf);   776 		/* Characters that are too new have CCC 0. */   777 		if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {   778 			ccc = STOPPER;   779 		} else if (ccc == DECOMPOSE) {   780 			u8c->len -= utf8clen(u8c->s);   781 			u8c->p = u8c->s + utf8clen(u8c->s);   782 			u8c->s = LEAF_STR(leaf);   783 			/* Empty decomposition implies CCC 0. */   784 			if (*u8c->s == '\0') {   785 				if (u8c->ccc == STOPPER)   786 					continue;   787 				ccc = STOPPER;   788 				goto ccc_mismatch;   789 			}   790    791 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);   792 			if (!leaf)   793 				return -1;   794 			ccc = LEAF_CCC(leaf);   795 		}   796    797 		/*   798 		 * If this is not a stopper, then see if it updates   799 		 * the next canonical class to be emitted.   800 		 */   801 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)   802 			u8c->nccc = ccc;   803    804 		/*   805 		 * Return the current byte if this is the current   806 		 * combining class.   807 		 */   808 		if (ccc == u8c->ccc) {   809 			if (!u8c->p)   810 				u8c->len--;   811 			return (unsigned char)*u8c->s++;   812 		}   813    814 		/* Current combining class mismatch. */   815 ccc_mismatch:   816 		if (u8c->nccc == STOPPER) {   817 			/*   818 			 * Scan forward for the first canonical class   819 			 * to be emitted.  Save the position from   820 			 * which to restart.   821 			 */   822 			u8c->ccc = MINCCC - 1;   823 			u8c->nccc = ccc;   824 			u8c->sp = u8c->p;   825 			u8c->ss = u8c->s;   826 			u8c->slen = u8c->len;   827 			if (!u8c->p)   828 				u8c->len -= utf8clen(u8c->s);   829 			u8c->s += utf8clen(u8c->s);   830 		} else if (ccc != STOPPER) {   831 			/* Not a stopper, and not the ccc we're emitting. */   832 			if (!u8c->p)   833 				u8c->len -= utf8clen(u8c->s);   834 			u8c->s += utf8clen(u8c->s);   835 		} else if (u8c->nccc != MAXCCC + 1) {   836 			/* At a stopper, restart for next ccc. */   837 			u8c->ccc = u8c->nccc;   838 			u8c->nccc = MAXCCC + 1;   839 			u8c->s = u8c->ss;   840 			u8c->p = u8c->sp;   841 			u8c->len = u8c->slen;   842 		} else {   843 			/* All done, proceed from here. */   844 			u8c->ccc = STOPPER;   845 			u8c->nccc = STOPPER;   846 			u8c->sp = NULL;   847 			u8c->ss = NULL;   848 			u8c->slen = 0;   849 		}   850 	}   851 }   852    853 #if 0   854 /*   855  * Look for the correct const struct utf8data for a unicode version.   856  * Returns NULL if the version requested is too new.   857  *   858  * Two normalization forms are supported: nfdi and nfdicf.   859  *   860  * nfdi:   861  *  - Apply unicode normalization form NFD.   862  *  - Remove any Default_Ignorable_Code_Point.   863  *   864  * nfdicf:   865  *  - Apply unicode normalization form NFD.   866  *  - Remove any Default_Ignorable_Code_Point.   867  *  - Apply a full casefold (C + F).   868  */   869 static const struct utf8data *utf8nfdi(unsigned int maxage)   870 {   871 	int i = ARRAY_SIZE(utf8nfdidata) - 1;   872    873 	while (maxage < utf8nfdidata[i].maxage)   874 		i--;   875 	if (maxage > utf8nfdidata[i].maxage)   876 		return NULL;   877 	return &utf8nfdidata[i];   878 }   879 #endif   880    881 static const struct utf8data *utf8nfdicf(unsigned int maxage)   882 {   883 	int i = ARRAY_SIZE(utf8nfdicfdata) - 1;   884    885 	while (maxage < utf8nfdicfdata[i].maxage)   886 		i--;   887 	if (maxage > utf8nfdicfdata[i].maxage)   888 		return NULL;   889 	return &utf8nfdicfdata[i];   890 }   891    892 static int utf8_casefold(const struct ext2fs_nls_table *table,   893 			  const unsigned char *str, size_t len,   894 			  unsigned char *dest, size_t dlen)   895 {   896 	const struct utf8data *data = utf8nfdicf(table->version);   897 	struct utf8cursor cur;   898 	size_t nlen = 0;   899    900 	if (utf8ncursor(&cur, data, (const char *) str, len) < 0)   901 		goto invalid_seq;   902    903 	for (nlen = 0; nlen < dlen; nlen++) {   904 		int c = utf8byte(&cur);   905    906 		dest[nlen] = c;   907 		if (!c)   908 			return nlen;   909 		if (c == -1)   910 			break;   911 	}   912    913 	return -ENAMETOOLONG;   914    915 invalid_seq:   916 	if (dlen < len)   917 		return -ENAMETOOLONG;   918    919 	/* Signal invalid sequence */   920 	return -EINVAL;   921 }   922    923 static int utf8_validate(const struct ext2fs_nls_table *table,   924 			 char *s, size_t len, char **pos)   925 {   926 	const struct utf8data *data = utf8nfdicf(table->version);   927 	utf8leaf_t	*leaf;   928 	unsigned char	hangul[UTF8HANGULLEAF];   929    930 	if (!data)   931 		return -1;   932 	while (len && *s) {   933 		leaf = utf8nlookup(data, hangul, s, len);   934 		if (!leaf) {   935 			*pos = s;   936 			return 1;   937 		}   938 		len -= utf8clen(s);   939 		s += utf8clen(s);   940 	}   941 	return 0;   942 }   943    944 static int utf8_casefold_cmp(const struct ext2fs_nls_table *table,   945 			     const unsigned char *str1, size_t len1,   946 			     const unsigned char *str2, size_t len2)   947 {   948 	const struct utf8data *data = utf8nfdicf(table->version);   949 	int c1, c2;   950 	struct utf8cursor cur1, cur2;   951    952 	if (utf8ncursor(&cur1, data, (const char *) str1, len1) < 0)   953 		return -1;   954 	if (utf8ncursor(&cur2, data, (const char *) str2, len2) < 0)   955 		return -1;   956    957 	do {   958 		c1 = utf8byte(&cur1);   959 		c2 = utf8byte(&cur2);   960    961 		if (c1 < 0 || c2 < 0)   962 			return -1;   963 		if (c1 != c2)   964 			return c1 - c2;   965 	} while (c1);   966    967 	return 0;   968 }   969    970 static const struct ext2fs_nls_ops utf8_ops = {   971 	.casefold = utf8_casefold,   972 	.validate = utf8_validate,   973 	.casefold_cmp = utf8_casefold_cmp,   974 };   975    976 static const struct ext2fs_nls_table nls_utf8 = {   977 	.ops = &utf8_ops,   978 	.version = UNICODE_AGE(12, 1, 0),   979 };   980    981 const struct ext2fs_nls_table *ext2fs_load_nls_table(int encoding)   982 {   983 	if (encoding == EXT4_ENC_UTF8_12_1)   984 		return &nls_utf8;   985    986 	return NULL;   987 }   988    989 int ext2fs_check_encoded_name(const struct ext2fs_nls_table *table,   990 			      char *name, size_t len, char **pos)   991 {   992 	return table->ops->validate(table, name, len, pos);   993 }   994    995 int ext2fs_casefold_cmp(const struct ext2fs_nls_table *table,   996 			const unsigned char *str1, size_t len1,   997 			const unsigned char *str2, size_t len2)   998 {   999 	return table->ops->casefold_cmp(table, str1, len1, str2, len2);  1000 }
L4Re/departure

libext2fs/lib/libext2fs/nls_utf8.c