diff options
Diffstat (limited to '')
-rw-r--r-- | lib/uninorm.in.h | 249 | ||||
-rw-r--r-- | lib/uninorm/decompose-internal.c | 29 | ||||
-rw-r--r-- | lib/uninorm/decompose-internal.h | 36 | ||||
-rw-r--r-- | lib/uninorm/normalize-internal.h | 35 | ||||
-rw-r--r-- | lib/uninorm/u-normalize-internal.h | 380 | ||||
-rw-r--r-- | lib/uninorm/u8-normalize.c | 46 |
6 files changed, 775 insertions, 0 deletions
diff --git a/lib/uninorm.in.h b/lib/uninorm.in.h new file mode 100644 index 0000000..fd6affb --- /dev/null +++ b/lib/uninorm.in.h @@ -0,0 +1,249 @@ +/* Normalization forms (composition and decomposition) of Unicode strings. + Copyright (C) 2001-2002, 2009-2022 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +#ifndef _UNINORM_H +#define _UNINORM_H + +/* Get size_t. */ +#include <stddef.h> + +#include "unitypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Conventions: + + All functions prefixed with u8_ operate on UTF-8 encoded strings. + Their unit is an uint8_t (1 byte). + + All functions prefixed with u16_ operate on UTF-16 encoded strings. + Their unit is an uint16_t (a 2-byte word). + + All functions prefixed with u32_ operate on UCS-4 encoded strings. + Their unit is an uint32_t (a 4-byte word). + + All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly + n units. + + Functions returning a string result take a (resultbuf, lengthp) argument + pair. If resultbuf is not NULL and the result fits into *lengthp units, + it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly + allocated string is returned. In both cases, *lengthp is set to the + length (number of units) of the returned string. 
In case of error, + NULL is returned and errno is set. */ + + +enum +{ + UC_DECOMP_CANONICAL,/* Canonical decomposition. */ + UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */ + UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */ + UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */ + UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */ + UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */ + UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */ + UC_DECOMP_CIRCLE, /* <circle> An encircled form. */ + UC_DECOMP_SUPER, /* <super> A superscript form. */ + UC_DECOMP_SUB, /* <sub> A subscript form. */ + UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */ + UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */ + UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */ + UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */ + UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */ + UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */ + UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */ +}; + +/* Maximum size of decomposition of a single Unicode character. */ +#define UC_DECOMPOSITION_MAX_LENGTH 32 + +/* Return the character decomposition mapping of a Unicode character. + DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH + ucs4_t elements. + When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are + filled and N is returned. Otherwise -1 is returned. */ +extern int + uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition); + +/* Return the canonical character decomposition mapping of a Unicode character. + DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH + ucs4_t elements. + When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is + returned. 
Otherwise -1 is returned. */ +extern int + uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition); + + +/* Attempt to combine the Unicode characters uc1, uc2. + uc1 is known to have canonical combining class 0. + Return the combination of uc1 and uc2, if it exists. + Return 0 otherwise. + Not all decompositions can be recombined using this function. See the + Unicode file CompositionExclusions.txt for details. */ +extern ucs4_t + uc_composition (ucs4_t uc1, ucs4_t uc2) + _UC_ATTRIBUTE_CONST; + + +/* An object of type uninorm_t denotes a Unicode normalization form. */ +struct unicode_normalization_form; +typedef const struct unicode_normalization_form *uninorm_t; + +/* UNINORM_NFD: Normalization form D: canonical decomposition. */ +extern const struct unicode_normalization_form uninorm_nfd; +#define UNINORM_NFD (&uninorm_nfd) + +/* UNINORM_NFC: Normalization form C: canonical decomposition, then + canonical composition. */ +extern const struct unicode_normalization_form uninorm_nfc; +#define UNINORM_NFC (&uninorm_nfc) + +/* UNINORM_NFKD: Normalization form KD: compatibility decomposition. */ +extern const struct unicode_normalization_form uninorm_nfkd; +#define UNINORM_NFKD (&uninorm_nfkd) + +/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then + canonical composition. */ +extern const struct unicode_normalization_form uninorm_nfkc; +#define UNINORM_NFKC (&uninorm_nfkc) + +/* Test whether a normalization form does compatibility decomposition. */ +#define uninorm_is_compat_decomposing(nf) \ + ((* (const unsigned int *) (nf) >> 0) & 1) + +/* Test whether a normalization form includes canonical composition. */ +#define uninorm_is_composing(nf) \ + ((* (const unsigned int *) (nf) >> 1) & 1) + +/* Return the decomposing variant of a normalization form. + This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD. 
*/ +extern uninorm_t + uninorm_decomposing_form (uninorm_t nf) + _UC_ATTRIBUTE_PURE; + + +/* Return the specified normalization form of a string. */ +extern uint8_t * + u8_normalize (uninorm_t nf, const uint8_t *s, size_t n, + uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp); +extern uint16_t * + u16_normalize (uninorm_t nf, const uint16_t *s, size_t n, + uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp); +extern uint32_t * + u32_normalize (uninorm_t nf, const uint32_t *s, size_t n, + uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp); + + +/* Compare S1 and S2, ignoring differences in normalization. + NF must be either UNINORM_NFD or UNINORM_NFKD. + If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and + return 0. Upon failure, return -1 with errno set. */ +extern int + u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, + uninorm_t nf, int *resultp); +extern int + u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, + uninorm_t nf, int *resultp); +extern int + u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, + uninorm_t nf, int *resultp); + + +/* Converts the string S of length N to a NUL-terminated byte sequence, in such + a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is + equivalent to comparing S1 and S2 with uN_normcoll(). + NF must be either UNINORM_NFC or UNINORM_NFKC. */ +extern char * + u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf, + char *resultbuf, size_t *lengthp); +extern char * + u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf, + char *resultbuf, size_t *lengthp); +extern char * + u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf, + char *resultbuf, size_t *lengthp); + + +/* Compare S1 and S2, ignoring differences in normalization, using the + collation rules of the current locale. + NF must be either UNINORM_NFC or UNINORM_NFKC. 
+ If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and + return 0. Upon failure, return -1 with errno set. */ +extern int + u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, + uninorm_t nf, int *resultp); +extern int + u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, + uninorm_t nf, int *resultp); +extern int + u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, + uninorm_t nf, int *resultp); + + +/* Normalization of a stream of Unicode characters. + + A "stream of Unicode characters" is essentially a function that accepts an + ucs4_t argument repeatedly, optionally combined with a function that + "flushes" the stream. */ + +/* Data type of a stream of Unicode characters that normalizes its input + according to a given normalization form and passes the normalized character + sequence to the encapsulated stream of Unicode characters. */ +struct uninorm_filter; + +/* Bring data buffered in the filter to its destination, the encapsulated + stream, then close and free the filter. + Return 0 if successful, or -1 with errno set upon failure. */ +extern int + uninorm_filter_free (struct uninorm_filter *filter); + +/* Create and return a normalization filter for Unicode characters. + The pair (stream_func, stream_data) is the encapsulated stream. + stream_func (stream_data, uc) receives the Unicode character uc + and returns 0 if successful, or -1 with errno set upon failure. + Return the new filter, or NULL with errno set upon failure. */ +extern struct uninorm_filter * + uninorm_filter_create (uninorm_t nf, + int (*stream_func) (void *stream_data, ucs4_t uc), + void *stream_data) + _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1); + +/* Stuff a Unicode character into a normalizing filter. + Return 0 if successful, or -1 with errno set upon failure. 
*/ +extern int + uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc); + +/* Bring data buffered in the filter to its destination, the encapsulated + stream. + Return 0 if successful, or -1 with errno set upon failure. + Note! If after calling this function, additional characters are written + into the filter, the resulting character sequence in the encapsulated stream + will not necessarily be normalized. */ +extern int + uninorm_filter_flush (struct uninorm_filter *filter); + + +#ifdef __cplusplus +} +#endif + + +#endif /* _UNINORM_H */ diff --git a/lib/uninorm/decompose-internal.c b/lib/uninorm/decompose-internal.c new file mode 100644 index 0000000..1ed8235 --- /dev/null +++ b/lib/uninorm/decompose-internal.c @@ -0,0 +1,29 @@ +/* Decomposition of Unicode strings. + Copyright (C) 2009-2022 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "decompose-internal.h" + +#define ELEMENT struct ucs4_with_ccc +#define COMPARE(a,b) ((a)->ccc - (b)->ccc) +#define STATIC +#define STATIC_FROMTO static +#define merge_sort_fromto gl_uninorm_decompose_merge_sort_fromto +#define merge_sort_inplace gl_uninorm_decompose_merge_sort_inplace +#include "array-mergesort.h" diff --git a/lib/uninorm/decompose-internal.h b/lib/uninorm/decompose-internal.h new file mode 100644 index 0000000..15d8250 --- /dev/null +++ b/lib/uninorm/decompose-internal.h @@ -0,0 +1,36 @@ +/* Decomposition of Unicode strings. + Copyright (C) 2009-2022 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +#include <stddef.h> + +#include "unitypes.h" + +/* Variant of uc_decomposition that does not produce the 'tag'. */ +extern int + uc_compat_decomposition (ucs4_t uc, ucs4_t *decomposition); + +/* A Unicode character together with its canonical combining class. */ +struct ucs4_with_ccc +{ + ucs4_t code; + int ccc; /* range 0..255 */ +}; + +/* Stable-sort an array of 'struct ucs4_with_ccc'. 
*/ +extern void + gl_uninorm_decompose_merge_sort_inplace (struct ucs4_with_ccc *src, size_t n, + struct ucs4_with_ccc *tmp); diff --git a/lib/uninorm/normalize-internal.h b/lib/uninorm/normalize-internal.h new file mode 100644 index 0000000..aae6dbd --- /dev/null +++ b/lib/uninorm/normalize-internal.h @@ -0,0 +1,35 @@ +/* Normalization of Unicode strings. + Copyright (C) 2009-2022 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +#include "unitypes.h" + +/* Complete definition of normalization form descriptor. */ +struct unicode_normalization_form +{ + /* Bit mask containing meta-information. + This must be the first field. */ + unsigned int description; + #define NF_IS_COMPAT_DECOMPOSING (1 << 0) + #define NF_IS_COMPOSING (1 << 1) + /* Function that decomposes a Unicode character. */ + int (*decomposer) (ucs4_t uc, ucs4_t *decomposition); + /* Function that combines two Unicode characters, a starter and another + character. */ + ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2); + /* Decomposing variant. 
*/ + const struct unicode_normalization_form *decomposing_variant; +}; diff --git a/lib/uninorm/u-normalize-internal.h b/lib/uninorm/u-normalize-internal.h new file mode 100644 index 0000000..f0d9c6e --- /dev/null +++ b/lib/uninorm/u-normalize-internal.h @@ -0,0 +1,380 @@ +/* Decomposition and composition of Unicode strings. + Copyright (C) 2009-2022 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +UNIT * +FUNC (uninorm_t nf, const UNIT *s, size_t n, + UNIT *resultbuf, size_t *lengthp) +{ + int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer; + ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer; + + /* The result being accumulated. */ + UNIT *result; + size_t length; + size_t allocated; + /* The buffer for sorting. */ + #define SORTBUF_PREALLOCATED 64 + struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED]; + struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */ + size_t sortbuf_allocated; + size_t sortbuf_count; + + /* Initialize the accumulator. */ + if (resultbuf == NULL) + { + result = NULL; + allocated = 0; + } + else + { + result = resultbuf; + allocated = *lengthp; + } + length = 0; + + /* Initialize the buffer for sorting. 
*/ + sortbuf = sortbuf_preallocated; + sortbuf_allocated = SORTBUF_PREALLOCATED; + sortbuf_count = 0; + + { + const UNIT *s_end = s + n; + + for (;;) + { + int count; + ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH]; + int decomposed_count; + int i; + + if (s < s_end) + { + /* Fetch the next character. */ + count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s); + decomposed_count = 1; + + /* Decompose it, recursively. + It would be possible to precompute the recursive decomposition + and store it in a table. But this would significantly increase + the size of the decomposition tables, because for example for + U+1FC1 the recursive canonical decomposition and the recursive + compatibility decomposition are different. */ + { + int curr; + + for (curr = 0; curr < decomposed_count; ) + { + /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e. + all elements are atomic. */ + ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH]; + int curr_decomposed_count; + + curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed); + if (curr_decomposed_count >= 0) + { + /* Move curr_decomposed[0..curr_decomposed_count-1] over + decomposed[curr], making room. It's not worth using + memcpy() here, since the counts are so small. */ + int shift = curr_decomposed_count - 1; + + if (shift < 0) + abort (); + if (shift > 0) + { + int j; + + decomposed_count += shift; + if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH) + abort (); + for (j = decomposed_count - 1 - shift; j > curr; j--) + decomposed[j + shift] = decomposed[j]; + } + for (; shift >= 0; shift--) + decomposed[curr + shift] = curr_decomposed[shift]; + } + else + { + /* decomposed[curr] is atomic. */ + curr++; + } + } + } + } + else + { + count = 0; + decomposed_count = 0; + } + + i = 0; + for (;;) + { + ucs4_t uc; + int ccc; + + if (s < s_end) + { + /* Fetch the next character from the decomposition. 
*/ + if (i == decomposed_count) + break; + uc = decomposed[i]; + ccc = uc_combining_class (uc); + } + else + { + /* End of string reached. */ + uc = 0; + ccc = 0; + } + + if (ccc == 0) + { + size_t j; + + /* Apply the canonical ordering algorithm to the accumulated + sequence of characters. */ + if (sortbuf_count > 1) + gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, + sortbuf + sortbuf_count); + + if (composer != NULL) + { + /* Attempt to combine decomposed characters, as specified + in the Unicode Standard Annex #15 "Unicode Normalization + Forms". We need to check + 1. whether the first accumulated character is a + "starter" (i.e. has ccc = 0). This is usually the + case. But when the string starts with a + non-starter, the sortbuf also starts with a + non-starter. Btw, this check could also be + omitted, because the composition table has only + entries (code1, code2) for which code1 is a + starter; if the first accumulated character is not + a starter, no lookup will succeed. + 2. If the sortbuf has more than one character, check + for each of these characters that are not "blocked" + from the starter (i.e. have a ccc that is higher + than the ccc of the previous character) whether it + can be combined with the first character. + 3. If only one character is left in sortbuf, check + whether it can be combined with the next character + (also a starter). */ + if (sortbuf_count > 0 && sortbuf[0].ccc == 0) + { + for (j = 1; j < sortbuf_count; ) + { + if (sortbuf[j].ccc > sortbuf[j - 1].ccc) + { + ucs4_t combined = + composer (sortbuf[0].code, sortbuf[j].code); + if (combined) + { + size_t k; + + sortbuf[0].code = combined; + /* sortbuf[0].ccc = 0, still valid. 
*/ + for (k = j + 1; k < sortbuf_count; k++) + sortbuf[k - 1] = sortbuf[k]; + sortbuf_count--; + continue; + } + } + j++; + } + if (s < s_end && sortbuf_count == 1) + { + ucs4_t combined = + composer (sortbuf[0].code, uc); + if (combined) + { + uc = combined; + ccc = 0; + /* uc could be further combined with subsequent + characters. So don't put it into sortbuf[0] in + this round, only in the next round. */ + sortbuf_count = 0; + } + } + } + } + + for (j = 0; j < sortbuf_count; j++) + { + ucs4_t muc = sortbuf[j].code; + + /* Append muc to the result accumulator. */ + if (length < allocated) + { + int ret = + U_UCTOMB (result + length, muc, allocated - length); + if (ret == -1) + { + errno = EINVAL; + goto fail; + } + if (ret >= 0) + { + length += ret; + goto done_appending; + } + } + { + size_t old_allocated = allocated; + size_t new_allocated = 2 * old_allocated; + if (new_allocated < 64) + new_allocated = 64; + if (new_allocated < old_allocated) /* integer overflow? */ + abort (); + { + UNIT *larger_result; + if (result == NULL) + { + larger_result = + (UNIT *) malloc (new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail; + } + } + else if (result == resultbuf) + { + larger_result = + (UNIT *) malloc (new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail; + } + U_CPY (larger_result, resultbuf, length); + } + else + { + larger_result = + (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail; + } + } + result = larger_result; + allocated = new_allocated; + { + int ret = + U_UCTOMB (result + length, muc, allocated - length); + if (ret == -1) + { + errno = EINVAL; + goto fail; + } + if (ret < 0) + abort (); + length += ret; + goto done_appending; + } + } + } + done_appending: ; + } + + /* sortbuf is now empty. */ + sortbuf_count = 0; + } + + if (!(s < s_end)) + /* End of string reached. 
*/ + break; + + /* Append (uc, ccc) to sortbuf. */ + if (sortbuf_count == sortbuf_allocated) + { + struct ucs4_with_ccc *new_sortbuf; + + sortbuf_allocated = 2 * sortbuf_allocated; + if (sortbuf_allocated < sortbuf_count) /* integer overflow? */ + abort (); + new_sortbuf = + (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc)); + if (new_sortbuf == NULL) + { + errno = ENOMEM; + goto fail; + } + memcpy (new_sortbuf, sortbuf, + sortbuf_count * sizeof (struct ucs4_with_ccc)); + if (sortbuf != sortbuf_preallocated) + free (sortbuf); + sortbuf = new_sortbuf; + } + sortbuf[sortbuf_count].code = uc; + sortbuf[sortbuf_count].ccc = ccc; + sortbuf_count++; + + i++; + } + + if (!(s < s_end)) + /* End of string reached. */ + break; + + s += count; + } + } + + if (length == 0) + { + if (result == NULL) + { + /* Return a non-NULL value. NULL means error. */ + result = (UNIT *) malloc (1); + if (result == NULL) + { + errno = ENOMEM; + goto fail; + } + } + } + else if (result != resultbuf && length < allocated) + { + /* Shrink the allocated memory if possible. */ + UNIT *memory; + + memory = (UNIT *) realloc (result, length * sizeof (UNIT)); + if (memory != NULL) + result = memory; + } + + if (sortbuf_count > 0) + abort (); + if (sortbuf != sortbuf_preallocated) + free (sortbuf); + + *lengthp = length; + return result; + + fail: + { + int saved_errno = errno; + if (sortbuf != sortbuf_preallocated) + free (sortbuf); + if (result != resultbuf) + free (result); + errno = saved_errno; + } + return NULL; +} diff --git a/lib/uninorm/u8-normalize.c b/lib/uninorm/u8-normalize.c new file mode 100644 index 0000000..fe40d11 --- /dev/null +++ b/lib/uninorm/u8-normalize.c @@ -0,0 +1,46 @@ +/* Normalization of UTF-8 strings. + Copyright (C) 2009-2022 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. + + This file is free software. + It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". 
+ You can redistribute it and/or modify it under either + - the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 3, or (at your + option) any later version, or + - the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) + any later version, or + - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License and the GNU General Public License + for more details. + + You should have received a copy of the GNU Lesser General Public + License and of the GNU General Public License along with this + program. If not, see <https://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "uninorm.h" + +#include <errno.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +#include "unistr.h" +#include "unictype.h" +#include "normalize-internal.h" +#include "uninorm/decompose-internal.h" + +#define FUNC u8_normalize +#define UNIT uint8_t +#define U_MBTOUC_UNSAFE u8_mbtouc_unsafe +#define U_UCTOMB u8_uctomb +#define U_CPY u8_cpy +#include "u-normalize-internal.h" |