6 files changed, 775 insertions, 0 deletions
diff --git a/lib/uninorm.in.h b/lib/uninorm.in.h
new file mode 100644
index 0000000..fd6affb
--- /dev/null
+++ b/lib/uninorm.in.h
@@ -0,0 +1,249 @@
+/* Normalization forms (composition and decomposition) of Unicode strings.
+   Copyright (C) 2001-2002, 2009-2022 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 2.1 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#ifndef _UNINORM_H
+#define _UNINORM_H
+
+/* Get size_t.  */
+#include <stddef.h>
+
+#include "unitypes.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Conventions:
+
+   All functions prefixed with u8_ operate on UTF-8 encoded strings.
+   Their unit is an uint8_t (1 byte).
+
+   All functions prefixed with u16_ operate on UTF-16 encoded strings.
+   Their unit is an uint16_t (a 2-byte word).
+
+   All functions prefixed with u32_ operate on UCS-4 encoded strings.
+   Their unit is an uint32_t (a 4-byte word).
+
+   All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
+   n units.
+
+   Functions returning a string result take a (resultbuf, lengthp) argument
+   pair.  If resultbuf is not NULL and the result fits into *lengthp units,
+   it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
+   allocated string is returned.  In both cases, *lengthp is set to the
+   length (number of units) of the returned string.  In case of error,
+   NULL is returned and errno is set.  */
+
+
+enum
+{
+  UC_DECOMP_CANONICAL,/*            Canonical decomposition.                  */
+  UC_DECOMP_FONT,    /*   <font>    A font variant (e.g. a blackletter form). */
+  UC_DECOMP_NOBREAK, /* <noBreak>   A no-break version of a space or hyphen.  */
+  UC_DECOMP_INITIAL, /* <initial>   An initial presentation form (Arabic).    */
+  UC_DECOMP_MEDIAL,  /*  <medial>   A medial presentation form (Arabic).      */
+  UC_DECOMP_FINAL,   /*  <final>    A final presentation form (Arabic).       */
+  UC_DECOMP_ISOLATED,/* <isolated>  An isolated presentation form (Arabic).   */
+  UC_DECOMP_CIRCLE,  /*  <circle>   An encircled form.                        */
+  UC_DECOMP_SUPER,   /*  <super>    A superscript form.                       */
+  UC_DECOMP_SUB,     /*   <sub>     A subscript form.                         */
+  UC_DECOMP_VERTICAL,/* <vertical>  A vertical layout presentation form.      */
+  UC_DECOMP_WIDE,    /*   <wide>    A wide (or zenkaku) compatibility character. */
+  UC_DECOMP_NARROW,  /*  <narrow>   A narrow (or hankaku) compatibility character. */
+  UC_DECOMP_SMALL,   /*  <small>    A small variant form (CNS compatibility). */
+  UC_DECOMP_SQUARE,  /*  <square>   A CJK squared font variant.               */
+  UC_DECOMP_FRACTION,/* <fraction>  A vulgar fraction form.                   */
+  UC_DECOMP_COMPAT   /*  <compat>   Otherwise unspecified compatibility character. */
+};
+
+/* Maximum size of decomposition of a single Unicode character.  */
+#define UC_DECOMPOSITION_MAX_LENGTH 32
+
+/* Return the character decomposition mapping of a Unicode character.
+   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
+   ucs4_t elements.
+   When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
+   filled and N is returned.  Otherwise -1 is returned.  */
+extern int
+       uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
+
+/* Return the canonical character decomposition mapping of a Unicode character.
+   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
+   ucs4_t elements.
+   When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
+   returned.  Otherwise -1 is returned.  */
+extern int
+       uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
+
+
+/* Attempt to combine the Unicode characters uc1, uc2.
+   uc1 is known to have canonical combining class 0.
+   Return the combination of uc1 and uc2, if it exists.
+   Return 0 otherwise.
+   Not all decompositions can be recombined using this function.  See the
+   Unicode file CompositionExclusions.txt for details.  */
+extern ucs4_t
+       uc_composition (ucs4_t uc1, ucs4_t uc2)
+       _UC_ATTRIBUTE_CONST;
+
+
+/* An object of type uninorm_t denotes a Unicode normalization form.  */
+struct unicode_normalization_form;
+typedef const struct unicode_normalization_form *uninorm_t;
+
+/* UNINORM_NFD: Normalization form D: canonical decomposition.  */
+extern const struct unicode_normalization_form uninorm_nfd;
+#define UNINORM_NFD (&uninorm_nfd)
+
+/* UNINORM_NFC: Normalization form C: canonical decomposition, then
+   canonical composition.  */
+extern const struct unicode_normalization_form uninorm_nfc;
+#define UNINORM_NFC (&uninorm_nfc)
+
+/* UNINORM_NFKD: Normalization form KD: compatibility decomposition.  */
+extern const struct unicode_normalization_form uninorm_nfkd;
+#define UNINORM_NFKD (&uninorm_nfkd)
+
+/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
+   canonical composition.  */
+extern const struct unicode_normalization_form uninorm_nfkc;
+#define UNINORM_NFKC (&uninorm_nfkc)
+
+/* Test whether a normalization form does compatibility decomposition.  */
+#define uninorm_is_compat_decomposing(nf) \
+  ((* (const unsigned int *) (nf) >> 0) & 1)
+
+/* Test whether a normalization form includes canonical composition.  */
+#define uninorm_is_composing(nf) \
+  ((* (const unsigned int *) (nf) >> 1) & 1)
+
+/* Return the decomposing variant of a normalization form.
+   This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD.  */
+extern uninorm_t
+       uninorm_decomposing_form (uninorm_t nf)
+       _UC_ATTRIBUTE_PURE;
+
+
+/* Return the specified normalization form of a string.  */
+extern uint8_t *
+       u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
+                     uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp);
+extern uint16_t *
+       u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
+                      uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp);
+extern uint32_t *
+       u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
+                      uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp);
+
+
+/* Compare S1 and S2, ignoring differences in normalization.
+   NF must be either UNINORM_NFD or UNINORM_NFKD.
+   If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
+   return 0.  Upon failure, return -1 with errno set.  */
+extern int
+       u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
+                   uninorm_t nf, int *resultp);
+extern int
+       u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
+                    uninorm_t nf, int *resultp);
+extern int
+       u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
+                    uninorm_t nf, int *resultp);
+
+
+/* Converts the string S of length N to a NUL-terminated byte sequence, in such
+   a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
+   equivalent to comparing S1 and S2 with uN_normcoll().
+   NF must be either UNINORM_NFC or UNINORM_NFKC.  */
+extern char *
+       u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
+                    char *resultbuf, size_t *lengthp);
+extern char *
+       u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
+                     char *resultbuf, size_t *lengthp);
+extern char *
+       u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
+                     char *resultbuf, size_t *lengthp);
+
+
+/* Compare S1 and S2, ignoring differences in normalization, using the
+   collation rules of the current locale.
+   NF must be either UNINORM_NFC or UNINORM_NFKC.
+   If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
+   return 0.  Upon failure, return -1 with errno set.  */
+extern int
+       u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
+                    uninorm_t nf, int *resultp);
+extern int
+       u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
+                     uninorm_t nf, int *resultp);
+extern int
+       u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
+                     uninorm_t nf, int *resultp);
+
+
+/* Normalization of a stream of Unicode characters.
+
+   A "stream of Unicode characters" is essentially a function that accepts an
+   ucs4_t argument repeatedly, optionally combined with a function that
+   "flushes" the stream.  */
+
+/* Data type of a stream of Unicode characters that normalizes its input
+   according to a given normalization form and passes the normalized character
+   sequence to the encapsulated stream of Unicode characters.  */
+struct uninorm_filter;
+
+/* Bring data buffered in the filter to its destination, the encapsulated
+   stream, then close and free the filter.
+   Return 0 if successful, or -1 with errno set upon failure.  */
+extern int
+       uninorm_filter_free (struct uninorm_filter *filter);
+
+/* Create and return a normalization filter for Unicode characters.
+   The pair (stream_func, stream_data) is the encapsulated stream.
+   stream_func (stream_data, uc) receives the Unicode character uc
+   and returns 0 if successful, or -1 with errno set upon failure.
+   Return the new filter, or NULL with errno set upon failure.  */
+extern struct uninorm_filter *
+       uninorm_filter_create (uninorm_t nf,
+                              int (*stream_func) (void *stream_data, ucs4_t uc),
+                              void *stream_data)
+       _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1);
+
+/* Stuff a Unicode character into a normalizing filter.
+   Return 0 if successful, or -1 with errno set upon failure.  */
+extern int
+       uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
+
+/* Bring data buffered in the filter to its destination, the encapsulated
+   stream.
+   Return 0 if successful, or -1 with errno set upon failure.
+   Note! If after calling this function, additional characters are written
+   into the filter, the resulting character sequence in the encapsulated stream
+   will not necessarily be normalized.  */
+extern int
+       uninorm_filter_flush (struct uninorm_filter *filter);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _UNINORM_H */
diff --git a/lib/uninorm/decompose-internal.c b/lib/uninorm/decompose-internal.c
new file mode 100644
index 0000000..1ed8235
--- /dev/null
+++ b/lib/uninorm/decompose-internal.c
@@ -0,0 +1,29 @@
+/* Decomposition of Unicode strings.
+   Copyright (C) 2009-2022 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 2.1 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "decompose-internal.h"
+
+#define ELEMENT struct ucs4_with_ccc
+#define COMPARE(a,b) ((a)->ccc - (b)->ccc) /* ccc is 0..255: no overflow */
+#define STATIC
+#define STATIC_FROMTO static
+#define merge_sort_fromto gl_uninorm_decompose_merge_sort_fromto
+#define merge_sort_inplace gl_uninorm_decompose_merge_sort_inplace
+#include "array-mergesort.h"
diff --git a/lib/uninorm/decompose-internal.h b/lib/uninorm/decompose-internal.h
new file mode 100644
index 0000000..15d8250
--- /dev/null
+++ b/lib/uninorm/decompose-internal.h
@@ -0,0 +1,36 @@
+/* Decomposition of Unicode strings.
+   Copyright (C) 2009-2022 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 2.1 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <stddef.h>
+
+#include "unitypes.h"
+
+/* Variant of uc_decomposition that does not produce the 'tag'.  */
+extern int
+       uc_compat_decomposition (ucs4_t uc, ucs4_t *decomposition);
+
+/* A Unicode character together with its canonical combining class.  */
+struct ucs4_with_ccc
+{
+  ucs4_t code;
+  int ccc;      /* range 0..255 */
+};
+
+/* Stable-sort an array of 'struct ucs4_with_ccc'.  */
+extern void
+       gl_uninorm_decompose_merge_sort_inplace (struct ucs4_with_ccc *src, size_t n,
+                                                struct ucs4_with_ccc *tmp);
diff --git a/lib/uninorm/normalize-internal.h b/lib/uninorm/normalize-internal.h
new file mode 100644
index 0000000..aae6dbd
--- /dev/null
+++ b/lib/uninorm/normalize-internal.h
@@ -0,0 +1,35 @@
+/* Normalization of Unicode strings.
+   Copyright (C) 2009-2022 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 2.1 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include "unitypes.h"
+
+/* Complete definition of normalization form descriptor.  */
+struct unicode_normalization_form
+{
+  /* Bit mask of NF_* flags.  This must be the first field: the uninorm_is_*
+     macros in uninorm.h read it through a cast of the struct pointer.  */
+  unsigned int description;
+  #define NF_IS_COMPAT_DECOMPOSING  (1 << 0)
+  #define NF_IS_COMPOSING           (1 << 1)
+  /* Function that decomposes a character; returns < 0 if it is atomic.  */
+  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition);
+  /* Function that combines a starter and another character; returns 0 if
+     they do not combine.  May be NULL: then no composition is attempted.  */
+  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2);
+  /* Decomposing variant.  */
+  const struct unicode_normalization_form *decomposing_variant;
+};
diff --git a/lib/uninorm/u-normalize-internal.h b/lib/uninorm/u-normalize-internal.h
new file mode 100644
index 0000000..f0d9c6e
--- /dev/null
+++ b/lib/uninorm/u-normalize-internal.h
@@ -0,0 +1,380 @@
+/* Decomposition and composition of Unicode strings.
+   Copyright (C) 2009-2022 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 2.1 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Return the normalization form NF of the string S[0..N-1].
+   The result is stored in RESULTBUF if that is non-NULL and *LENGTHP units
+   suffice, otherwise in freshly malloc()ed memory; *LENGTHP is set to its
+   length.  Upon failure, return NULL with errno set to ENOMEM, or to EINVAL
+   if a character cannot be encoded.  Abort if a buffer size would overflow.
+   Each character is decomposed recursively; every starter together with the
+   non-starters that follow it is stably sorted by combining class and, if
+   NF has a composer, recombined before being appended to the result.  */
+UNIT *
+FUNC (uninorm_t nf, const UNIT *s, size_t n,
+      UNIT *resultbuf, size_t *lengthp)
+{
+  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
+  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
+
+  /* The result being accumulated.  */
+  UNIT *result;
+  size_t length;
+  size_t allocated;
+  /* The buffer for sorting.  */
+  #define SORTBUF_PREALLOCATED 64
+  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
+  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
+  size_t sortbuf_allocated;
+  size_t sortbuf_count;
+
+  /* Initialize the accumulator.  */
+  result = resultbuf;
+  allocated = (resultbuf != NULL ? *lengthp : 0);
+  length = 0;
+
+  /* Initialize the buffer for sorting.  */
+  sortbuf = sortbuf_preallocated;
+  sortbuf_allocated = SORTBUF_PREALLOCATED;
+  sortbuf_count = 0;
+
+  {
+    const UNIT *s_end = s + n;
+
+    for (;;)
+      {
+        int count;
+        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
+        int decomposed_count;
+        int i;
+
+        if (s < s_end)
+          {
+            /* Fetch the next character.  */
+            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
+            decomposed_count = 1;
+
+            /* Decompose it, recursively.
+               It would be possible to precompute the recursive decomposition
+               and store it in a table.  But this would significantly increase
+               the size of the decomposition tables, because for example for
+               U+1FC1 the recursive canonical decomposition and the recursive
+               compatibility decomposition are different.  */
+            {
+              int curr;
+
+              for (curr = 0; curr < decomposed_count; )
+                {
+                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
+                     all elements are atomic.  */
+                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
+                  int curr_decomposed_count;
+
+                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
+                  if (curr_decomposed_count >= 0)
+                    {
+                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
+                         decomposed[curr], making room.  It's not worth using
+                         memcpy() here, since the counts are so small.  */
+                      int shift = curr_decomposed_count - 1;
+
+                      if (shift < 0)
+                        abort ();
+                      if (shift > 0)
+                        {
+                          int j;
+
+                          decomposed_count += shift;
+                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
+                            abort ();
+                          for (j = decomposed_count - 1 - shift; j > curr; j--)
+                            decomposed[j + shift] = decomposed[j];
+                        }
+                      for (; shift >= 0; shift--)
+                        decomposed[curr + shift] = curr_decomposed[shift];
+                    }
+                  else
+                    {
+                      /* decomposed[curr] is atomic.  */
+                      curr++;
+                    }
+                }
+            }
+          }
+        else
+          {
+            count = 0;
+            decomposed_count = 0;
+          }
+
+        i = 0;
+        for (;;)
+          {
+            ucs4_t uc;
+            int ccc;
+
+            if (s < s_end)
+              {
+                /* Fetch the next character from the decomposition.  */
+                if (i == decomposed_count)
+                  break;
+                uc = decomposed[i];
+                ccc = uc_combining_class (uc);
+              }
+            else
+              {
+                /* End of string reached.  */
+                uc = 0;
+                ccc = 0;
+              }
+
+            if (ccc == 0)
+              {
+                size_t j;
+
+                /* Apply the canonical ordering algorithm to the accumulated
+                   sequence of characters.  */
+                if (sortbuf_count > 1)
+                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
+                                                           sortbuf + sortbuf_count);
+
+                if (composer != NULL)
+                  {
+                    /* Attempt to combine decomposed characters, as specified
+                       in the Unicode Standard Annex #15 "Unicode Normalization
+                       Forms".  We need to check
+                         1. whether the first accumulated character is a
+                            "starter" (i.e. has ccc = 0).  This is usually the
+                            case.  But when the string starts with a
+                            non-starter, the sortbuf also starts with a
+                            non-starter.  Btw, this check could also be
+                            omitted, because the composition table has only
+                            entries (code1, code2) for which code1 is a
+                            starter; if the first accumulated character is not
+                            a starter, no lookup will succeed.
+                         2. If the sortbuf has more than one character, check
+                            for each of these characters that are not "blocked"
+                            from the starter (i.e. have a ccc that is higher
+                            than the ccc of the previous character) whether it
+                            can be combined with the first character.
+                         3. If only one character is left in sortbuf, check
+                            whether it can be combined with the next character
+                            (also a starter).  */
+                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
+                      {
+                        for (j = 1; j < sortbuf_count; )
+                          {
+                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
+                              {
+                                ucs4_t combined =
+                                  composer (sortbuf[0].code, sortbuf[j].code);
+                                if (combined)
+                                  {
+                                    size_t k;
+
+                                    sortbuf[0].code = combined;
+                                    /* sortbuf[0].ccc = 0, still valid.  */
+                                    for (k = j + 1; k < sortbuf_count; k++)
+                                      sortbuf[k - 1] = sortbuf[k];
+                                    sortbuf_count--;
+                                    continue;
+                                  }
+                              }
+                            j++;
+                          }
+                        if (s < s_end && sortbuf_count == 1)
+                          {
+                            ucs4_t combined =
+                              composer (sortbuf[0].code, uc);
+                            if (combined)
+                              {
+                                uc = combined;
+                                ccc = 0;
+                                /* uc could be further combined with subsequent
+                                   characters.  So don't put it into sortbuf[0] in
+                                   this round, only in the next round.  */
+                                sortbuf_count = 0;
+                              }
+                          }
+                      }
+                  }
+
+                for (j = 0; j < sortbuf_count; j++)
+                  {
+                    ucs4_t muc = sortbuf[j].code;
+
+                    /* Append muc to the result accumulator.  */
+                    if (length < allocated)
+                      {
+                        int ret =
+                          U_UCTOMB (result + length, muc, allocated - length);
+                        if (ret == -1)
+                          {
+                            errno = EINVAL;
+                            goto fail;
+                          }
+                        if (ret >= 0)
+                          {
+                            length += ret;
+                            goto done_appending;
+                          }
+                      }
+                    {
+                      size_t new_allocated = 2 * allocated;
+                      if (new_allocated < 64)
+                        new_allocated = 64;
+                      if (new_allocated < allocated /* integer overflow? */
+                          || new_allocated > (size_t) -1 / sizeof (UNIT))
+                        abort ();
+                      {
+                        UNIT *larger_result;
+                        if (result == NULL)
+                          {
+                            larger_result =
+                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
+                            if (larger_result == NULL)
+                              {
+                                errno = ENOMEM;
+                                goto fail;
+                              }
+                          }
+                        else if (result == resultbuf)
+                          {
+                            larger_result =
+                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
+                            if (larger_result == NULL)
+                              {
+                                errno = ENOMEM;
+                                goto fail;
+                              }
+                            U_CPY (larger_result, resultbuf, length);
+                          }
+                        else
+                          {
+                            larger_result =
+                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
+                            if (larger_result == NULL)
+                              {
+                                errno = ENOMEM;
+                                goto fail;
+                              }
+                          }
+                        result = larger_result;
+                        allocated = new_allocated;
+                        {
+                          int ret =
+                            U_UCTOMB (result + length, muc, allocated - length);
+                          if (ret == -1)
+                            {
+                              errno = EINVAL;
+                              goto fail;
+                            }
+                          if (ret < 0)
+                            abort ();
+                          length += ret;
+                          goto done_appending;
+                        }
+                      }
+                    }
+                   done_appending: ;
+                  }
+
+                /* sortbuf is now empty.  */
+                sortbuf_count = 0;
+              }
+
+            if (!(s < s_end))
+              /* End of string reached.  */
+              break;
+
+            /* Append (uc, ccc) to sortbuf.  */
+            if (sortbuf_count == sortbuf_allocated)
+              {
+                struct ucs4_with_ccc *new_sortbuf;
+
+                if (sortbuf_allocated > (size_t) -1 / (4 * sizeof *sortbuf))
+                  abort (); /* the doubled buffer's byte size would overflow */
+                sortbuf_allocated = 2 * sortbuf_allocated;
+                new_sortbuf =
+                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
+                if (new_sortbuf == NULL)
+                  {
+                    errno = ENOMEM;
+                    goto fail;
+                  }
+                memcpy (new_sortbuf, sortbuf,
+                        sortbuf_count * sizeof (struct ucs4_with_ccc));
+                if (sortbuf != sortbuf_preallocated)
+                  free (sortbuf);
+                sortbuf = new_sortbuf;
+              }
+            sortbuf[sortbuf_count].code = uc;
+            sortbuf[sortbuf_count].ccc = ccc;
+            sortbuf_count++;
+
+            i++;
+          }
+
+        if (!(s < s_end))
+          /* End of string reached.  */
+          break;
+
+        s += count;
+      }
+  }
+
+  if (length == 0)
+    {
+      if (result == NULL)
+        {
+          /* Return a non-NULL value.  NULL means error.  */
+          result = (UNIT *) malloc (1);
+          if (result == NULL)
+            {
+              errno = ENOMEM;
+              goto fail;
+            }
+        }
+    }
+  else if (result != resultbuf && length < allocated)
+    {
+      /* Shrink the allocated memory if possible.  */
+      UNIT *memory;
+
+      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
+      if (memory != NULL)
+        result = memory;
+    }
+
+  if (sortbuf_count > 0)
+    abort ();
+  if (sortbuf != sortbuf_preallocated)
+    free (sortbuf);
+
+  *lengthp = length;
+  return result;
+
+ fail:
+  {
+    int saved_errno = errno;
+    if (sortbuf != sortbuf_preallocated)
+      free (sortbuf);
+    if (result != resultbuf)
+      free (result);
+    errno = saved_errno;
+  }
+  return NULL;
+}
diff --git a/lib/uninorm/u8-normalize.c b/lib/uninorm/u8-normalize.c
new file mode 100644
index 0000000..fe40d11
--- /dev/null
+++ b/lib/uninorm/u8-normalize.c
@@ -0,0 +1,46 @@
+/* Normalization of UTF-8 strings.
+   Copyright (C) 2009-2022 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This file is free software.
+   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
+   You can redistribute it and/or modify it under either
+     - the terms of the GNU Lesser General Public License as published
+       by the Free Software Foundation, either version 3, or (at your
+       option) any later version, or
+     - the terms of the GNU General Public License as published by the
+       Free Software Foundation; either version 2, or (at your option)
+       any later version, or
+     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License and the GNU General Public License
+   for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License and of the GNU General Public License along with this
+   program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "uninorm.h"
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "unistr.h"
+#include "unictype.h"
+#include "normalize-internal.h"
+#include "uninorm/decompose-internal.h"
+
+#define FUNC u8_normalize
+#define UNIT uint8_t
+#define U_MBTOUC_UNSAFE u8_mbtouc_unsafe
+#define U_UCTOMB u8_uctomb
+#define U_CPY u8_cpy
+#include "u-normalize-internal.h"