summaryrefslogtreecommitdiffstats
path: root/lib/uninorm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/uninorm')
-rw-r--r--lib/uninorm/decompose-internal.c28
-rw-r--r--lib/uninorm/decompose-internal.h36
-rw-r--r--lib/uninorm/normalize-internal.h35
-rw-r--r--lib/uninorm/u-normalize-internal.h380
-rw-r--r--lib/uninorm/u8-normalize.c38
5 files changed, 517 insertions, 0 deletions
diff --git a/lib/uninorm/decompose-internal.c b/lib/uninorm/decompose-internal.c
new file mode 100644
index 0000000..e8448d3
--- /dev/null
+++ b/lib/uninorm/decompose-internal.c
@@ -0,0 +1,28 @@
+/* Decomposition of Unicode strings.
+ Copyright (C) 2009-2018 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+/* Specification. */
+#include "decompose-internal.h"
+
+#define ELEMENT struct ucs4_with_ccc
+#define COMPARE(a,b) ((a)->ccc - (b)->ccc)
+#define STATIC
+#define merge_sort_fromto gl_uninorm_decompose_merge_sort_fromto
+#define merge_sort_inplace gl_uninorm_decompose_merge_sort_inplace
+#include "array-mergesort.h"
diff --git a/lib/uninorm/decompose-internal.h b/lib/uninorm/decompose-internal.h
new file mode 100644
index 0000000..039bf60
--- /dev/null
+++ b/lib/uninorm/decompose-internal.h
@@ -0,0 +1,36 @@
+/* Decomposition of Unicode strings.
+ Copyright (C) 2009-2018 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <stddef.h>
+
+#include "unitypes.h"
+
+/* Variant of uc_decomposition that does not produce the 'tag'. */
+extern int
+ uc_compat_decomposition (ucs4_t uc, ucs4_t *decomposition);
+
+/* A Unicode character together with its canonical combining class. */
+struct ucs4_with_ccc
+{
+ ucs4_t code;
+ int ccc; /* range 0..255 */
+};
+
+/* Stable-sort an array of 'struct ucs4_with_ccc'. */
+extern void
+ gl_uninorm_decompose_merge_sort_inplace (struct ucs4_with_ccc *src, size_t n,
+ struct ucs4_with_ccc *tmp);
diff --git a/lib/uninorm/normalize-internal.h b/lib/uninorm/normalize-internal.h
new file mode 100644
index 0000000..ed41695
--- /dev/null
+++ b/lib/uninorm/normalize-internal.h
@@ -0,0 +1,35 @@
+/* Normalization of Unicode strings.
+ Copyright (C) 2009-2018 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include "unitypes.h"
+
+/* Complete definition of normalization form descriptor. */
+struct unicode_normalization_form
+{
+ /* Bit mask containing meta-information.
+ This must be the first field. */
+ unsigned int description;
+ #define NF_IS_COMPAT_DECOMPOSING (1 << 0)
+ #define NF_IS_COMPOSING (1 << 1)
+ /* Function that decomposes a Unicode character. */
+ int (*decomposer) (ucs4_t uc, ucs4_t *decomposition);
+ /* Function that combines two Unicode characters, a starter and another
+ character. */
+ ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2);
+ /* Decomposing variant. */
+ const struct unicode_normalization_form *decomposing_variant;
+};
diff --git a/lib/uninorm/u-normalize-internal.h b/lib/uninorm/u-normalize-internal.h
new file mode 100644
index 0000000..86d5e26
--- /dev/null
+++ b/lib/uninorm/u-normalize-internal.h
@@ -0,0 +1,380 @@
+/* Decomposition and composition of Unicode strings.
+ Copyright (C) 2009-2018 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+UNIT *
+FUNC (uninorm_t nf, const UNIT *s, size_t n,
+ UNIT *resultbuf, size_t *lengthp)
+{
+ int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
+ ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
+
+ /* The result being accumulated. */
+ UNIT *result;
+ size_t length;
+ size_t allocated;
+ /* The buffer for sorting. */
+ #define SORTBUF_PREALLOCATED 64
+ struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
+ struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
+ size_t sortbuf_allocated;
+ size_t sortbuf_count;
+
+ /* Initialize the accumulator. */
+ if (resultbuf == NULL)
+ {
+ result = NULL;
+ allocated = 0;
+ }
+ else
+ {
+ result = resultbuf;
+ allocated = *lengthp;
+ }
+ length = 0;
+
+ /* Initialize the buffer for sorting. */
+ sortbuf = sortbuf_preallocated;
+ sortbuf_allocated = SORTBUF_PREALLOCATED;
+ sortbuf_count = 0;
+
+ {
+ const UNIT *s_end = s + n;
+
+ for (;;)
+ {
+ int count;
+ ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
+ int decomposed_count;
+ int i;
+
+ if (s < s_end)
+ {
+ /* Fetch the next character. */
+ count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
+ decomposed_count = 1;
+
+ /* Decompose it, recursively.
+ It would be possible to precompute the recursive decomposition
+ and store it in a table. But this would significantly increase
+ the size of the decomposition tables, because for example for
+ U+1FC1 the recursive canonical decomposition and the recursive
+ compatibility decomposition are different. */
+ {
+ int curr;
+
+ for (curr = 0; curr < decomposed_count; )
+ {
+ /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
+ all elements are atomic. */
+ ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
+ int curr_decomposed_count;
+
+ curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
+ if (curr_decomposed_count >= 0)
+ {
+ /* Move curr_decomposed[0..curr_decomposed_count-1] over
+ decomposed[curr], making room. It's not worth using
+ memcpy() here, since the counts are so small. */
+ int shift = curr_decomposed_count - 1;
+
+ if (shift < 0)
+ abort ();
+ if (shift > 0)
+ {
+ int j;
+
+ decomposed_count += shift;
+ if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
+ abort ();
+ for (j = decomposed_count - 1 - shift; j > curr; j--)
+ decomposed[j + shift] = decomposed[j];
+ }
+ for (; shift >= 0; shift--)
+ decomposed[curr + shift] = curr_decomposed[shift];
+ }
+ else
+ {
+ /* decomposed[curr] is atomic. */
+ curr++;
+ }
+ }
+ }
+ }
+ else
+ {
+ count = 0;
+ decomposed_count = 0;
+ }
+
+ i = 0;
+ for (;;)
+ {
+ ucs4_t uc;
+ int ccc;
+
+ if (s < s_end)
+ {
+ /* Fetch the next character from the decomposition. */
+ if (i == decomposed_count)
+ break;
+ uc = decomposed[i];
+ ccc = uc_combining_class (uc);
+ }
+ else
+ {
+ /* End of string reached. */
+ uc = 0;
+ ccc = 0;
+ }
+
+ if (ccc == 0)
+ {
+ size_t j;
+
+ /* Apply the canonical ordering algorithm to the accumulated
+ sequence of characters. */
+ if (sortbuf_count > 1)
+ gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
+ sortbuf + sortbuf_count);
+
+ if (composer != NULL)
+ {
+ /* Attempt to combine decomposed characters, as specified
+ in the Unicode Standard Annex #15 "Unicode Normalization
+ Forms". We need to check
+ 1. whether the first accumulated character is a
+ "starter" (i.e. has ccc = 0). This is usually the
+ case. But when the string starts with a
+ non-starter, the sortbuf also starts with a
+ non-starter. Btw, this check could also be
+ omitted, because the composition table has only
+ entries (code1, code2) for which code1 is a
+ starter; if the first accumulated character is not
+ a starter, no lookup will succeed.
+ 2. If the sortbuf has more than one character, check
+ for each of these characters that are not "blocked"
+ from the starter (i.e. have a ccc that is higher
+ than the ccc of the previous character) whether it
+ can be combined with the first character.
+ 3. If only one character is left in sortbuf, check
+ whether it can be combined with the next character
+ (also a starter). */
+ if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
+ {
+ for (j = 1; j < sortbuf_count; )
+ {
+ if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
+ {
+ ucs4_t combined =
+ composer (sortbuf[0].code, sortbuf[j].code);
+ if (combined)
+ {
+ size_t k;
+
+ sortbuf[0].code = combined;
+ /* sortbuf[0].ccc = 0, still valid. */
+ for (k = j + 1; k < sortbuf_count; k++)
+ sortbuf[k - 1] = sortbuf[k];
+ sortbuf_count--;
+ continue;
+ }
+ }
+ j++;
+ }
+ if (s < s_end && sortbuf_count == 1)
+ {
+ ucs4_t combined =
+ composer (sortbuf[0].code, uc);
+ if (combined)
+ {
+ uc = combined;
+ ccc = 0;
+ /* uc could be further combined with subsequent
+ characters. So don't put it into sortbuf[0] in
+ this round, only in the next round. */
+ sortbuf_count = 0;
+ }
+ }
+ }
+ }
+
+ for (j = 0; j < sortbuf_count; j++)
+ {
+ ucs4_t muc = sortbuf[j].code;
+
+ /* Append muc to the result accumulator. */
+ if (length < allocated)
+ {
+ int ret =
+ U_UCTOMB (result + length, muc, allocated - length);
+ if (ret == -1)
+ {
+ errno = EINVAL;
+ goto fail;
+ }
+ if (ret >= 0)
+ {
+ length += ret;
+ goto done_appending;
+ }
+ }
+ {
+ size_t old_allocated = allocated;
+ size_t new_allocated = 2 * old_allocated;
+ if (new_allocated < 64)
+ new_allocated = 64;
+ if (new_allocated < old_allocated) /* integer overflow? */
+ abort ();
+ {
+ UNIT *larger_result;
+ if (result == NULL)
+ {
+ larger_result =
+ (UNIT *) malloc (new_allocated * sizeof (UNIT));
+ if (larger_result == NULL)
+ {
+ errno = ENOMEM;
+ goto fail;
+ }
+ }
+ else if (result == resultbuf)
+ {
+ larger_result =
+ (UNIT *) malloc (new_allocated * sizeof (UNIT));
+ if (larger_result == NULL)
+ {
+ errno = ENOMEM;
+ goto fail;
+ }
+ U_CPY (larger_result, resultbuf, length);
+ }
+ else
+ {
+ larger_result =
+ (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
+ if (larger_result == NULL)
+ {
+ errno = ENOMEM;
+ goto fail;
+ }
+ }
+ result = larger_result;
+ allocated = new_allocated;
+ {
+ int ret =
+ U_UCTOMB (result + length, muc, allocated - length);
+ if (ret == -1)
+ {
+ errno = EINVAL;
+ goto fail;
+ }
+ if (ret < 0)
+ abort ();
+ length += ret;
+ goto done_appending;
+ }
+ }
+ }
+ done_appending: ;
+ }
+
+ /* sortbuf is now empty. */
+ sortbuf_count = 0;
+ }
+
+ if (!(s < s_end))
+ /* End of string reached. */
+ break;
+
+ /* Append (uc, ccc) to sortbuf. */
+ if (sortbuf_count == sortbuf_allocated)
+ {
+ struct ucs4_with_ccc *new_sortbuf;
+
+ sortbuf_allocated = 2 * sortbuf_allocated;
+ if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
+ abort ();
+ new_sortbuf =
+ (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
+ if (new_sortbuf == NULL)
+ {
+ errno = ENOMEM;
+ goto fail;
+ }
+ memcpy (new_sortbuf, sortbuf,
+ sortbuf_count * sizeof (struct ucs4_with_ccc));
+ if (sortbuf != sortbuf_preallocated)
+ free (sortbuf);
+ sortbuf = new_sortbuf;
+ }
+ sortbuf[sortbuf_count].code = uc;
+ sortbuf[sortbuf_count].ccc = ccc;
+ sortbuf_count++;
+
+ i++;
+ }
+
+ if (!(s < s_end))
+ /* End of string reached. */
+ break;
+
+ s += count;
+ }
+ }
+
+ if (length == 0)
+ {
+ if (result == NULL)
+ {
+ /* Return a non-NULL value. NULL means error. */
+ result = (UNIT *) malloc (1);
+ if (result == NULL)
+ {
+ errno = ENOMEM;
+ goto fail;
+ }
+ }
+ }
+ else if (result != resultbuf && length < allocated)
+ {
+ /* Shrink the allocated memory if possible. */
+ UNIT *memory;
+
+ memory = (UNIT *) realloc (result, length * sizeof (UNIT));
+ if (memory != NULL)
+ result = memory;
+ }
+
+ if (sortbuf_count > 0)
+ abort ();
+ if (sortbuf != sortbuf_preallocated)
+ free (sortbuf);
+
+ *lengthp = length;
+ return result;
+
+ fail:
+ {
+ int saved_errno = errno;
+ if (sortbuf != sortbuf_preallocated)
+ free (sortbuf);
+ if (result != resultbuf)
+ free (result);
+ errno = saved_errno;
+ }
+ return NULL;
+}
diff --git a/lib/uninorm/u8-normalize.c b/lib/uninorm/u8-normalize.c
new file mode 100644
index 0000000..abdcac4
--- /dev/null
+++ b/lib/uninorm/u8-normalize.c
@@ -0,0 +1,38 @@
+/* Normalization of UTF-8 strings.
+ Copyright (C) 2009-2018 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+/* Specification. */
+#include "uninorm.h"
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "unistr.h"
+#include "unictype.h"
+#include "normalize-internal.h"
+#include "decompose-internal.h"
+
+#define FUNC u8_normalize
+#define UNIT uint8_t
+#define U_MBTOUC_UNSAFE u8_mbtouc_unsafe
+#define U_UCTOMB u8_uctomb
+#define U_CPY u8_cpy
+#include "u-normalize-internal.h"