summaryrefslogtreecommitdiffstats
path: root/lib/unicase/ignorable.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 03:06:57 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 03:06:57 +0000
commita3eed2c248067f0319cb72bcc8b5e2c7054ea6dc (patch)
treefd79d650c7ffee81608955be5f4fd8edd791834e /lib/unicase/ignorable.c
parentInitial commit. (diff)
downloadwget-a3eed2c248067f0319cb72bcc8b5e2c7054ea6dc.tar.xz
wget-a3eed2c248067f0319cb72bcc8b5e2c7054ea6dc.zip
Adding upstream version 1.20.1.upstream/1.20.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--lib/unicase/ignorable.c71
1 files changed, 71 insertions, 0 deletions
diff --git a/lib/unicase/ignorable.c b/lib/unicase/ignorable.c
new file mode 100644
index 0000000..85ba619
--- /dev/null
+++ b/lib/unicase/ignorable.c
@@ -0,0 +1,71 @@
+/* Test whether a Unicode character is case-ignorable.
+ Copyright (C) 2002, 2006-2007, 2009-2018 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+/* Specification. */
+#include "caseprop.h"
+
+/* Quoting the Unicode standard:
+ Definition: A character is defined to be "case-ignorable" if it has the
+ value MidLetter {or the value MidNumLet} for the Word_Break property or
+ its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
+ Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
+ The text marked in braces was added in Unicode 5.1.0, see
+ <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
+ Definition of case-ignorable". */
+/* Since this predicate is only used for the "Before C" and "After C"
+ conditions of FINAL_SIGMA, we exclude the "cased" characters here.
+ This simplifies the evaluation of the regular expressions
+ \p{cased} (\p{case-ignorable})* C
+ and
+ C (\p{case-ignorable})* \p{cased}
+ */
+
+#if 0
+
+#include "unictype.h"
+#include "uniwbrk.h"
+
+bool
+uc_is_case_ignorable (ucs4_t uc)
+{
+ int wbp = uc_wordbreak_property (uc);
+
+ return (wbp == WBP_MIDLETTER || wbp == WBP_MIDNUMLET
+ || uc_is_general_category_withtable (uc, UC_CATEGORY_MASK_Mn
+ | UC_CATEGORY_MASK_Me
+ | UC_CATEGORY_MASK_Cf
+ | UC_CATEGORY_MASK_Lm
+ | UC_CATEGORY_MASK_Sk))
+ && !uc_is_cased (uc);
+}
+
+#else
+
+#include "unictype/bitmap.h"
+
+/* Define u_casing_property_case_ignorable table. */
+#include "ignorable.h"
+
+bool
+uc_is_case_ignorable (ucs4_t uc)
+{
+ return bitmap_lookup (&u_casing_property_case_ignorable, uc);
+}
+
+#endif