summaryrefslogtreecommitdiffstats
path: root/tests/misc/uniq-collate.sh
diff options
context:
space:
mode:
Diffstat (limited to '')
-rwxr-xr-xtests/misc/uniq-collate.sh63
1 files changed, 63 insertions, 0 deletions
diff --git a/tests/misc/uniq-collate.sh b/tests/misc/uniq-collate.sh
new file mode 100755
index 0000000..8f2cfcf
--- /dev/null
+++ b/tests/misc/uniq-collate.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+# before coreutils-8.32, uniq would not distinguish
+# items which compared equal with strcoll()
+# So ensure we avoid strcoll() for the following cases.
+
+# Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ uniq printf
+
+gen_input()
+{
+ env LC_ALL=$LOCALE_FR_UTF8 printf "$@" > in || framework_failure_
+}
+
+# strcoll() used to return 0 comparing the following strings
+# which was fixed somewhere between glibc-2.22 and glibc-2.30
+gen_input '%s\n' 'ⁿᵘˡˡ' 'ܥܝܪܐܩ'
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+# normalization in strcoll is inconsistent across platforms.
+# glibc based systems at least do _not_ normalize in strcoll,
+# while cygwin systems for example may do so.
+# á composed and decomposed, are generally not compared equal
+gen_input '\u00E1\na\u0301\n'
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+# Similarly with the following equivalent hangul characters
+gen_input '\uAC01\n\u1100\u1161\u11A8\n'
+test $(LC_ALL=ko_KR.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note if running in the wrong locale,
+# strcoll may indicate the strings match when they don't.
+# I.e., cjk and hangul will now work even if
+# uniq is running in the wrong locale
+# hangul (ko_KR.utf8)
+gen_input '\uAC00\n\uAC01\n'
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+# CJK (zh_CN.utf8)
+gen_input '\u3400\n\u3401\n'
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note strcoll() ignores certain characters,
+# but not if the strings are otherwise equal.
+# I.e., the following on glibc-2.30 at least,
+# as expected, does not print a single item,
+# but testing here for illustration
+gen_input ',a\n.a\n'
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+Exit $fail