diff options
Diffstat (limited to '')
-rwxr-xr-x | tests/misc/uniq-collate.sh | 63 |
1 files changed, 63 insertions, 0 deletions
diff --git a/tests/misc/uniq-collate.sh b/tests/misc/uniq-collate.sh
new file mode 100755
index 0000000..8f2cfcf
--- /dev/null
+++ b/tests/misc/uniq-collate.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+# Before coreutils-8.32, uniq would not distinguish
+# items which compared equal with strcoll().
+# So ensure we avoid strcoll() for the following cases.
+
+# Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ uniq printf
+
+gen_input()  # write the output of printf "$@" to ./in, with LC_ALL=$LOCALE_FR_UTF8
+{
+  env LC_ALL=$LOCALE_FR_UTF8 printf "$@" > in || framework_failure_
+}
+
+# strcoll() used to return 0 when comparing the following strings,
+# which was fixed somewhere between glibc-2.22 and glibc-2.30.
+gen_input '%s\n' 'ⁿᵘˡˡ' 'ܥܝܪܐܩ'
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+# Normalization in strcoll() is inconsistent across platforms.
+# glibc-based systems at least do _not_ normalize in strcoll(),
+# while Cygwin systems, for example, may do so.
+# á composed and decomposed are generally not compared equal, so expect 2 lines.
+gen_input '\u00E1\na\u0301\n'
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+# Similarly with the following equivalent Hangul characters.
+gen_input '\uAC01\n\u1100\u1161\u11A8\n'
+test $(LC_ALL=ko_KR.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note that when running in the wrong locale,
+# strcoll() may indicate the strings match when they don't.
+# I.e., CJK and Hangul will now work even if
+# uniq is running in the wrong locale (en_US.utf8 is used below).
+# Hangul (ko_KR.utf8)
+gen_input '\uAC00\n\uAC01\n'
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+# CJK (zh_CN.utf8)
+gen_input '\u3400\n\u3401\n'
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note that strcoll() ignores certain characters,
+# but not if the strings are otherwise equal.
+# I.e., the following, on glibc-2.30 at least,
+# does not collapse to a single item, as expected;
+# it is tested here for illustration.
+gen_input ',a\n.a\n'
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+Exit $fail