summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/test/testdata/collationtest.txt
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
commit36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree105e8c98ddea1c1e4784a60a5a6410fa416be2de /intl/icu/source/test/testdata/collationtest.txt
parentInitial commit. (diff)
downloadfirefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
Adding upstream version 115.7.0esr.upstream/115.7.0esrupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/icu/source/test/testdata/collationtest.txt')
-rw-r--r--intl/icu/source/test/testdata/collationtest.txt2585
1 files changed, 2585 insertions, 0 deletions
diff --git a/intl/icu/source/test/testdata/collationtest.txt b/intl/icu/source/test/testdata/collationtest.txt
new file mode 100644
index 0000000000..abda337e54
--- /dev/null
+++ b/intl/icu/source/test/testdata/collationtest.txt
@@ -0,0 +1,2585 @@
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2012-2015 International Business Machines
+# Corporation and others. All Rights Reserved.
+#
+# This file should be in UTF-8 with a signature byte sequence ("BOM").
+#
+# collationtest.txt: Collation test data.
+#
+# created on: 2012apr13
+# created by: Markus W. Scherer
+
+# A line with "** test: description" is used for verbose and error output.
+
+# A collator can be set with "@ root" or "@ locale language-tag",
+# for example "@ locale de-u-co-phonebk".
+# An old-style locale ID can also be used, for example "@ locale de@collation=phonebook".
+
+# A collator can be built with "@ rules".
+# An "@ rules" line is followed by one or more lines with the tailoring rules.
+
+# A collator can be modified with "% attribute=value".
+
+# "* compare" tests the order (= or <) of the following strings.
+# The relation can be "=" or "<" (the level of the difference is not specified)
+# or "<1", "<2", "<c", "<3", "<4", "<i" (indicating the level of the difference; "<c" = case level, "<i" = identical level).
+
+# Test sections ("* compare") are terminated by
+# definitions of new collators, changing attributes, or new test sections.
+
+** test: simple CEs & expansions
+# Many types of mappings are tested elsewhere, including via the UCA conformance tests.
+# Here we mostly cover a few unusual mappings.
+@ rules
+&\x01 # most control codes are ignorable
+<<<\u0300 # tertiary CE
+&9<\x00 # NUL not ignorable
+&\uA00A\uA00B=\uA002 # two long-primary CEs
+&\uA00A\uA00B\u00050005=\uA003 # three CEs, require 64 bits
+
+* compare
+= \x01
+= \x02
+<3 \u0300
+<1 9
+<1 \x00
+= \x01\x00\x02
+<1 a
+<3 a\u0300
+<2 a\u0308
+= ä
+<1 b
+<1 か # Hiragana Ka (U+304B)
+<2 か\u3099 # plus voiced sound mark
+= が # Hiragana Ga (U+304C)
+<1 \uA00A\uA00B
+= \uA002
+<1 \uA00A\uA00B\u00050004
+<1 \uA00A\uA00B\u00050005
+= \uA003
+<1 \uA00A\uA00B\u00050006
+
+** test: contractions
+# Create some interesting mappings, and map some normalization-inert characters
+# (which are not subject to canonical reordering)
+# to some of the same CEs to check the sequence of CEs.
+@ rules
+
+# Contractions starting with 'a' should not continue with any character < U+0300
+# so that we can test a shortcut for that.
+&a=ⓐ
+&b<bz=ⓑ
+&d<dz\u0301=ⓓ # d+z+acute
+&z
+<a\u0301=Ⓐ # a+acute sorts after z
+<a\u0301\u0301=Ⓑ # a+acute+acute
+<a\u0301\u0301\u0358=Ⓒ # a+acute+acute+dot above right
+<a\u030a=Ⓓ # a+ring
+<a\u0323=Ⓔ # a+dot below
+<a\u0323\u0358=Ⓕ # a+dot below+dot above right
+<a\u0327\u0323\u030a=Ⓖ # a+cedilla+dot below+ring
+<a\u0327\u0323bz=Ⓗ # a+cedilla+dot below+b+z
+
+&\U0001D158=⁰ # musical notehead black (has a symbol primary)
+<\U0001D158\U0001D165=¼ # musical quarter note
+
+# deliberately missing prefix contractions:
+# dz
+# a\u0327
+# a\u0327\u0323
+# a\u0327\u0323b
+
+&\x01
+<<<\U0001D165=¹ # musical stem (ccc=216)
+<<<\U0001D16D=² # musical augmentation dot (ccc=226)
+<<<\U0001D165\U0001D16D=³ # stem+dot (ccc=216 226)
+&\u0301=❶ # acute (ccc=230)
+&\u030a=❷ # ring (ccc=230)
+&\u0308=❸ # diaeresis (ccc=230)
+<<\u0308\u0301=❹ # diaeresis+acute (=dialytika tonos) (ccc=230 230)
+&\u0327=❺ # cedilla (ccc=202)
+&\u0323=❻ # dot below (ccc=220)
+&\u0331=❼ # macron below (ccc=220)
+<<\u0331\u0358=❽ # macron below+dot above right (ccc=220 232)
+&\u0334=❾ # tilde overlay (ccc=1)
+&\u0358=❿ # dot above right (ccc=232)
+
+&\u0f71=① # tibetan vowel sign aa
+&\u0f72=② # tibetan vowel sign i
+# \u0f71\u0f72 # tibetan vowel sign aa + i = ii = U+0F73
+&\u0f73=③ # tibetan vowel sign ii (ccc=0 but lccc=129)
+
+** test: simple contractions
+
+# Some strings are chosen to cause incremental contiguous contraction matching to
+# go into partial matches for prefixes of contractions
+# (where the prefixes are deliberately not also contractions).
+# When there is no complete match, then the matching code must back out of those
+# so that discontiguous contractions work as specified.
+
+* compare
+# contraction starter with no following text, or mismatch, or blocked
+<1 a
+= ⓐ
+<1 aa
+= ⓐⓐ
+<1 ab
+= ⓐb
+<1 az
+= ⓐz
+
+* compare
+<1 a
+<2 a\u0308\u030a # ring blocked by diaeresis
+= ⓐ❸❷
+<2 a\u0327
+= ⓐ❺
+
+* compare
+<2 \u0308
+= ❸
+<2 \u0308\u030a\u0301 # acute blocked by ring
+= ❸❷❶
+
+* compare
+<1 \U0001D158
+= ⁰
+<1 \U0001D158\U0001D165
+= ¼
+
+# no discontiguous contraction because of missing prefix contraction d+z,
+# and a starter ('z') after the 'd'
+* compare
+<1 dz\u0323\u0301
+= dz❻❶
+
+# contiguous contractions
+* compare
+<1 abz
+= ⓐⓑ
+<1 abzz
+= ⓐⓑz
+
+* compare
+<1 a
+<1 z
+<1 a\u0301
+= Ⓐ
+<1 a\u0301\u0301
+= Ⓑ
+<1 a\u0301\u0301\u0358
+= Ⓒ
+<1 a\u030a
+= Ⓓ
+<1 a\u0323\u0358
+= Ⓕ
+<1 a\u0327\u0323\u030a # match despite missing prefix
+= Ⓖ
+<1 a\u0327\u0323bz
+= Ⓗ
+
+* compare
+<2 \u0308\u0308\u0301 # acute blocked from first diaeresis, contracts with second
+= ❸❹
+
+* compare
+<1 \U0001D158\U0001D165
+= ¼
+
+* compare
+<3 \U0001D165\U0001D16D
+= ³
+
+** test: discontiguous contractions
+* compare
+<1 a\u0327\u030a # a+ring skips cedilla
+= Ⓓ❺
+<2 a\u0327\u0327\u030a # a+ring skips 2 cedillas
+= Ⓓ❺❺
+<2 a\u0327\u0327\u0327\u030a # a+ring skips 3 cedillas
+= Ⓓ❺❺❺
+<2 a\u0334\u0327\u0327\u030a # a+ring skips tilde overlay & 2 cedillas
+= Ⓓ❾❺❺
+<1 a\u0327\u0323 # a+dot below skips cedilla
+= Ⓔ❺
+<1 a\u0323\u0301\u0358 # a+dot below+dot ab.r.: 2-char match, then skips acute
+= Ⓕ❶
+<2 a\u0334\u0323\u0358 # a+dot below skips tilde overlay
+= Ⓕ❾
+
+* compare
+<2 \u0331\u0331\u0358 # macron below+dot ab.r. skips the second macron below
+= ❽❼
+
+* compare
+<1 a\u0327\u0331\u0323\u030a # a+ring skips cedilla, macron below, dot below (dot blocked by macron)
+= Ⓓ❺❼❻
+<1 a\u0327\u0323\U0001D16D\u030a # a+dot below skips cedilla
+= Ⓔ❺²❷
+<2 a\u0327\u0327\u0323\u030a # a+dot below skips 2 cedillas
+= Ⓔ❺❺❷
+<2 a\u0327\u0323\u0323\u030a # a+dot below skips cedilla
+= Ⓔ❺❻❷
+<2 a\u0334\u0327\u0323\u030a # a+dot below skips tilde overlay & cedilla
+= Ⓔ❾❺❷
+
+* compare
+<1 \U0001D158\u0327\U0001D165 # quarter note skips cedilla
+= ¼❺
+<1 a\U0001D165\u0323 # a+dot below skips stem
+= Ⓔ¹
+
+# partial contiguous match, backs up, matches discontiguous contraction
+<1 a\u0327\u0323b
+= Ⓔ❺b
+<1 a\u0327\u0323ba
+= Ⓔ❺bⓐ
+
+# a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks
+* compare
+<1 a\u0327\u0301\u0301\u0358
+= Ⓒ❺
+
+# FCD but not NFD
+* compare
+<1 a\u0f73\u0301 # a+acute skips tibetan ii
+= Ⓐ③
+
+# FCD but the 0f71 inside the 0f73 must be skipped
+# to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73
+* compare
+<1 \u0f71\u0f73 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72
+= ③①
+
+** test: discontiguous contractions with nested contractions
+* compare
+<1 a\u0323\u0308\u0301\u0358
+= Ⓕ❹
+<2 a\u0323\u0308\u0301\u0308\u0301\u0358
+= Ⓕ❹❹
+
+** test: discontiguous contractions with interleaved contractions
+* compare
+# a+ring & cedilla & macron below+dot above right
+<1 a\u0327\u0331\u030a\u0358
+= Ⓓ❺❽
+
+# a+ring & 1x..3x macron below+dot above right
+<2 a\u0331\u030a\u0358
+= Ⓓ❽
+<2 a\u0331\u0331\u030a\u0358\u0358
+= Ⓓ❽❽
+# also skips acute
+<2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358
+= Ⓓ❽❽❽❶
+
+# a+dot below & stem+augmentation dot, followed by contiguous d+z+acute
+<1 a\U0001D165\u0323\U0001D16Ddz\u0301
+= Ⓔ³ⓓ
+
+** test: some simple string comparisons
+@ root
+* compare
+# first string compares against ""
+= \u0000
+< a
+<1 b
+<3 B
+= \u0000B\u0000
+
+** test: compare with strength=primary
+% strength=primary
+* compare
+<1 a
+<1 b
+= B
+
+** test: compare with strength=secondary
+% strength=secondary
+* compare
+<1 a
+<1 b
+= B
+
+** test: compare with strength=tertiary
+% strength=tertiary
+* compare
+<1 a
+<1 b
+<3 B
+
+** test: compare with strength=quaternary
+% strength=quaternary
+* compare
+<1 a
+<1 b
+<3 B
+
+** test: compare with strength=identical
+% strength=identical
+* compare
+<1 a
+<1 b
+<3 B
+
+** test: côté with forwards secondary
+@ root
+* compare
+<1 cote
+<2 coté
+<2 côte
+<2 côté
+
+** test: côté with forwards secondary vs. U+FFFE merge separator
+# Merged sort keys: On each level, any difference in the first segment
+# must trump any further difference.
+* compare
+<1 cote\uFFFEcôté
+<2 coté\uFFFEcôte
+<2 côte\uFFFEcoté
+<2 côté\uFFFEcote
+
+** test: côté with backwards secondary
+% backwards=on
+* compare
+<1 cote
+<2 côte
+<2 coté
+<2 côté
+
+** test: côté with backwards secondary vs. U+FFFE merge separator
+# Merged sort keys: On each level, any difference in the first segment
+# must trump any further difference.
+* compare
+<1 cote\uFFFEcôté
+<2 côte\uFFFEcoté
+<2 coté\uFFFEcôte
+<2 côté\uFFFEcote
+
+** test: U+FFFE on identical level
+@ root
+% strength=identical
+* compare
+# All of these control codes are completely-ignorable, so that
+# their low code points are compared with the merge separator.
+# The merge separator must compare less than any other character.
+<1 \uFFFE\u0001\u0002\u0003
+<i \u0001\uFFFE\u0002\u0003
+<i \u0001\u0002\uFFFE\u0003
+<i \u0001\u0002\u0003\uFFFE
+
+* compare
+# The merge separator must even compare less than U+0000.
+<1 \uFFFE\u0000\u0000
+<i \u0000\uFFFE\u0000
+<i \u0000\u0000\uFFFE
+
+** test: Hani < surrogates < U+FFFD
+# Note: compareUTF8() treats unpaired surrogates like U+FFFD,
+# so with that the strings with surrogates will compare equal to each other
+# and equal to the string with U+FFFD.
+@ root
+% strength=identical
+* compare
+<1 abz
+<1 a\u4e00z
+<1 a\U00020000z
+<1 a\ud800z
+<1 a\udbffz
+<1 a\udc00z
+<1 a\udfffz
+<1 a\ufffdz
+
+** test: script reordering
+@ root
+% reorder Hani Zzzz digit
+* compare
+<1 ?
+<1 +
+<1 丂
+<1 a
+<1 α
+<1 5
+
+% reorder default
+* compare
+<1 ?
+<1 +
+<1 5
+<1 a
+<1 α
+<1 丂
+
+** test: empty rules
+@ rules
+* compare
+<1 a
+<2 ä
+<3 Ä
+<1 b
+
+** test: very simple rules
+@ rules
+&a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z
+% strength=quaternary
+* compare
+<1 a
+= e
+<4 q
+<4 r
+<1 x
+<3 X
+<2 y
+<3 Y
+<2 z
+<3 Z
+
+** test: tailoring twice before a root position: primary
+@ rules
+&[before 1]b<p
+&[before 1]b<q
+* compare
+<1 a
+<1 p
+<1 q
+<1 b
+
+** test: tailoring twice before a root position: secondary
+@ rules
+&[before 2]ſ<<p
+&[before 2]ſ<<q
+* compare
+<1 s
+<2 p
+<2 q
+<2 ſ
+
+# secondary-before common weight
+@ rules
+&[before 2]b<<p
+&[before 2]b<<q
+* compare
+<1 a
+<1 p
+<2 q
+<2 b
+
+** test: tailoring twice before a root position: tertiary
+@ rules
+&[before 3]B<<<p
+&[before 3]B<<<q
+* compare
+<1 b
+<3 p
+<3 q
+<3 B
+
+# tertiary-before common weight
+@ rules
+&[before 3]b<<<p
+&[before 3]b<<<q
+* compare
+<1 a
+<1 p
+<3 q
+<3 b
+
+@ rules
+&[before 2]b<<s
+&[before 3]s<<<p
+&[before 3]s<<<q
+* compare
+<1 a
+<1 p
+<3 q
+<3 s
+<2 b
+
+** test: tailor after completely ignorable
+@ rules
+&\x00<<<x<<y
+* compare
+= \x00
+= \x1F
+<3 x
+<2 y
+
+** test: secondary tailoring gaps, ICU ticket 9362
+@ rules
+&[before 2]s<<'_'
+&s<<r # secondary between s and ſ (long s)
+&ſ<<*a-q # more than 15 between ſ and secondary CE boundary
+&[before 2][first primary ignorable]<<u<<v # between secondary CE boundary & lowest secondary CE
+&[last primary ignorable]<<y<<z
+
+* compare
+<2 u
+<2 v
+<2 \u0332 # lowest secondary CE
+<2 \u0308
+<2 y
+<2 z
+<1 s_
+<2 ss
+<2 sr
+<2 sſ
+<2 sa
+<2 sb
+<2 sp
+<2 sq
+<2 sus
+<2 svs
+<2 rs
+
+** test: tertiary tailoring gaps, ICU ticket 9362
+@ rules
+&[before 3]t<<<'_'
+&t<<<r # tertiary between t and fullwidth t
+&ᵀ<<<*a-q # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary
+&[before 3][first secondary ignorable]<<<u<<<v # between tertiary CE boundary & lowest tertiary CE
+&[last secondary ignorable]<<<y<<<z
+
+* compare
+<3 u
+<3 v
+# Note: The root collator currently does not map any characters to tertiary CEs.
+<3 y
+<3 z
+<1 t_
+<3 tt
+<3 tr
+<3 tｔ
+<3 tᵀ
+<3 ta
+<3 tb
+<3 tp
+<3 tq
+<3 tut
+<3 tvt
+<3 rt
+
+** test: secondary & tertiary around root character
+@ rules
+&[before 2]m<<r
+&m<<s
+&[before 3]m<<<u
+&m<<<v
+* compare
+<1 l
+<1 r
+<2 u
+<3 m
+<3 v
+<2 s
+<1 n
+
+** test: secondary & tertiary around tailored item
+@ rules
+&m<x
+&[before 2]x<<r
+&x<<s
+&[before 3]x<<<u
+&x<<<v
+* compare
+<1 m
+<1 r
+<2 u
+<3 x
+<3 v
+<2 s
+<1 n
+
+** test: more nesting of secondary & tertiary before
+@ rules
+&[before 3]m<<<u
+&[before 2]m<<r
+&[before 3]r<<<q
+&m<<<w
+&m<<t
+&[before 3]w<<<v
+&w<<<x
+&w<<s
+* compare
+<1 l
+<1 q
+<3 r
+<2 u
+<3 m
+<3 v
+<3 w
+<3 x
+<2 s
+<2 t
+<1 n
+
+** test: case bits
+@ rules
+&w<x # tailored CE getting case bits
+ =uv=uV=Uv=UV # 2 chars -> 1 CE
+&ae=ch=cH=Ch=CH # 2 chars -> 2 CEs
+&rst=yz=yZ=Yz=YZ # 2 chars -> 3 CEs
+% caseFirst=lower
+* compare
+<1 ae
+= ch
+<3 cH
+<3 Ch
+<3 CH
+<1 rst
+= yz
+<3 yZ
+<3 Yz
+<3 YZ
+<1 w
+<1 x
+= uv
+<3 uV
+= Uv # mixed case on single CE cannot distinguish variations
+<3 UV
+
+** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower
+@ rules
+&\u0001<<<t<<<T # tertiary CEs
+% caseFirst=lower
+* compare
+<1 aa
+<3 aat
+<3 aaT
+<3 aA
+<3 aAt
+<3 ata
+<3 aTa
+
+** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper
+% caseFirst=upper
+* compare
+<1 aA
+<3 aAt
+<3 aa
+<3 aat
+<3 aaT
+<3 ata
+<3 aTa
+
+** test: reset on expansion, ICU tickets 9415 & 9593
+@ rules
+&æ<x # tailor the last primary CE so that x sorts between ae and af
+&æb=bæ # copy all reset CEs to make bæ sort the same
+&각<h # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂
+&⒀<<y # copy/tailor 4 CEs to make y sort with only a secondary difference
+&l·=z # handle the pre-context for · when fetching reset CEs
+ <<u # copy/tailor 2 CEs
+
+* compare
+<1 ae
+<2 æ
+<1 x
+<1 af
+
+* compare
+<1 aeb
+<2 æb
+= bæ
+
+* compare
+<1 각
+<1 h
+<1 갂
+<1 갃
+
+* compare
+<1 · # by itself: primary CE
+<1 l
+<2 l· # l+middle dot has only a secondary difference from l
+= z
+<2 u
+
+* compare
+<1 (13)
+<3 ⒀ # DUCET sets special tertiary weights in all CEs
+<2 y
+<1 (13[
+
+% alternate=shifted
+* compare
+<1 (13)
+= 13
+<3 ⒀
+= y # alternate=shifted removes the tailoring difference on the last CE
+<1 14
+
+** test: contraction inside extension, ICU ticket 9378
+@ rules
+&а<<х/й # all letters are Cyrillic
+* compare
+<1 ай
+<2 х
+
+** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104
+@ rules
+&t<x &ᵀ<y # same primary weights
+&q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent
+* compare
+<1 q
+<1 u
+<1 v
+<1 ꝗ
+<1 t
+<3 ᵀ
+<1 y
+<1 x
+
+# Principle: Each rule builds on the state of preceding rules and ignores following rules.
+
+** test: later rule does not affect earlier reset position, ICU ticket 10105
+@ rules
+&a < u < v < w &ov < x &b < v
+* compare
+<1 oa
+<1 ou
+<1 x # CE(o) followed by CE between u and w
+<1 ow
+<1 ob
+<1 ov
+
+** test: later rule does not affect earlier extension (1), ICU ticket 10105
+@ rules
+&a=x/b &v=b
+% strength=secondary
+* compare
+<1 B
+<1 c
+<1 v
+= b
+* compare
+<1 AB
+= x
+<1 ac
+<1 av
+= ab
+
+** test: later rule does not affect earlier extension (2), ICU ticket 10105
+@ rules
+&a <<< c / e &g <<< e / l
+% strength=secondary
+* compare
+<1 AE
+= c
+<2 æ
+<1 agl
+= ae
+
+** test: later rule does not affect earlier extension (3), ICU ticket 10105
+@ rules
+&a = b / c &d = c / e
+% strength=secondary
+* compare
+<1 AC # C is still only tertiary different from the original c
+= b
+<1 ade
+= ac
+
+** test: extension contains tailored character, ICU ticket 10105
+@ rules
+&a=e &b=u/e
+* compare
+<1 a
+= e
+<1 ba
+= be
+= u
+
+** test: add simple mappings for characters with root context
+@ rules
+&z=· # middle dot has a prefix mapping in the CLDR root
+&n=и # и (U+0438) has contractions in the root
+* compare
+<1 l
+<2 l· # root mapping for l|· still works
+<1 z
+= ·
+* compare
+<1 n
+= и
+<1 И
+<1 и\u0306 # root mapping for й=и\u0306 still works
+= й
+<3 Й
+
+** test: add context mappings around characters with root context
+@ rules
+&z=·h # middle dot has a prefix mapping in the CLDR root
+&n=ә|и # и (U+0438) has contractions in the root
+* compare
+<1 l
+<2 l· # root mapping for l|· still works
+<1 z
+= ·h
+* compare
+<1 и
+<3 И
+<1 и\u0306 # root mapping for й=и\u0306 still works
+= й
+* compare
+<1 әn
+= әи
+<1 әo
+
+** test: many secondary CEs at the top of their range
+@ rules
+&[last primary ignorable]<<*\u2801-\u28ff
+* compare
+<2 \u0308
+<2 \u2801
+<2 \u2802
+<2 \u2803
+<2 \u2804
+<2 \u28fd
+<2 \u28fe
+<2 \u28ff
+<1 \x20
+
+** test: many tertiary CEs at the top of their range
+@ rules
+&[last secondary ignorable]<<<*a-z
+* compare
+<3 a
+<3 b
+<3 c
+<3 d
+# e..w
+<3 x
+<3 y
+<3 z
+<2 \u0308
+
+** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101
+@ rules
+&a=p|x &b=px &c=op
+* compare
+<1 b
+= px
+<3 B
+<1 c
+= op
+<3 C
+* compare
+<1 ca
+= opx # first contraction op, then prefix p|x
+<3 cA
+<3 Ca
+
+** test: reset position with prefix (pre-context), ICU ticket 10102
+@ rules
+&a=p|x &px=y
+* compare
+<1 pa
+= px
+= y
+<3 pA
+<1 q
+<1 x
+
+** test: prefix+contraction together (1), ICU ticket 10071
+@ rules
+&x=a|bc
+* compare
+<1 ab
+<1 Abc
+<1 abd
+<1 ac
+<1 aw
+<1 ax
+= abc
+<3 aX
+<3 Ax
+<1 b
+<1 bb
+<1 bc
+<3 bC
+<3 Bc
+<1 bd
+
+** test: prefix+contraction together (2), ICU ticket 10071
+@ rules
+&w=bc &x=a|b
+* compare
+<1 w
+= bc
+<3 W
+* compare
+<1 aw
+<1 ax
+= ab
+<3 aX
+<1 axb
+<1 axc
+= abc # prefix match a|b takes precedence over contraction match bc
+<3 abC
+<1 abd
+<1 ay
+
+** test: prefix+contraction together (3), ICU ticket 10071
+@ rules
+&x=a|b &w=bc # reverse order of rules as previous test, order should not matter here
+* compare # same "compare" sequences as previous test
+<1 w
+= bc
+<3 W
+* compare
+<1 aw
+<1 ax
+= ab
+<3 aX
+<1 axb
+<1 axc
+= abc # prefix match a|b takes precedence over contraction match bc
+<3 abC
+<1 abd
+<1 ay
+
+** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962
+@ rules
+&d=ch &v=p|ci
+* compare
+<1 pc
+<3 pC
+<1 pcH
+<1 pcI
+<1 pd
+= pch # no-prefix contraction ch matches
+<3 pD
+<1 pv
+= pci # prefix+contraction p|ci matches
+<3 pV
+
+** test: tailor in & around compact ranges of root primaries
+# The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs
+# which should be reliably encoded as one range in the root elements data.
+@ rules
+&[before 1]ᚁ<a
+&ᚁ<b
+&[before 1]ᚂ<c
+&ᚂ<d
+&[before 1]ᚚ<y
+&ᚚ<z
+&[before 2]ᚁ<<r
+&ᚁ<<s
+&[before 3]ᚚ<<<t
+&ᚚ<<<u
+* compare
+<1 ᣵ # U+18F5 last Canadian Aboriginal
+<1 a
+<1 r
+<2 ᚁ
+<2 s
+<1 b
+<1 c
+<1 ᚂ
+<1 d
+<1 ᚃ
+<1 ᚙ
+<1 y
+<1 t
+<3 ᚚ
+<3 u
+<1 z
+<1 ᚠ # U+16A0 first Runic
+
+** test: suppressContractions
+@ rules
+&z<ch<әж [suppressContractions [·cә]]
+* compare
+<1 ch
+<3 cH # ch was suppressed
+<1 l
+<1 l· # primary difference, not secondary, because l|· was suppressed
+<1 ә
+<2 ә\u0308 # secondary difference, not primary, because contractions for ә were suppressed
+<1 әж
+<3 әЖ
+
+** test: Hangul & Jamo
+@ rules
+&L=\u1100 # first Jamo L
+&V=\u1161 # first Jamo V
+&T=\u11A8 # first Jamo T
+&\uAC01<<*\u4E00-\u4EFF # first Hangul LVT syllable & lots of secondary diffs
+* compare
+<1 Lv
+<3 LV
+= \u1100\u1161
+= \uAC00
+<1 LVt
+<3 LVT
+= \u1100\u1161\u11A8
+= \uAC00\u11A8
+= \uAC01
+<2 LVT\u0308
+<2 \u4E00
+<2 \u4E01
+<2 \u4E80
+<2 \u4EFF
+<2 LV\u0308T
+<1 \uAC02
+
+** test: adjust special reset positions according to previous rules, CLDR ticket 6070
+@ rules
+&[last variable]<x
+[maxVariable space] # has effect only after building, no effect on following rules
+&[last variable]<y
+&[before 1][first regular]<z
+* compare
+<1 ? # some punctuation
+<1 x
+<1 y
+<1 z
+<1 $ # some symbol
+
+@ rules
+&[last primary ignorable]<<x<<<y
+&[last primary ignorable]<<z
+* compare
+<2 \u0358
+<2 x
+<3 y
+<2 z
+<1 \x20
+
+@ rules
+&[last secondary ignorable]<<<x
+&[last secondary ignorable]<<<y
+* compare
+<3 x
+<3 y
+<2 \u0358
+
+@ rules
+&[before 2][first variable]<<z
+&[before 2][first variable]<<y
+&[before 3][first variable]<<<x
+&[before 3][first variable]<<<w
+&[before 1][first variable]<v
+&[before 2][first variable]<<u
+&[before 3][first variable]<<<t
+&[before 2]\uFDD1\xA0<<s # FractionalUCA.txt: FDD1 00A0, SPACE first primary
+* compare
+<2 \u0358
+<1 s
+<2 \uFDD1\xA0
+<1 t
+<3 u
+<2 v
+<1 w
+<3 x
+<3 y
+<2 z
+<2 \t
+
+@ rules
+&[before 2][first regular]<<z
+&[before 3][first regular]<<<y
+&[before 1][first regular]<x
+&[before 3][first regular]<<<w
+&[before 2]\uFDD1\u263A<<v # FractionalUCA.txt: FDD1 263A, SYMBOL first primary
+&[before 3][first regular]<<<u
+&[before 1][first regular]<p # primary before the boundary: becomes variable
+&[before 3][first regular]<<<t # not affected by p
+&[last variable]<q # after p!
+* compare
+<1 ?
+<1 p
+<1 q
+<1 t
+<3 u
+<3 v
+<1 w
+<3 x
+<1 y
+<3 z
+<1 $
+
+# check that p & q are indeed variable
+% alternate=shifted
+* compare
+= ?
+= p
+= q
+<1 t
+<3 u
+<3 v
+<1 w
+<3 x
+<1 y
+<3 z
+<1 $
+
+@ rules
+&[before 2][first trailing]<<z
+&[before 1][first trailing]<y
+&[before 3][first trailing]<<<x
+* compare
+<1 \u4E00 # first Han, first implicit
+<1 \uFDD1\uFDD0 # FractionalUCA.txt: unassigned first primary
+# Note: The root collator currently does not map any characters to the trailing first boundary primary.
+<1 x
+<3 y
+<1 z
+<2 \uFFFD # The root collator currently maps U+FFFD to the first real trailing primary.
+
+@ rules
+&[before 2][first primary ignorable]<<z
+&[before 2][first primary ignorable]<<y
+&[before 3][first primary ignorable]<<<x
+&[before 3][first primary ignorable]<<<w
+* compare
+= \x01
+<2 w
+<3 x
+<3 y
+<2 z
+<2 \u0301
+
+@ rules
+&[before 3][first secondary ignorable]<<<y
+&[before 3][first secondary ignorable]<<<x
+* compare
+= \x01
+<3 x
+<3 y
+<2 \u0301
+
+** test: canonical closure
+@ rules
+&X=A &U=Â
+* compare
+<1 U
+= Â
+= A\u0302
+<2 Ú # U with acute
+= U\u0301
+= Ấ # A with circumflex & acute
+= Â\u0301
+= A\u0302\u0301
+<1 X
+= A
+<2 X\u030A # with ring above
+= Å
+= A\u030A
+= \u212B # Angstrom sign
+
+@ rules
+&x=\u5140\u55C0
+* compare
+<1 x
+= \u5140\u55C0
+= \u5140\uFA0D
+= \uFA0C\u55C0
+= \uFA0C\uFA0D # CJK compatibility characters
+<3 X
+
+# canonical closure on prefix rules, ICU ticket 9444
+@ rules
+&x=ä|ŝ
+* compare
+<1 äs # not tailored
+<1 äx
+= äŝ
+= a\u0308s\u0302
+= a\u0308ŝ
+= äs\u0302
+<3 äX
+
+** test: conjoining Jamo map to expansions
+@ rules
+&gg=\u1101 # Jamo Lead consonant GG
+&nj=\u11AC # Jamo Trail consonant NJ
+* compare
+<1 gg\u1161nj
+= \u1101\u1161\u11AC
+= \uAE4C\u11AC
+= \uAE51
+<3 gg\u1161nJ
+<1 \u1100\u1100
+
+** test: canonical tail closure, ICU ticket 5913
+@ rules
+&a<â
+* compare
+<1 a
+<1 â # tailored
+= a\u0302
+<2 a\u0323\u0302 # discontiguous contraction
+= ạ\u0302 # equivalent
+= ậ # equivalent
+<1 b
+
+@ rules
+&a<ạ
+* compare
+<1 a
+<1 ạ # tailored
+= a\u0323
+<2 a\u0323\u0302 # contiguous contraction plus extra diacritic
+= ạ\u0302 # equivalent
+= ậ # equivalent
+<1 b
+
+# Tail closure should work even if there is a prefix and/or contraction.
+@ rules
+&a<\u5140|câ
+# In order to find discontiguous contractions for \u5140|câ
+# there must exist a mapping for \u5140|ca, regardless of what it maps to.
+# (This follows from the UCA spec.)
+&x=\u5140|ca
+* compare
+<1 \u5140a
+= \uFA0Ca
+<1 \u5140câ # tailored
+= \uFA0Ccâ
+= \u5140ca\u0302
+= \uFA0Cca\u0302
+<2 \u5140ca\u0323\u0302 # discontiguous contraction
+= \uFA0Cca\u0323\u0302
+= \u5140cạ\u0302
+= \uFA0Ccạ\u0302
+= \u5140cậ
+= \uFA0Ccậ
+<1 \u5140b
+= \uFA0Cb
+<1 \u5140x
+= \u5140ca
+
+# Double-check that without the extra mapping there will be no discontiguous match.
+@ rules
+&a<\u5140|câ
+* compare
+<1 \u5140a
+= \uFA0Ca
+<1 \u5140câ # tailored
+= \uFA0Ccâ
+= \u5140ca\u0302
+= \uFA0Cca\u0302
+<1 \u5140b
+= \uFA0Cb
+<1 \u5140ca\u0323\u0302 # no discontiguous contraction
+= \uFA0Cca\u0323\u0302
+= \u5140cạ\u0302
+= \uFA0Ccạ\u0302
+= \u5140cậ
+= \uFA0Ccậ
+
+@ rules
+&a<cạ
+* compare
+<1 a
+<1 cạ # tailored
+= ca\u0323
+<2 ca\u0323\u0302 # contiguous contraction plus extra diacritic
+= cạ\u0302 # equivalent
+= cậ # equivalent
+<1 b
+
+# ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+# = 03C9 0313 0300 0345
+# ccc = 0, 230, 230, 240
+@ rules
+&δ=αῳ
+# In order to find discontiguous contractions for αῳ
+# there must exist a mapping for αω, regardless of what it maps to.
+# (This follows from the UCA spec.)
+&ε=αω
+* compare
+<1 δ
+= αῳ
+= αω\u0345
+<2 αω\u0313\u0300\u0345 # discontiguous contraction
+= αὠ\u0300\u0345
+= αὢ\u0345
+= αᾢ
+<2 αω\u0300\u0313\u0345
+= αὼ\u0313\u0345
+= αῲ\u0313 # not FCD
+<1 ε
+= αω
+
+# Double-check that without the extra mapping there will be no discontiguous match.
+@ rules
+&δ=αῳ
+* compare
+<1 αω\u0313\u0300\u0345 # no discontiguous contraction
+= αὠ\u0300\u0345
+= αὢ\u0345
+= αᾢ
+<2 αω\u0300\u0313\u0345
+= αὼ\u0313\u0345
+= αῲ\u0313 # not FCD
+<1 δ
+= αῳ
+= αω\u0345
+
+# Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232.
+# Tests code paths where the tailored string has a combining mark
+# that does not occur in any composite's decomposition.
+@ rules
+&δ=αὼ\u0315
+* compare
+<1 αω\u0313\u0300\u0315 # Not tailored: The grave accent blocks the comma above.
+= αὠ\u0300\u0315
+= αὢ\u0315
+<1 δ
+= αὼ\u0315
+= αω\u0300\u0315
+<2 αω\u0300\u0315\u0345
+= αὼ\u0315\u0345
+= αῲ\u0315 # not FCD
+
+** test: danish a+a vs. a-umlaut, ICU ticket 9319
+@ rules
+&z<aa
+* compare
+<1 z
+<1 aa
+<2 aa\u0308
+= aä
+
+** test: Jamo L with and in prefix
+# Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L).
+@ rules
+# Jamo Lead consonant G after G or GG
+&[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100
+# Jamo Lead consonant GG sorts like G+G
+&\u1100\u1100=\u1101
+# Note: Making G|GG and GG|GG sort the same as G|G+G
+# would require the ability to reset on G|G+G,
+# or we could make G-after-G equal to some secondary-CE character,
+# and reset on a pair of those.
+# (It does not matter much if there are at most two G in a row in real text.)
+* compare
+<1 \u1100
+<2 \u1100\u1100 # only one primary from a sequence of G lead consonants
+= \u1101
+<2 \u1100\u1100\u1100
+= \u1101\u1100
+# but not = \u1100\u1101, see above
+<1 \u1100\u1161
+= \uAC00
+<2 \u1100\u1100\u1161
+= \u1100\uAC00 # prefix match from the L of the LV syllable
+= \u1101\u1161
+= \uAE4C
+
+** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546
+@ rules
+# Low secondary CEs for Jamo V & T.
+# Note: T should sort before V for proper syllable order.
+&\u0332 # COMBINING LOW LINE (first primary ignorable)
+<<\u1161<<\u1162
+
+# Korean Jamo lead consonant search rules, part 2:
+# Make modern compound L jamo primary equivalent to non-compound forms.
+
+# Secondary CEs for Jamo L-after-L, greater than Jamo V & T.
+&\u0313 # COMBINING COMMA ABOVE (second primary ignorable)
+=\u1100|\u1100
+=\u1103|\u1103
+=\u1107|\u1107
+=\u1109|\u1109
+=\u110C|\u110C
+
+# Compound L Jamo map to equivalent expansions of primary+secondary CE.
+&\u1100\u0313=\u1101<<<\u3132 # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK
+&\u1103\u0313=\u1104<<<\u3138 # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT
+&\u1107\u0313=\u1108<<<\u3143 # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP
+&\u1109\u0313=\u110A<<<\u3146 # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS
+&\u110C\u0313=\u110D<<<\u3149 # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC
+
+* compare
+<1 \u1100\u1161
+= \uAC00
+<2 \u1100\u1162
+= \uAC1C
+<2 \u1100\u1100\u1161
+= \u1100\uAC00
+= \u1101\u1161
+= \uAE4C
+<3 \u3132\u1161
+
+** test: Hangul syllables in prefix & in the interior of a contraction
+@ rules
+&x=\u1100\u1161|a\u1102\u1162z
+* compare
+<1 \u1100\u1161x
+= \u1100\u1161a\u1102\u1162z
+= \u1100\u1161a\uB0B4z
+= \uAC00a\u1102\u1162z
+= \uAC00a\uB0B4z
+
+** test: digits are unsafe-backwards when numeric=on
+@ root
+% numeric=on
+* compare
+# If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a".
+# We need to back up before the identical prefix "1" and compare the full numbers.
+<1 11b
+<1 101a
+
+** test: simple locale data test
+@ locale de
+* compare
+<1 a
+<2 ä
+<1 ae
+<2 æ
+
+@ locale de-u-co-phonebk
+* compare
+<1 a
+<1 ae
+<2 ä
+<2 æ
+
+# The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt.
+
+** test: DataDrivenCollationTest/TestMorePinyin
+# Testing the primary strength.
+@ locale zh
+% strength=primary
+* compare
+< lā
+= lĀ
+= Lā
+= LĀ
+< lān
+= lĀn
+< lē
+= lĒ
+= Lē
+= LĒ
+< lēn
+= lĒn
+
+** test: DataDrivenCollationTest/TestLithuanian
+# Lithuanian sort order.
+@ locale lt
+* compare
+< cz
+< č
+< d
+< iz
+< j
+< sz
+< š
+< t
+< zz
+< ž
+
+** test: DataDrivenCollationTest/TestLatvian
+# Latvian sort order.
+@ locale lv
+* compare
+< cz
+< č
+< d
+< gz
+< ģ
+< h
+< iz
+< j
+< kz
+< ķ
+< l
+< lz
+< ļ
+< m
+< nz
+< ņ
+< o
+< rz
+< ŗ
+< s
+< sz
+< š
+< t
+< zz
+< ž
+
+** test: DataDrivenCollationTest/TestEstonian
+# Estonian sort order.
+@ locale et
+* compare
+< sy
+< š
+< šy
+< z
+< zy
+< ž
+< v
+< va
+< w
+< õ
+< õy
+< ä
+< äy
+< ö
+< öy
+< ü
+< üy
+< x
+
+** test: DataDrivenCollationTest/TestAlbanian
+# Albanian sort order.
+@ locale sq
+* compare
+< cz
+< ç
+< d
+< dz
+< dh
+< e
+< ez
+< ë
+< f
+< gz
+< gj
+< h
+< lz
+< ll
+< m
+< nz
+< nj
+< o
+< rz
+< rr
+< s
+< sz
+< sh
+< t
+< tz
+< th
+< u
+< xz
+< xh
+< y
+< zz
+< zh
+
+** test: DataDrivenCollationTest/TestSimplifiedChineseOrder
+# Sorted file has different order.
+@ root
+# normalization=on turned on & off automatically.
+* compare
+< \u5F20
+< \u5F20\u4E00\u8E3F
+
+** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash
+# This pretty much crashes.
+@ root
+* compare
+< \u0f71\u0f72\u0f80\u0f71\u0f72
+< \u0f80
+
+** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems
+# These are examples of strings that caused trouble in partial sort key testing.
+@ locale th-TH
+* compare
+< \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C
+< \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18
+* compare
+< \u0E01\u0E07\u0E01\u0E32\u0E23
+< \u0E01\u0E07\u0E42\u0E01\u0E49
+* compare
+< \u0E01\u0E23\u0E19\u0E17\u0E32
+< \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32
+* compare
+< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27
+< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27
+* compare
+< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D
+< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32
+
+** test: DataDrivenCollationTest/TestJavaStyleRule
+# java.text allows rules to start as '<<<x<<<y...'
+# we emulate this by assuming a &[first tertiary ignorable] in this case.
+@ rules
+&\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b
+* compare
+= a
+= equal
+< z
+< x
+= b # x had become the new first primary ignorable
+< w
+
+** test: DataDrivenCollationTest/TestShiftedIgnorable
+# The UCA states that primary ignorables should be completely
+# ignorable when following a shifted code point.
+@ root
+% alternate=shifted
+% strength=quaternary
+* compare
+< a\u0020b
+= a\u0020\u0300b
+= a\u0020\u0301b
+< a_b
+= a_\u0300b
+= a_\u0301b
+< A\u0020b
+= A\u0020\u0300b
+= A\u0020\u0301b
+< A_b
+= A_\u0300b
+= A_\u0301b
+< a\u0301b
+< A\u0301b
+< a\u0300b
+< A\u0300b
+
+** test: DataDrivenCollationTest/TestNShiftedIgnorable
+# Counterpart to TestShiftedIgnorable: with alternate=non-ignorable nothing is shifted,
+# so primary ignorables following a space or underscore still make a difference.
+@ root
+% alternate=non-ignorable
+% strength=tertiary
+* compare
+< a\u0020b
+< A\u0020b
+< a\u0020\u0301b
+< A\u0020\u0301b
+< a\u0020\u0300b
+< A\u0020\u0300b
+< a_b
+< A_b
+< a_\u0301b
+< A_\u0301b
+< a_\u0300b
+< A_\u0300b
+< a\u0301b
+< A\u0301b
+< a\u0300b
+< A\u0300b
+
+** test: DataDrivenCollationTest/TestSafeSurrogates
+# It turned out that surrogates were not skipped properly
+# when iterating backwards if they were in the middle of a
+# contraction. This test ensures that this is fixed.
+@ rules
+&a < x\ud800\udc00b
+* compare
+< a
+< x\ud800\udc00b
+
+** test: DataDrivenCollationTest/da_TestPrimary
+# This test goes through primary strength cases
+@ locale da
+% strength=primary
+* compare
+< Lvi
+< Lwi
+* compare
+< L\u00e4vi
+< L\u00f6wi
+* compare
+< L\u00fcbeck
+= Lybeck
+
+** test: DataDrivenCollationTest/da_TestTertiary
+# This test goes through tertiary strength cases
+@ locale da
+% strength=tertiary
+* compare
+< Luc
+< luck
+* compare
+< luck
+< L\u00fcbeck
+* compare
+< lybeck
+< L\u00fcbeck
+* compare
+< L\u00e4vi
+< L\u00f6we
+* compare
+< L\u00f6ww
+< mast
+
+* compare
+< A/S
+< ANDRE
+< ANDR\u00c9
+< ANDREAS
+< AS
+< CA
+< \u00c7A
+< CB
+< \u00c7C
+< D.S.B.
+< DA
+< \u00d0A
+< DB
+< \u00d0C
+< DSB
+< DSC
+< EKSTRA_ARBEJDE
+< EKSTRABUD0
+< H\u00d8ST
+< HAAG
+< H\u00c5NDBOG
+< HAANDV\u00c6RKSBANKEN
+< Karl
+< karl
+< NIELS\u0020J\u00d8RGEN
+< NIELS-J\u00d8RGEN
+< NIELSEN
+< R\u00c9E,\u0020A
+< REE,\u0020B
+< R\u00c9E,\u0020L
+< REE,\u0020V
+< SCHYTT,\u0020B
+< SCHYTT,\u0020H
+< SCH\u00dcTT,\u0020H
+< SCHYTT,\u0020L
+< SCH\u00dcTT,\u0020M
+< SS
+< \u00df
+< SSA
+< STORE\u0020VILDMOSE
+< STOREK\u00c6R0
+< STORM\u0020PETERSEN
+< STORMLY
+< THORVALD
+< THORVARDUR
+< \u00feORVAR\u00d0UR
+< THYGESEN
+< VESTERG\u00c5RD,\u0020A
+< VESTERGAARD,\u0020A
+< VESTERG\u00c5RD,\u0020B
+< \u00c6BLE
+< \u00c4BLE
+< \u00d8BERG
+< \u00d6BERG
+
+* compare
+< andere
+< chaque
+< chemin
+< cote
+< cot\u00e9
+< c\u00f4te
+< c\u00f4t\u00e9
+< \u010du\u010d\u0113t
+< Czech
+< hi\u0161a
+< irdisch
+< lie
+< lire
+< llama
+< l\u00f5ug
+< l\u00f2za
+< lu\u010d
+< luck
+< L\u00fcbeck
+< lye
+< l\u00e4vi
+< L\u00f6wen
+< m\u00e0\u0161ta
+< m\u00eer
+< myndig
+< M\u00e4nner
+< m\u00f6chten
+< pi\u00f1a
+< pint
+< pylon
+< \u0161\u00e0ran
+< savoir
+< \u0160erb\u016bra
+< Sietla
+< \u015blub
+< subtle
+< symbol
+< s\u00e4mtlich
+< verkehrt
+< vox
+< v\u00e4ga
+< waffle
+< wood
+< yen
+< yuan
+< yucca
+< \u017eal
+< \u017eena
+< \u017den\u0113va
+< zoo0
+< Zviedrija
+< Z\u00fcrich
+< zysk0
+< \u00e4ndere
+
+** test: DataDrivenCollationTest/hi_TestNewRules
+# This test goes through new rules and tests against old rules
+@ locale hi
+* compare
+< कॐ
+< कं
+< कँ
+< कः
+
+** test: DataDrivenCollationTest/ro_TestNewRules
+# This test goes through new rules and tests against old rules
+@ locale ro
+* compare
+< xAx
+< xă
+< xĂ
+< Xă
+< XĂ
+< xăx
+< xĂx
+< xâ
+< xÂ
+< Xâ
+< XÂ
+< xâx
+< xÂx
+< xb
+< xIx
+< xî
+< xÎ
+< Xî
+< XÎ
+< xîx
+< xÎx
+< xj
+< xSx
+< xș
+= xş
+< xȘ
+= xŞ
+< Xș
+= Xş
+< XȘ
+= XŞ
+< xșx
+= xşx
+< xȘx
+= xŞx
+< xT
+< xTx
+< xț
+= xţ
+< xȚ
+= xŢ
+< Xț
+= Xţ
+< XȚ
+= XŢ
+< xțx
+= xţx
+< xȚx
+= xŢx
+< xU
+
+** test: DataDrivenCollationTest/testOffsets
+# This tests cases where forwards and backwards iteration get different offsets
+@ locale en
+% strength=tertiary
+* compare
+< a\uD800\uDC00\uDC00
+< b\uD800\uDC00\uDC00
+* compare
+< \u0301A\u0301\u0301
+< \u0301B\u0301\u0301
+* compare
+< abcd\r\u0301
+< abce\r\u0301
+# TODO: test offsets in new CollationTest
+
+# End of test cases moved here from ICU 52's DataDrivenCollationTest.txt.
+
+** test: was ICU 52 cmsccoll/TestRedundantRules
+@ rules
+& a < b < c < d& [before 1] c < m
+* compare
+<1 a
+<1 b
+<1 m
+<1 c
+<1 d
+
+@ rules
+& a < b <<< c << d <<< e& [before 3] e <<< x
+* compare
+<1 a
+<1 b
+<3 c
+<2 d
+<3 x
+<3 e
+
+@ rules
+& a < b <<< c << d <<< e <<< f < g& [before 1] g < x
+* compare
+<1 a
+<1 b
+<3 c
+<2 d
+<3 e
+<3 f
+<1 x
+<1 g
+
+@ rules
+& a <<< b << c < d& a < m
+* compare
+<1 a
+<3 b
+<2 c
+<1 m
+<1 d
+
+@ rules
+&a<b<<b\u0301 &z<b
+* compare
+<1 a
+<1 b\u0301
+<1 z
+<1 b
+
+@ rules
+&z<m<<<q<<<m
+* compare
+<1 z
+<1 q
+<3 m
+
+@ rules
+&z<<<m<q<<<m
+* compare
+<1 z
+<1 q
+<3 m
+
+@ rules
+& a < b < c < d& r < c
+* compare
+<1 a
+<1 b
+<1 d
+<1 r
+<1 c
+
+@ rules
+& a < b < c < d& c < m
+* compare
+<1 a
+<1 b
+<1 c
+<1 m
+<1 d
+
+@ rules
+& a < b < c < d& a < m
+* compare
+<1 a
+<1 m
+<1 b
+<1 c
+<1 d
+
+** test: was ICU 52 cmsccoll/TestExpansionSyntax
+# The following two rules should sort the particular list of strings the same.
+@ rules
+&AE <<< a << b <<< c &d <<< f
+* compare
+<1 AE
+<3 a
+<2 b
+<3 c
+<1 d
+<3 f
+
+@ rules
+&A <<< a / E << b / E <<< c /E &d <<< f
+* compare
+<1 AE
+<3 a
+<2 b
+<3 c
+<1 d
+<3 f
+
+# The following two rules should sort the particular list of strings the same.
+@ rules
+&AE <<< a <<< b << c << d < e < f <<< g
+* compare
+<1 AE
+<3 a
+<3 b
+<2 c
+<2 d
+<1 e
+<1 f
+<3 g
+
+@ rules
+&A <<< a / E <<< b / E << c / E << d / E < e < f <<< g
+* compare
+<1 AE
+<3 a
+<3 b
+<2 c
+<2 d
+<1 e
+<1 f
+<3 g
+
+# The following two rules should sort the particular list of strings the same.
+@ rules
+&AE <<< B <<< C / D <<< F
+* compare
+<1 AE
+<3 B
+<3 F
+<1 AED
+<3 C
+
+@ rules
+&A <<< B / E <<< C / ED <<< F / E
+* compare
+<1 AE
+<3 B
+<3 F
+<1 AED
+<3 C
+
+** test: never reorder trailing primaries
+@ root
+% reorder Zzzz Grek
+* compare
+<1 L
+<1 字
+<1 Ω
+<1 \uFFFD
+<1 \uFFFF
+
+** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes
+@ rules
+&u=ab|cd
+&v=b|ce
+* compare
+<1 abc
+<1 abcc
+<1 abcf
+<1 abcd
+= abu
+<1 abce
+= abv
+
+# With the following rules, there is only one prefix per composite ĉ or ç,
+# but both prefixes apply to just c in NFD form.
+# We would get different results for composed vs. NFD input
+# if we fell back directly from longest-prefix mappings to no-prefix mappings.
+@ rules
+&x=op|ĉ
+&y=p|ç
+* compare
+<1 opc
+<2 opć
+<1 opcz
+<1 opd
+<1 opĉ
+= opc\u0302
+= opx
+<1 opç
+= opc\u0327
+= opy
+
+# The mapping that is used is the one with the longest matching prefix for which
+# there is also a suffix match; among several suffix matches for that prefix, the longest one wins.
+@ rules
+&❶=d
+&❷=de
+&❸=def
+&①=c|d
+&②=c|de
+&③=c|def
+&④=bc|d
+&⑤=bc|de
+&⑥=bc|def
+&⑦=abc|d
+&⑧=abc|de
+&⑨=abc|def
+* compare
+<1 9aadzz
+= 9aa❶zz
+<1 9aadez
+= 9aa❷z
+<1 9aadef
+= 9aa❸
+<1 9acdzz
+= 9ac①zz
+<1 9acdez
+= 9ac②z
+<1 9acdef
+= 9ac③
+<1 9bcdzz
+= 9bc④zz
+<1 9bcdez
+= 9bc⑤z
+<1 9bcdef
+= 9bc⑥
+<1 abcdzz
+= abc⑦zz
+<1 abcdez
+= abc⑧z
+<1 abcdef
+= abc⑨
+
+** test: prefix + discontiguous contraction with missing prefix contraction
+# Unfortunate terminology: The first "prefix" here is the pre-context,
+# the second "prefix" refers to the contraction/relation string that is
+# one shorter than the one being tested.
+@ rules
+&x=p|e
+&y=p|ê
+&z=op|ê
+# No mapping for op|e:
+# Discontiguous contraction matching should not match op|ê in opệ
+# because it would have to skip the dot below and extend a match on op|e by the circumflex,
+# but there is no match on op|e.
+* compare
+<1 oPe
+<1 ope
+= opx
+<1 opệ
+= opy\u0323 # y not z
+<1 opê
+= opz
+
+# We cannot test for fallback by whether the contraction default CE32
+# is for another contraction. With the following rules, there is no mapping for op|e,
+# and the fallback to prefix p has no contractions.
+@ rules
+&x=p|e
+&z=op|ê
+* compare
+<1 oPe
+<1 ope
+= opx
+<2 opệ
+= opx\u0323\u0302 # x not z
+<1 opê
+= opz
+
+# One more variation: Fallback to the simple code point, no shorter non-empty prefix.
+@ rules
+&x=e
+&z=op|ê
+* compare
+<1 ope
+= opx
+<3 oPe
+= oPx
+<2 opệ
+= opx\u0323\u0302 # x not z
+<1 opê
+= opz
+
+** test: maxVariable via rules
+@ rules
+[maxVariable space][alternate shifted]
+* compare
+= \u0020
+= \u000A
+<1 .
+<1 ° # degree sign
+<1 $
+<1 0
+
+** test: maxVariable via setting
+@ root
+% maxVariable=currency
+% alternate=shifted
+* compare
+= \u0020
+= \u000A
+= .
+= ° # degree sign
+= $
+<1 0
+
+** test: ICU4J CollationMiscTest/TestContractionClosure (ää)
+# This tests canonical closure, but it also tests that CollationFastLatin
+# bails out properly for contractions with combining marks.
+# For that we need pairs of strings that remain in the Latin fastpath
+# long enough, hence the extra "= b" lines.
+@ rules
+&b=\u00e4\u00e4
+* compare
+<1 b
+= \u00e4\u00e4
+= b
+= a\u0308a\u0308
+= b
+= \u00e4a\u0308
+= b
+= a\u0308\u00e4
+
+** test: ICU4J CollationMiscTest/TestContractionClosure (Å)
+@ rules
+&b=\u00C5
+* compare
+<1 b
+= \u00C5
+= b
+= A\u030A
+= b
+= \u212B
+
+** test: reset-before on already-tailored characters, ICU ticket 10108
+@ rules
+&a<w<<x &[before 2]x<<y
+* compare
+<1 a
+<1 w
+<2 y
+<2 x
+
+@ rules
+&a<<w<<<x &[before 2]x<<y
+* compare
+<1 a
+<2 y
+<2 w
+<3 x
+
+@ rules
+&a<w<x &[before 2]x<<y
+* compare
+<1 a
+<1 w
+<1 y
+<2 x
+
+@ rules
+&a<w<<<x &[before 2]x<<y
+* compare
+<1 a
+<1 y
+<2 w
+<3 x
+
+** test: numeric collation with other settings, ICU ticket 9092
+@ root
+% strength=identical
+% caseFirst=upper
+% numeric=on
+* compare
+<1 100\u0020a
+<1 101
+
+** test: collation type fallback from unsupported type, ICU ticket 10149
+@ locale fr-CA-u-co-phonebk
+# Expect the same result as with fr-CA, using backwards-secondary order.
+# That is, we should fall back from the unsupported collation type
+# to the locale's default collation type.
+* compare
+<1 cote
+<2 côte
+<2 coté
+<2 côté
+
+** test: @ is equivalent to [backwards 2], ICU ticket 9956
+@ rules
+&b<a @ &v<<w
+* compare
+<1 b
+<1 a
+<1 cote
+<2 côte
+<2 coté
+<2 côté
+<1 v
+<2 w
+<1 x
+
+** test: shifted+reordering, ICU ticket 9507
+@ root
+% reorder Grek punct space
+% alternate=shifted
+% strength=quaternary
+# Which primaries are "variable" should be determined without script reordering,
+# and then primaries should be reordered whether they are shifted to quaternary or not.
+* compare
+<4 ( # punctuation
+<4 )
+<4 \u0020 # space
+<1 ` # symbol
+<1 ^
+<1 $ # currency symbol
+<1 €
+<1 0 # numbers
+<1 ε # Greek
+<1 e # Latin
+<1 e(e
+<4 e)e
+<4 e\u0020e
+<4 ee
+<3 e(E
+<4 e)E
+<4 e\u0020E
+<4 eE
+
+** test: "uppercase first" could sort a string before its prefix, ICU ticket 9351
+@ rules
+&\u0001<<<b<<<B
+% caseFirst=upper
+* compare
+<1 aaa
+<3 aaaB
+
+** test: secondary+case ignores secondary ignorables, ICU ticket 9355
+@ rules
+&\u0001<<<b<<<B
+% strength=secondary
+% caseLevel=on
+* compare
+<1 a
+= ab
+= aB
+
+** test: custom collation rules involving tail of a contraction in Malayalam, ICU ticket 6328
+@ rules
+&[before 2] ൌ << ൗ # U+0D57 << U+0D4C == 0D46+0D57
+* compare
+<1 ൗx
+<2 ൌx
+<1 ൗy
+<2 ൌy
+
+** test: quoted apostrophe in compact syntax, ICU ticket 8204
+@ rules
+&q<<*a''c
+* compare
+<1 d
+<1 p
+<1 q
+<2 a
+<2 \u0027
+<2 c
+<1 r
+
+# ICU ticket #8260 "Support all collation-related keywords in Collator.getInstance()"
+** test: locale -u- with collation keywords, ICU ticket 8260
+@ locale de-u-kv-sPace-ka-shifTed-kn-kk-falsE-kf-Upper-kc-tRue-ks-leVel4
+* compare
+<4 \u0020 # space is shifted, strength=quaternary
+<1 ! # punctuation is regular
+<1 2
+<1 12 # numeric sorting
+<1 B
+<c b # uppercase first on case level
+<1 x\u0301\u0308
+<2 x\u0308\u0301 # normalization off
+
+** test: locale @ with collation keywords, ICU ticket 8260
+@ locale fr@colbAckwards=yes;ColStrength=Quaternary;kv=currencY;colalternate=shifted
+* compare
+<4 $ # currency symbols are shifted, strength=quaternary
+<1 àla
+<2 alà # backwards secondary level
+
+** test: locale -u- with script reordering, ICU ticket 8260
+@ locale el-u-kr-kana-SYMBOL-Grek-hani-cyrl-latn-digit-armn-deva-ethi-thai
+* compare
+<1 \u0020
+<1 あ
+<1 ☂
+<1 Ω
+<1 丂
+<1 ж
+<1 L
+<1 4
+<1 Ձ
+<1 अ
+<1 ሄ
+<1 ฉ
+
+** test: locale @collation=type should be case-insensitive
+@ locale de@coLLation=PhoneBook
+* compare
+<1 ae
+<2 ä
+<3 Ä
+
+** test: import root search rules plus German phonebook rules, ICU ticket 8962
+@ locale de-u-co-search
+* compare
+<1 =
+<1 ≠
+<1 a
+<1 ae
+<2 ä
+
+# Once more, but with runtime builder.
+@ rules
+[import und-u-co-search][import de-u-co-phonebk]
+* compare
+<1 =
+<1 ≠
+<1 a
+<1 ae
+<2 ä
+
+# Once again, with import from "root" not "und" (as in a proper language tag).
+@ rules
+[import root-u-co-search][import de-u-co-phonebk]
+* compare
+<1 =
+<1 ≠
+<1 a
+<1 ae
+<2 ä
+
+** test: import rules from a language with non-Latin native script, and reset the reordering, ICU ticket 10998
+# Greek should sort Greek first.
+@ rules
+[import el]
+* compare
+<1 4
+<1 Ω
+<1 L
+
+# Import Greek, and then reset the reordering.
+@ rules
+[import el][reorder Zzzz]
+* compare
+<1 4
+<1 L
+<1 Ω
+
+# "others" is a synonym for Zzzz.
+@ rules
+[import el][reorder others]
+* compare
+<1 4
+<1 L
+<1 Ω
+
+** test: regression test for CollationFastLatinBuilder, ICU ticket 11388
+@ rules
+&x<<aa<<<Aa<<<AA
+% strength=secondary
+* compare
+<1 AA
+<2 Aẩ
+<2 aą
+* compare
+<1 AA
+<2 aą
+
+** test: tailor tertiary-after a common tertiary where there is a lower one
+# Assume that Hiragana small A has a below-common tertiary, and Hiragana A has a common one.
+# See ICU ticket 11448 & CLDR ticket 7222.
+@ rules
+&あ<<<x<<<y<<<z
+* compare
+<1 ぁ
+<3 あ
+<3 x
+<3 y
+<3 z
+<3 ァ
+<1 い
+
+** test: tailor tertiary-after a below-common tertiary
+@ rules
+&ぁ<<<x<<<y<<<z
+* compare
+<1 ぁ
+<3 x
+<3 y
+<3 z
+<3 あ
+<3 ァ
+<1 い
+
+** test: tailor tertiary-before a common tertiary where there is a lower one
+@ rules
+&[before 3]あ<<<x<<<y<<<z
+* compare
+<1 ぁ
+<3 x
+<3 y
+<3 z
+<3 あ
+<3 ァ
+<1 い
+
+** test: tailor tertiary-before a below-common tertiary
+@ rules
+&[before 3]ぁ<<<x<<<y<<<z
+* compare
+<1 x
+<3 y
+<3 z
+<3 ぁ
+<3 あ
+<3 ァ
+<1 い
+
+** test: reorder single scripts not groups, ICU ticket 11449
+@ root
+% reorder Goth Latn
+* compare
+<1 4
+<1 𐌰 # Gothic
+<1 L
+<1 Ω
+# Before ICU 55, the following reordered together with Gothic.
+<1 𐌈 # Old Italic
+<1 𐑐 # Shavian
+
+# Check for presence of certain chars 乛冂刂卜又小彑艹日月爫牛辶 in
+# zh pinyin and stroke, ICU-13790
+# (bracket pinyin test with 卬..作, stroke test with 一..乾)
+
+** test: DataDrivenCollationTest/VerifyCertainCharsInPinyin
+@ locale zh-u-co-pinyin
+* compare
+< 卬
+< 卜
+< 艹
+< 辶
+< 刂
+< 彑
+< 冂
+< 牛
+< 日
+< 小
+< 乛
+< 又
+< 月
+< 爫
+< 作
+
+** test: DataDrivenCollationTest/VerifyCertainCharsInStroke
+@ locale zh-u-co-stroke
+* compare
+< 一
+< 乛
+< 冂
+< 刂
+< 卜
+< 又
+< 小
+< 彑
+< 艹
+< 日
+< 月
+< 爫
+< 牛
+< 辶
+< 乾
+