--source include/have_utf32.inc
--source include/have_utf8mb4.inc

--echo #
--echo # Start of 10.8 tests
--echo #

--echo #
--echo # MDEV-27009 Add UCA-14.0.0 collations
--echo #

SET NAMES utf8mb4 COLLATE utf8mb4_bin;

# NOTE(review): this include is expected to create the "allchars" table
# that is joined against and dropped further below - confirm in the include.
--source include/ctype_unicode_allchars.inc

--echo #
--echo # Load allkeys.txt from Unicode-14.0.0
--echo #

# The 14.0.0 file has three weight levels.
# Unlike 5.2.0, there are no optional extra fields after the character
# name like "; QQK". But let us still use the "@qq" variable for symmetry.
# Example input line:
#00A0  ; [*0209.0020.001B] # NO-BREAK SPACE
#
# Staging table, one row per input line, split on ";":
#   a - the text before the first ";" (code point or code point sequence)
#   b - the text after it, up to but not including the first "#"
#   c - the trailing part starting at "#"
# All three are trimmed. A third ";"-separated field, if any, lands in @qq
# and is ignored.

CREATE TABLE allkeys_txt (a TEXT, b TEXT, c TEXT) ENGINE=MyISAM;

LOAD DATA INFILE '../../std_data/unicode/allkeys1400.txt'
INTO TABLE allkeys_txt
FIELDS TERMINATED BY ';' (@a,@b,@qq)
SET
  a=TRIM(@a),
  b=TRIM(REGEXP_SUBSTR(@b,'^[^#]*')),
  c=TRIM(REGEXP_SUBSTR(@b, '#.*$'));

# Working table derived from the staging table:
#   str - the character(s) named by "a": every 4-digit hex code is padded
#         with "0000" and every 5-digit hex code with "000" to get 8 hex
#         digits, spaces and dashes are removed, and the result is unhexed
#         as utf32 and converted to utf8mb4 (binary collation)
#   ws  - hex of the weight string the server produces for that same
#         string under utf8mb4_uca1400_ai_ci
#   wd  - the weights as documented in the file: the first 4-character
#         weight of every [...] element of "b", concatenated, with
#         "0000" weights left out
#   c   - the trailing "#" comment of the input line
# Only rows whose "a" starts with a digit or an upper case letter are kept.

CREATE TABLE allkeys AS
SELECT
  a,
  CONVERT(
    CAST(
      UNHEX(
        regexp_replace(
          regexp_replace(
            regexp_replace(a,'(\\b[0-9A-Z]{4}\\b)','-0000\\1-'),
            '(\\b[0-9A-Z]{5}\\b)','-000\\1-'),
          '[ -]','')
      ) AS CHAR CHARACTER SET utf32
    ) USING utf8mb4
  ) COLLATE utf8mb4_bin AS str,
  HEX(
    WEIGHT_STRING(
      CONVERT(
        CAST(
          UNHEX(
            regexp_replace(
              regexp_replace(
                regexp_replace(a,'(\\b[0-9A-Z]{4}\\b)','-0000\\1-'),
                '(\\b[0-9A-Z]{5}\\b)','-000\\1-'),
              '[ -]','')
          ) AS CHAR CHARACTER SET utf32
        ) USING utf8mb4
      ) COLLATE utf8mb4_uca1400_ai_ci
    )
  ) as ws,
  REPLACE(
    REPLACE(
      REGEXP_REPLACE(b,'[[][.*](....)[.]....[.]....]','-\\1-'),
      '-0000-',''),
    '-','') AS wd,
  c
FROM allkeys_txt
WHERE a RLIKE '^[0-9A-Z]';

# Prefix index on str, the column used to join against allchars below.
ALTER TABLE allkeys ADD KEY(str(3));

--echo #
--echo # Test explicit weights for individual characters
--echo # U+FDFA is the only character that has a different weight than allkeys.txt
--echo #

# The (NOT LIKE '% %') part of the condition filters out contractions.
# First query: number of single code point entries and how many of them
# have a server weight (ws) that differs from the documented weight (wd).
# Second query: the differing entries themselves.
SELECT COUNT(*), SUM(ws<>wd) FROM allkeys WHERE a NOT LIKE '% %';
SELECT a, ws, wd FROM allkeys WHERE ws<>wd AND a NOT LIKE '% %';

--echo #
--echo # Test explicit weights for built-in contractions
--echo #

# Same two checks for entries whose "a" contains a space, i.e. entries
# made of more than one code point.
SELECT COUNT(*), SUM(ws<>wd) FROM allkeys WHERE a LIKE '% %';
SELECT a, ws, wd FROM allkeys WHERE ws<>wd AND a LIKE '% %';

--echo #
--echo # Test implicit weights.
--echo #

# The below CASE resembles the implicit weight formula according to
# https://unicode.org/reports/tr10/#Implicit_Weights
# as of Unicode-14.0.0
#
# The outer join combined with "allkeys.str IS NULL" keeps only the
# characters of allchars that have no entry in allkeys. For each of them
# ws is the weight string produced by the server and wd is the weight
# computed by the CASE expression. HAVING then keeps only the rows where
# the two differ, so every row printed is a mismatch.

SELECT
  HEX(code),
  HEX(WEIGHT_STRING(str COLLATE utf8mb4_uca1400_ai_ci)) AS ws,
  CASE
    /* Core Han Unified Ideograms */
    WHEN (code >= 0x4E00 AND code <= 0x9FFF)
      OR (code >= 0xFA0E AND code <= 0xFA0F)
      OR (code = 0xFA11)
      OR (code >= 0xFA13 AND code <= 0xFA14)
      OR (code = 0xFA1F)
      OR (code = 0xFA21)
      OR (code >= 0xFA23 AND code <= 0xFA24)
      OR (code >= 0xFA27 AND code <= 0xFA29)
    THEN
      CONCAT(LPAD(HEX(0xFB40 + (code >> 15)),4,'0'),
             LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
    /* All other Han Unified Ideographs */
    WHEN (code >= 0x3400 AND code <= 0x4DBF)
      OR (code >= 0x20000 AND code <= 0x2A6DF)
      OR (code >= 0x2A700 AND code <= 0x2B738)
      OR (code >= 0x2B740 AND code <= 0x2B81D)
      OR (code >= 0x2B820 AND code <= 0x2CEA1)
      OR (code >= 0x2CEB0 AND code <= 0x2EBE0)
      OR (code >= 0x30000 AND code <= 0x3134A)
    THEN
      CONCAT(LPAD(HEX(0xFB80 + (code >> 15)),4,'0'),
             LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
    /* Siniform: Tangut */
    WHEN (code >= 0x17000 AND code <= 0x187FF)
      OR (code >= 0x18800 AND code <= 0x18AFF)
      OR (code >= 0x18D00 AND code <= 0x18D7F)
    THEN
      CONCAT('FB00', LPAD(HEX(0x8000 | (code - 0x17000)),4,'0'))
    /* Siniform: Nushu */
    WHEN (code >= 0x1B170 AND code <= 0x1B2FF)
    THEN
      CONCAT('FB01', LPAD(HEX(0x8000 | (code - 0x1B170)),4,'0'))
    /* Siniform: Khitan */
    WHEN (code >= 0x18B00 AND code <= 0x18CFF)
    THEN
      CONCAT('FB02', LPAD(HEX(0x8000 | (code - 0x18B00)),4,'0'))
    /* Unassigned: Any other code point */
    ELSE
      CONCAT(LPAD(HEX(0xFBC0 + (code >> 15)),4,'0'),
             LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
  END AS wd
FROM allchars
LEFT OUTER JOIN allkeys USING (str)
WHERE allkeys.str IS NULL
HAVING ws<>wd
ORDER BY HEX(str);

# Cleanup of the tables used by this test.
DROP TABLE allkeys_txt;
DROP TABLE allkeys;
DROP TABLE allchars;

--echo #
--echo # End of 10.8 tests
--echo #