diff options
Diffstat (limited to 'strings/ctype-uca.h')
-rw-r--r-- | strings/ctype-uca.h | 187 |
1 files changed, 187 insertions, 0 deletions
diff --git a/strings/ctype-uca.h b/strings/ctype-uca.h new file mode 100644 index 00000000..dd84f92f --- /dev/null +++ b/strings/ctype-uca.h @@ -0,0 +1,187 @@ +#ifndef CTYPE_UCA_H +#define CTYPE_UCA_H +/* Copyright (c) 2021, MariaDB + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; version 2 + of the License. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, + MA 02110-1335 USA */ + +#define MY_UCA_VERSION_ID(x,y,z) ((uint) ((x) * 100 + (y) * 10 + (z))) + +/* + Implicit weight handling is done according to + the section "Computing Implicit Weights" in + https://unicode.org/reports/tr10/#Values_For_Base_Table + (as of Unicode 14.0.0) + + Implicit weights for a code CP are constructed as follows: + [.AAAA.0020.0002][.BBBB.0000.0000] + + - There are two primary weights, depending on the character type and block. + - There is one weight on the secondary and tertiary levels. + + AAAA and BBBB are computed using different formulas for: + - Siniform ideographic scripts + - Han + - Unassigned characters +*/ + +typedef struct my_uca_implict_weight_t +{ + uint16 weight[2]; +} MY_UCA_IMPLICIT_WEIGHT; + + +/* + By default, implicit weights for a code CP are constructed as follows: + [.AAAA.0020.0002][.BBBB.0000.0000] + + where AAAA and BBBB are : + AAAA= BASE + (CP >> 15); + BBBB= (CP & 0x7FFF) | 0x8000; + + This formula covers the following implicit weight subtypes: + - Core Han Unified Ideographs + - All other Han Unified Ideographs + - Unassigned characters + Every mentioned subtype passes a different BASE. + + This formula does not cover Siniform ideographic scripts. + They are handled by separate functions. +*/ +static inline MY_UCA_IMPLICIT_WEIGHT +my_uca_implicit_weight_primary_default(uint16 base, my_wc_t code) +{ + MY_UCA_IMPLICIT_WEIGHT res; + res.weight[0]= (uint16) ((code >> 15) + base); + res.weight[1]= (uint16) ((code & 0x7FFF)|0x8000); + return res; +} + + +/** + Calculate Unicode-5.2.0 implicit weight on the primary level. + + According to UCA, BASE is calculated as follows: + - FB40 for Unified_Ideograph=True AND + ((Block=CJK_Unified_Ideograph) OR + (Block=CJK_Compatibility_Ideographs)) + - FB80 for Unified_Ideograph=True AND NOT + ((Block=CJK_Unified_Ideograph) OR + (Block=CJK_Compatibility_Ideographs)) + - FBC0 for any other code point + + But for Unicode-5.2.0 and Unicode-4.0.0 we used + a simplified formula as implemented before. +*/ +static inline MY_UCA_IMPLICIT_WEIGHT +my_uca_520_implicit_weight_primary(my_wc_t code) +{ + uint16 base; + /* + 3400;<CJK Ideograph Extension A, First> + 4DB5;<CJK Ideograph Extension A, Last> + 4E00;<CJK Ideograph, First> + 9FA5;<CJK Ideograph, Last> + */ + if (code >= 0x3400 && code <= 0x4DB5) + base= 0xFB80; + else if (code >= 0x4E00 && code <= 0x9FA5) + base= 0xFB40; + else + base= 0xFBC0; + + return my_uca_implicit_weight_primary_default(base, code); +} + + +typedef enum my_cs_encoding_enum +{ + MY_CS_ENCODING_UTF8MB3= 0, + MY_CS_ENCODING_UTF8MB4= 1, + MY_CS_ENCODING_UCS2= 2, + MY_CS_ENCODING_UTF16= 3, + MY_CS_ENCODING_UTF32= 4, +} my_cs_encoding_t; + +#define MY_CS_ENCODING_LAST MY_CS_ENCODING_UTF32 + + +#include "ctype-uca1400.h" + + +static inline MY_UCA_IMPLICIT_WEIGHT +my_uca_implicit_weight_primary(uint version, my_wc_t code) +{ + return version >= 1400 ? + my_uca_1400_implicit_weight_primary(code) : + my_uca_520_implicit_weight_primary(code); +} + + +static inline MY_UCA_IMPLICIT_WEIGHT +my_uca_implicit_weight_secondary() +{ + MY_UCA_IMPLICIT_WEIGHT res; + res.weight[0]= 0x0020; + res.weight[1]= 0; + return res; +} + + +static inline MY_UCA_IMPLICIT_WEIGHT +my_uca_implicit_weight_tertiary() +{ + MY_UCA_IMPLICIT_WEIGHT res; + res.weight[0]= 0x0002; + res.weight[1]= 0; + return res; +} + + +static inline MY_UCA_IMPLICIT_WEIGHT +my_uca_implicit_weight_quaternary() +{ + MY_UCA_IMPLICIT_WEIGHT res; + res.weight[0]= 0x0001; + res.weight[1]= 0; + return res; +} + + +static inline MY_UCA_IMPLICIT_WEIGHT +my_uca_implicit_weight_on_level(uint version, my_wc_t code, uint level) +{ + switch (level) { + case 0: + return my_uca_implicit_weight_primary(version, code); + case 1: + return my_uca_implicit_weight_secondary(); + case 2: + return my_uca_implicit_weight_tertiary(); + default: + break; + } + return my_uca_implicit_weight_quaternary(); +} + +uint +my_ci_get_id_uca(CHARSET_INFO *cs, my_collation_id_type_t type); + + +LEX_CSTRING +my_ci_get_collation_name_uca(CHARSET_INFO *cs, my_collation_name_mode_t mode); + + +#endif /* CTYPE_UCA_H */ |