diff options
Diffstat (limited to 'strings/ctype-uca.inl')
-rw-r--r-- | strings/ctype-uca.inl | 1044 |
1 files changed, 1044 insertions, 0 deletions
diff --git a/strings/ctype-uca.inl b/strings/ctype-uca.inl new file mode 100644 index 00000000..6aa72847 --- /dev/null +++ b/strings/ctype-uca.inl @@ -0,0 +1,1044 @@ +/* + Copyright (c) 2018, 2020, MariaDB Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "ctype-mb.h" + +#ifndef MY_FUNCTION_NAME +#error MY_FUNCTION_NAME is not defined +#endif +#ifndef MY_MB_WC +#error MY_MB_WC is not defined +#endif +#ifndef MY_LIKE_RANGE +#error MY_LIKE_RANGE is not defined +#endif +#ifndef MY_UCA_ASCII_OPTIMIZE +#error MY_ASCII_OPTIMIZE is not defined +#endif +#ifndef MY_UCA_COMPILE_CONTRACTIONS +#error MY_UCA_COMPILE_CONTRACTIONS is not defined +#endif +#ifndef MY_UCA_COLL_INIT +#error MY_UCA_COLL_INIT is not defined +#endif + + +#include "ctype-uca-scanner_next.inl" +#define SCANNER_NEXT_NCHARS +#include "ctype-uca-scanner_next.inl" + +/* + Compares two strings according to the collation + + SYNOPSIS: + strnncoll_onelevel() + cs Character set information + level Weight level (0 primary, 1 secondary, 2 tertiary, etc) + s First string + slen First string length + t Second string + tlen Second string length + level DUCET weight level + + NOTES: + Initializes two weight scanners and gets weights + corresponding to two strings in a loop. If weights are not + the same at some step then returns their difference. 
+ + In the while() comparison these situations are possible: + 1. (s_res>0) and (t_res>0) and (s_res == t_res) + Weights are the same so far, continue comparison + 2. (s_res>0) and (t_res>0) and (s_res!=t_res) + A difference has been found, return. + 3. (s_res>0) and (t_res<0) + We have reached the end of the second string, or found + an illegal multibyte sequence in the second string. + Return a positive number, i.e. the first string is bigger. + 4. (s_res<0) and (t_res>0) + We have reached the end of the first string, or found + an illegal multibyte sequence in the first string. + Return a negative number, i.e. the second string is bigger. + 5. (s_res<0) and (t_res<0) + Both scanners returned -1. It means we have riched + the end-of-string of illegal-sequence in both strings + at the same time. Return 0, strings are equal. + + RETURN + Difference between two strings, according to the collation: + 0 - means strings are equal + negative number - means the first string is smaller + positive number - means the first string is bigger +*/ + +static int +MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs, + const MY_UCA_WEIGHT_LEVEL *level, + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + my_bool t_is_prefix) +{ + my_uca_scanner sscanner; + my_uca_scanner tscanner; + my_uca_scanner_param param; + int s_res; + int t_res; + +#if MY_UCA_ASCII_OPTIMIZE +{ + size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster, + s, slen, t, tlen); + s+= prefix, slen-= prefix; + t+= prefix, tlen-= prefix; +} +#endif + + my_uca_scanner_param_init(¶m, cs, level); + my_uca_scanner_init_any(&sscanner, s, slen); + my_uca_scanner_init_any(&tscanner, t, tlen); + + do + { + s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner, ¶m); + t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner, ¶m); + } while ( s_res == t_res && s_res >0); + + return (t_is_prefix && t_res < 0) ? 0 : (s_res - t_res); +} + + +/* + One-level, PAD SPACE. 
+*/ +static int +MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + my_bool t_is_prefix) +{ + return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0], + s, slen, t, tlen, t_is_prefix); +} + + +/* + Multi-level, PAD SPACE. +*/ +static int +MY_FUNCTION_NAME(strnncoll_multilevel)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + my_bool t_is_prefix) +{ + uint i, level_flags= cs->levels_for_order; + for (i= 0; level_flags; i++, level_flags>>= 1) + { + int ret; + if (!(level_flags & 1)) + continue; + ret= MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[i], + s, slen, t, tlen, + t_is_prefix); + if (ret) + return ret; + } + return 0; +} + + +/* + Compares two strings according to the collation, + ignoring trailing spaces. + + SYNOPSIS: + strnncollsp_onelevel() + cs Character set information + level UCA weight level + s First string + slen First string length + t Second string + tlen Seconf string length + level DUCETweight level + + NOTES: + Works exactly the same with my_strnncoll_uca(), + but ignores trailing spaces. + + In the while() comparison these situations are possible: + 1. (s_res>0) and (t_res>0) and (s_res == t_res) + Weights are the same so far, continue comparison + 2. (s_res>0) and (t_res>0) and (s_res!=t_res) + A difference has been found, return. + 3. (s_res>0) and (t_res<0) + We have reached the end of the second string, or found + an illegal multibyte sequence in the second string. + Compare the first string to an infinite array of + space characters until difference is found, or until + the end of the first string. + 4. (s_res<0) and (t_res>0) + We have reached the end of the first string, or found + an illegal multibyte sequence in the first string. + Compare the second string to an infinite array of + space characters until difference is found or until + the end of the second steing. + 5. (s_res<0) and (t_res<0) + Both scanners returned -1. 
It means we have riched + the end-of-string of illegal-sequence in both strings + at the same time. Return 0, strings are equal. + + RETURN + Difference between two strings, according to the collation: + 0 - means strings are equal + negative number - means the first string is smaller + positive number - means the first string is bigger +*/ + +static int +MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs, + const MY_UCA_WEIGHT_LEVEL *level, + const uchar *s, size_t slen, + const uchar *t, size_t tlen) +{ + my_uca_scanner sscanner, tscanner; + my_uca_scanner_param param; + int s_res, t_res; + +#if MY_UCA_ASCII_OPTIMIZE +{ + size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster, + s, slen, t, tlen); + s+= prefix, slen-= prefix; + t+= prefix, tlen-= prefix; +} +#endif + + my_uca_scanner_param_init(¶m, cs, level); + my_uca_scanner_init_any(&sscanner, s, slen); + my_uca_scanner_init_any(&tscanner, t, tlen); + + do + { + s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner, ¶m); + t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner, ¶m); + } while ( s_res == t_res && s_res >0); + + if (s_res > 0 && t_res < 0) + { + /* Calculate weight for SPACE character */ + t_res= my_space_weight(level); + + /* compare the first string to spaces */ + do + { + if (s_res != t_res) + return (s_res - t_res); + s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner, ¶m); + } while (s_res > 0); + return 0; + } + + if (s_res < 0 && t_res > 0) + { + /* Calculate weight for SPACE character */ + s_res= my_space_weight(level); + + /* compare the second string to spaces */ + do + { + if (s_res != t_res) + return (s_res - t_res); + t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner, ¶m); + } while (t_res > 0); + return 0; + } + + return ( s_res - t_res ); +} + + +/* + One-level, PAD SPACE +*/ +static int +MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen) +{ + return MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[0], + s, 
slen, t, tlen); +} + + +/* + One-level, NO PAD +*/ +static int +MY_FUNCTION_NAME(strnncollsp_nopad)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen) +{ + return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0], + s, slen, t, tlen, FALSE); +} + + +/* + Multi-level, PAD SPACE +*/ +static int +MY_FUNCTION_NAME(strnncollsp_multilevel)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen) +{ + uint i, level_flags= cs->levels_for_order; + for (i= 0; level_flags; i++, level_flags>>= 1) + { + int ret; + if (!(level_flags & 1)) + continue; + ret= MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[i], + s, slen, t, tlen); + if (ret) + return ret; + } + return 0; +} + + +/* + Multi-level, NO PAD +*/ +static int +MY_FUNCTION_NAME(strnncollsp_nopad_multilevel)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen) +{ + uint i, level_flags; + int ret; + + /* Compare only the primary level using NO PAD */ + if ((ret= MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0], + s, slen, t, tlen, FALSE))) + return ret; + + /* + Compare the other levels using PAD SPACE. + These are Unicode-14.0.0 DUCTET weights: + +0020 ; [*0209.0020.0002] # SPACE + +0035 ; [.2070.0020.0002] # DIGIT FIVE +248C ; [.2070.0020.0004][*0281.0020.0004] # DIGIT FIVE FULL STOP + +0041 ; [.2075.0020.0008] # LATIN CAPITAL LETTER A +0061 ; [.2075.0020.0002] # LATIN SMALL LETTER A +00C1 ; [.2075.0020.0008][.0000.0024.0002] # LATIN CAPITAL LETTER A WITH ACUTE +00E1 ; [.2075.0020.0002][.0000.0024.0002] # LATIN SMALL LETTER A WITH ACUTE + + Examples demonstrating that it's important to use PAD SPACE + on the tertiary level: + + The third level weights for "SMALL LETTER A" + - U+0061 produces one weight 0002 + - U+00E1 produces two weights 0002+0002 + For _ai_cs collations these two letters must be equal. + Therefore, the difference in trailing 0002 should be ignored. 
+ + The third level weights for "CAPITAL LETTER A" + - U+0041 produces one weight 0008 + - U+00C1 produces two weights 0008+0002 + For _ai_cs collations these two letters must be equal. + Therefore, the difference in trailing 0002 should be ignored. + + Examples demonstrating that it's important to use PAD SPACE + on the secondary level: + + When we implement variable shifted alternative weighting collations, + U+0035 will be equal to U+248C on the primary level in these collations. + The second level weights for "DIGIT FIVE" are: + - U+0035 produces one weight 0020 + - U+248C produces two weights 0020+0020. + The difference for these two characters must be found only + on the tertiary level. Therefore, the trailing 0020 should be ignored. + */ + + for (i= 1, level_flags= cs->levels_for_order >> 1; + level_flags; + i++, level_flags>>= 1) + { + if (!(level_flags & 1)) + continue; + ret= MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[i], + s, slen, t, tlen); + if (ret) + return ret; + } + return 0; +} + + +/* + Scan the next weight and perform space padding + or trimming according to "nchars". +*/ +static inline weight_and_nchars_t +MY_FUNCTION_NAME(scanner_next_pad_trim)(my_uca_scanner *scanner, + my_uca_scanner_param *param, + size_t nchars, + uint flags, + uint *generated) +{ + weight_and_nchars_t res; + if (nchars > 0 || + scanner->wbeg[0] /* Some weights from a previous expansion left */) + { + if ((res= MY_FUNCTION_NAME(scanner_next_with_nchars)(scanner, param, + nchars)).weight < 0) + { + /* + We reached the end of the string, but the caller wants more weights. + Perform space padding. + */ + res.weight= + flags & MY_STRNNCOLLSP_NCHARS_EMULATE_TRIMMED_TRAILING_SPACES ? + my_space_weight(param->level) : 0; + + res.nchars= 1; + (*generated)++; + } + else if (res.nchars > nchars) + { + /* + We scanned the next collation element, but it does not fit into + the "nchars" limit. This is possible in case of: + - A contraction, e.g. 
Czech 'ch' with nchars=1 + - A sequence of ignorable characters followed by non-ignorable ones, + e.g. CONCAT(x'00','a') with nchars=1. + Perform trimming. + */ + res.weight= param->cs->state & MY_CS_NOPAD ? + 0 : my_space_weight(param->level); + res.nchars= (uint) nchars; + (*generated)++; + } + } + else + { + /* The caller wants nchars==0. Perform trimming. */ + res.weight= param->cs->state & MY_CS_NOPAD ? + 0 : my_space_weight(param->level); + res.nchars= 0; + (*generated)++; + } + return res; +} + + +static int +MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(CHARSET_INFO *cs, + const MY_UCA_WEIGHT_LEVEL *level, + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + size_t nchars, + uint flags) +{ + my_uca_scanner sscanner; + my_uca_scanner tscanner; + my_uca_scanner_param param; + size_t s_nchars_left= nchars; + size_t t_nchars_left= nchars; + +/* +TODO: strnncollsp_nchars_onelevel +#if MY_UCA_ASCII_OPTIMIZE +{ + size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster, + s, slen, t, tlen); + s+= prefix, slen-= prefix; + t+= prefix, tlen-= prefix; +} +#endif +*/ + + my_uca_scanner_param_init(¶m, cs, level); + my_uca_scanner_init_any(&sscanner, s, slen); + my_uca_scanner_init_any(&tscanner, t, tlen); + + for ( ; ; ) + { + weight_and_nchars_t s_res; + weight_and_nchars_t t_res; + uint generated= 0; + int diff; + + s_res= MY_FUNCTION_NAME(scanner_next_pad_trim)(&sscanner, ¶m, + s_nchars_left, + flags, &generated); + t_res= MY_FUNCTION_NAME(scanner_next_pad_trim)(&tscanner, ¶m, + t_nchars_left, + flags, &generated); + if ((diff= (s_res.weight - t_res.weight))) + return diff; + + if (generated == 2) + { + if ((cs->state & MY_CS_NOPAD) && + (flags & MY_STRNNCOLLSP_NCHARS_EMULATE_TRIMMED_TRAILING_SPACES)) + { + /* + Both values are auto-generated. There's no real data any more. + We need to handle the remaining virtual trailing spaces. + The two strings still have s_nchars_left and t_nchars_left imaginary + trailing spaces at the end. 
If s_nchars_left != t_nchars_left, + the strings will be not equal in case of a NOPAD collation. + + Example: + "B" is German "U+00DF LATIN SMALL LETTER SHARP S" + When we have these values in a + CHAR(3) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_nopad_ci + column: + 'B ' (one character, two trailing spaces) + 'ss ' (two characters, one trailing space) + The 'B ' is greater than the 'ss '. + They are compared in the following steps: + 1. 'B' == 'ss' + 2. ' ' == ' ' + 3. ' ' > '' + + We need to emulate the same behavior in this function even if + it's called with strings 'B' and 'ss' (with space trimmed). + The side which has more remaining virtual spaces at the end + is greater. + */ + if (s_nchars_left < t_nchars_left) + return -1; + if (s_nchars_left > t_nchars_left) + return +1; + } + return 0; + } + + DBUG_ASSERT(s_nchars_left >= s_res.nchars); + DBUG_ASSERT(t_nchars_left >= t_res.nchars); + s_nchars_left-= s_res.nchars; + t_nchars_left-= t_res.nchars; + } + + return 0; +} + + +/* + One-level collations. +*/ +static int +MY_FUNCTION_NAME(strnncollsp_nchars)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + size_t nchars, + uint flags) +{ + return MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(cs, &cs->uca->level[0], + s, slen, t, tlen, + nchars, flags); +} + + +/* + Multi-level collations. +*/ +static int +MY_FUNCTION_NAME(strnncollsp_nchars_multilevel)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + size_t nchars, + uint flags) +{ + uint i, level_flags= cs->levels_for_order; + for (i= 0; level_flags; i++, level_flags>>= 1) + { + int ret; + if (!(level_flags & 1)) + continue; + ret= MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(cs, + &cs->uca->level[i], + s, slen, + t, tlen, + nchars, flags); + if (ret) + return ret; + } + return 0; +} + + +/* + Calculates hash value for the given string, + according to the collation, and ignoring trailing spaces. 
+ + SYNOPSIS: + hash_sort() + cs Character set information + s String + slen String's length + n1 First hash parameter + n2 Second hash parameter + + NOTES: + Scans consequently weights and updates + hash parameters n1 and n2. In a case insensitive collation, + upper and lower case of the same letter will return the same + weight sequence, and thus will produce the same hash values + in n1 and n2. + + This functions is used for one-level and for multi-level collations. + We intentionally use only primary level in multi-level collations. + This helps to have PARTITION BY KEY put primarily equal records + into the same partition. E.g. in utf8mb3_thai_520_ci records that differ + only in tone marks go into the same partition. + + RETURN + N/A +*/ + +static void +MY_FUNCTION_NAME(hash_sort)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + ulong *nr1, ulong *nr2) +{ + int s_res; + my_uca_scanner scanner; + my_uca_scanner_param param; + int space_weight= my_space_weight(&cs->uca->level[0]); + register ulong m1= *nr1, m2= *nr2; + + my_uca_scanner_param_init(¶m, cs, &cs->uca->level[0]); + my_uca_scanner_init_any(&scanner, s, slen); + + while ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner, ¶m)) >0) + { + if (s_res == space_weight) + { + /* Combine all spaces to be able to skip end spaces */ + uint count= 0; + do + { + count++; + if ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner, ¶m)) <= 0) + { + /* Skip strings at end of string */ + goto end; + } + } + while (s_res == space_weight); + + /* Add back that has for the space characters */ + do + { + /* + We can't use MY_HASH_ADD_16() here as we, because of a misstake + in the original code, where we added the 16 byte variable the + opposite way. Changing this would cause old partitioned tables + to fail. 
+ */ + MY_HASH_ADD(m1, m2, space_weight >> 8); + MY_HASH_ADD(m1, m2, space_weight & 0xFF); + } + while (--count != 0); + + } + /* See comment above why we can't use MY_HASH_ADD_16() */ + MY_HASH_ADD(m1, m2, s_res >> 8); + MY_HASH_ADD(m1, m2, s_res & 0xFF); + } +end: + *nr1= m1; + *nr2= m2; +} + + +static void +MY_FUNCTION_NAME(hash_sort_nopad)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + ulong *nr1, ulong *nr2) +{ + int s_res; + my_uca_scanner scanner; + my_uca_scanner_param param; + register ulong m1= *nr1, m2= *nr2; + + my_uca_scanner_param_init(¶m, cs, &cs->uca->level[0]); + my_uca_scanner_init_any(&scanner, s, slen); + + while ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner, ¶m)) >0) + { + /* See comment above why we can't use MY_HASH_ADD_16() */ + MY_HASH_ADD(m1, m2, s_res >> 8); + MY_HASH_ADD(m1, m2, s_res & 0xFF); + } + *nr1= m1; + *nr2= m2; +} + + + +/* + For the given string creates its "binary image", suitable + to be used in binary comparison, i.e. in memcmp(). + + SYNOPSIS: + my_strnxfrm_uca() + cs Character set information + dst Where to write the image + dstlen Space available for the image, in bytes + src The source string + srclen Length of the source string, in bytes + + NOTES: + In a loop, scans weights from the source string and writes + them into the binary image. In a case insensitive collation, + upper and lower cases of the same letter will produce the + same image subsequences. When we have reached the end-of-string + or found an illegal multibyte sequence, the loop stops. + + It is impossible to restore the original string using its + binary image. + + Binary images are used for bulk comparison purposes, + e.g. in ORDER BY, when it is more efficient to create + a binary image and use it instead of weight scanner + for the original strings for every comparison. + + RETURN + Number of bytes that have been written into the binary image. 
+*/ + +static uchar * +MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(CHARSET_INFO *cs, + MY_UCA_WEIGHT_LEVEL *level, + uchar *dst, uchar *de, + uint *nweights, + const uchar *src, size_t srclen) +{ + my_uca_scanner scanner; + my_uca_scanner_param param; + int s_res; + + DBUG_ASSERT(src || !srclen); + +#if MY_UCA_ASCII_OPTIMIZE && !MY_UCA_COMPILE_CONTRACTIONS + /* + Fast path for the ASCII range with no contractions. + */ + { + const uchar *de2= de - 1; /* Last position where 2 bytes fit */ + const uint16 *weights0= level->weights[0]; + uint lengths0= level->lengths[0]; + for ( ; ; src++, srclen--) + { + const uint16 *weight; + if (!srclen || !*nweights) + return dst; /* Done */ + if (*src > 0x7F) + break; /* Non-ASCII */ + + weight= weights0 + (((uint) *src) * lengths0); + if (!(s_res= *weight)) + continue; /* Ignorable */ + if (weight[1]) /* Expansion (e.g. in a user defined collation */ + break; + + /* Here we have a character with extactly one 2-byte UCA weight */ + if (dst < de2) /* Most typical case is when both bytes fit */ + { + *dst++= s_res >> 8; + *dst++= s_res & 0xFF; + (*nweights)--; + continue; + } + if (dst >= de) /* No space left in "dst" */ + return dst; + *dst++= s_res >> 8; /* There is space only for one byte */ + (*nweights)--; + return dst; + } + } +#endif + + my_uca_scanner_param_init(¶m, cs, level); + my_uca_scanner_init_any(&scanner, src, srclen); + + for (; dst < de && *nweights && + (s_res= MY_FUNCTION_NAME(scanner_next)(&scanner, ¶m)) > 0 ; + (*nweights)--) + { + *dst++= s_res >> 8; + if (dst < de) + *dst++= s_res & 0xFF; + } + return dst; +} + + +static uchar * +MY_FUNCTION_NAME(strnxfrm_onelevel)(CHARSET_INFO *cs, + MY_UCA_WEIGHT_LEVEL *level, + uchar *dst, uchar *de, uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *d0= dst; + dst= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level, + dst, de, &nweights, + src, srclen); + DBUG_ASSERT(dst <= de); + if (dst < de && nweights && (flags & 
MY_STRXFRM_PAD_WITH_SPACE)) + dst= my_strnxfrm_uca_padn(dst, de, nweights, my_space_weight(level)); + DBUG_ASSERT(dst <= de); + my_strxfrm_desc_and_reverse(d0, dst, flags, 0); + return dst; +} + + + +static uchar * +MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(CHARSET_INFO *cs, + MY_UCA_WEIGHT_LEVEL *level, + uchar *dst, uchar *de, uint nweights, + const uchar *src, size_t srclen, + uint flags) +{ + uchar *d0= dst; + dst= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level, + dst, de, &nweights, + src, srclen); + DBUG_ASSERT(dst <= de); + /* Pad with the minimum possible weight on this level */ + if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) + dst= my_strnxfrm_uca_padn(dst, de, nweights, min_weight_on_level(level)); + DBUG_ASSERT(dst <= de); + my_strxfrm_desc_and_reverse(d0, dst, flags, 0); + return dst; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *d0= dst; + uchar *de= dst + dstlen; + + /* + There are two ways to handle trailing spaces for PAD SPACE collations: + 1. Keep trailing spaces as they are, so have strnxfrm_onelevel() scan + spaces as normal characters. This will call scanner_next() for every + trailing space and calculate its weight using UCA weights. + 2. Strip trailing spaces before calling strnxfrm_onelevel(), as it will + append weights for implicit spaces anyway, up to the desired key size. + This will effectively generate exactly the same sortable key result. + The latter is much faster. + */ + + if (flags & MY_STRXFRM_PAD_WITH_SPACE) + srclen= my_ci_lengthsp(cs, (const char*) src, srclen); + dst= MY_FUNCTION_NAME(strnxfrm_onelevel)(cs, &cs->uca->level[0], + dst, de, nweights, + src, srclen, flags); + /* + This can probably be changed to memset(dst, 0, de - dst), + like my_strnxfrm_uca_multilevel() does. 
+ */ + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + dst= my_strnxfrm_uca_pad(dst, de, my_space_weight(&cs->uca->level[0])); + return dst - d0; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, + uint nweights, + const uchar *src, size_t srclen, + uint flags) +{ + uchar *d0= dst; + uchar *de= dst + dstlen; + + dst= MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs, &cs->uca->level[0], + dst, de, nweights, + src, srclen, flags); + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + { + memset(dst, 0, de - dst); + dst= de; + } + return dst - d0; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm_multilevel)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, + uint nweights, + const uchar *src, size_t srclen, + uint flags) +{ + uint level_flags= cs->levels_for_order; + uchar *d0= dst; + uchar *de= dst + dstlen; + uint current_level; + + for (current_level= 0; level_flags; current_level++, level_flags>>= 1) + { + if (!(level_flags & 1)) + continue; + if (!(flags & MY_STRXFRM_LEVEL_ALL) || + (flags & (MY_STRXFRM_LEVEL1 << current_level))) + dst= cs->state & MY_CS_NOPAD ? 
+ MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs, + &cs->uca->level[current_level], + dst, de, nweights, + src, srclen, flags) : + MY_FUNCTION_NAME(strnxfrm_onelevel)(cs, + &cs->uca->level[current_level], + dst, de, nweights, + src, srclen, flags); + } + + if (dst < de && (flags & MY_STRXFRM_PAD_TO_MAXLEN)) + { + memset(dst, 0, de - dst); + dst= de; + } + + return dst - d0; +} + + +/* + One-level, PAD SPACE +*/ +MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)= +{ + MY_UCA_COLL_INIT, + MY_FUNCTION_NAME(strnncoll), + MY_FUNCTION_NAME(strnncollsp), + MY_FUNCTION_NAME(strnncollsp_nchars), + MY_FUNCTION_NAME(strnxfrm), + my_strnxfrmlen_any_uca, + MY_LIKE_RANGE, + my_wildcmp_uca, + NULL, /* strcasecmp() */ + my_instr_mb, + MY_FUNCTION_NAME(hash_sort), + my_propagate_complex, + my_min_str_mb_simple, + my_max_str_mb_simple, + my_ci_get_id_uca, + my_ci_get_collation_name_uca +}; + + +/* + One-level, NO PAD + For character sets with mbminlen==1 use MY_LIKE_RANGE=my_like_range_mb + For character sets with mbminlen>=2 use MY_LIKE_RANGE=my_like_range_generic +*/ +MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)= +{ + MY_UCA_COLL_INIT, + MY_FUNCTION_NAME(strnncoll), + MY_FUNCTION_NAME(strnncollsp_nopad), + MY_FUNCTION_NAME(strnncollsp_nchars), + MY_FUNCTION_NAME(strnxfrm_nopad), + my_strnxfrmlen_any_uca, + MY_LIKE_RANGE, /* my_like_range_mb or my_like_range_generic */ + my_wildcmp_uca, + NULL, /* strcasecmp() */ + my_instr_mb, + MY_FUNCTION_NAME(hash_sort_nopad), + my_propagate_complex, + my_min_str_mb_simple_nopad, + my_max_str_mb_simple, + my_ci_get_id_uca, + my_ci_get_collation_name_uca +}; + + +/* + Multi-level, PAD SPACE +*/ +MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)= +{ + MY_UCA_COLL_INIT, + MY_FUNCTION_NAME(strnncoll_multilevel), + MY_FUNCTION_NAME(strnncollsp_multilevel), + MY_FUNCTION_NAME(strnncollsp_nchars_multilevel), + MY_FUNCTION_NAME(strnxfrm_multilevel), + my_strnxfrmlen_any_uca_multilevel, + MY_LIKE_RANGE, + 
my_wildcmp_uca, + NULL, /* strcasecmp() */ + my_instr_mb, + MY_FUNCTION_NAME(hash_sort), + my_propagate_complex, + my_min_str_mb_simple, + my_max_str_mb_simple, + my_ci_get_id_uca, + my_ci_get_collation_name_uca +}; + + +/* + Multi-level, NO PAD +*/ +MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)= +{ + MY_UCA_COLL_INIT, + MY_FUNCTION_NAME(strnncoll_multilevel), + MY_FUNCTION_NAME(strnncollsp_nopad_multilevel), + MY_FUNCTION_NAME(strnncollsp_nchars_multilevel), + MY_FUNCTION_NAME(strnxfrm_multilevel), + my_strnxfrmlen_any_uca_multilevel, + MY_LIKE_RANGE, + my_wildcmp_uca, + NULL, /* strcasecmp() */ + my_instr_mb, + MY_FUNCTION_NAME(hash_sort), + my_propagate_complex, + my_min_str_mb_simple_nopad, + my_max_str_mb_simple, + my_ci_get_id_uca, + my_ci_get_collation_name_uca +}; + + +MY_COLLATION_HANDLER_PACKAGE MY_FUNCTION_NAME(package)= +{ + &MY_FUNCTION_NAME(collation_handler), + &MY_FUNCTION_NAME(collation_handler_nopad), + &MY_FUNCTION_NAME(collation_handler_multilevel), + &MY_FUNCTION_NAME(collation_handler_nopad_multilevel) +}; + + +#undef MY_FUNCTION_NAME +#undef MY_MB_WC +#undef MY_LIKE_RANGE +#undef MY_UCA_ASCII_OPTIMIZE +#undef MY_UCA_COMPILE_CONTRACTIONS +#undef MY_UCA_COLL_INIT |