diff options
Diffstat (limited to 'strings/ctype-mb.inl')
-rw-r--r-- | strings/ctype-mb.inl | 263 |
1 files changed, 263 insertions, 0 deletions
diff --git a/strings/ctype-mb.inl b/strings/ctype-mb.inl new file mode 100644 index 00000000..6cde31a3 --- /dev/null +++ b/strings/ctype-mb.inl @@ -0,0 +1,263 @@ +/* + Copyright (c) 2015, MariaDB Foundation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA +*/ + + +#ifndef MY_FUNCTION_NAME +#error MY_FUNCTION_NAME is not defined +#endif + +#if defined(IS_MB3_CHAR) && !defined(IS_MB2_CHAR) +#error IS_MB3_CHAR is defined, while IS_MB2_CHAR is not! +#endif + +#if defined(IS_MB4_CHAR) && !defined(IS_MB3_CHAR) +#error IS_MB4_CHAR is defined, while IS_MB3_CHAR is not! +#endif + + +#ifdef DEFINE_ASIAN_ROUTINES +#define DEFINE_WELL_FORMED_CHAR_LENGTH +#define DEFINE_CHARLEN +#define DEFINE_NATIVE_TO_MB_VARLEN +#endif + + +#ifdef DEFINE_CHARLEN +/** + Returns length of the left-most character of a string. + @param cs - charset with mbminlen==1 and mbmaxlen<=4 + @param b - the beginning of the string + @param e - the end of the string + + @return MY_CS_ILSEQ if a bad byte sequence was found + @return MY_CS_TOOSMALL(N) if the string ended unexpectedly + @return >0 if a valid character was found +*/ +static int +MY_FUNCTION_NAME(charlen)(CHARSET_INFO *cs __attribute__((unused)), + const uchar *b, const uchar *e) +{ + DBUG_ASSERT(cs->mbminlen == 1); + DBUG_ASSERT(cs->mbmaxlen <= 4); + + if (b >= e) + return MY_CS_TOOSMALL; + if ((uchar) b[0] < 128) + return 1; /* Single byte ASCII character */ + +#ifdef IS_8BIT_CHAR + if (IS_8BIT_CHAR(b[0])) + { + /* Single byte non-ASCII character, e.g. half width kana in sjis */ + return 1; + } +#endif + + if (b + 2 > e) + return MY_CS_TOOSMALLN(2); + if (IS_MB2_CHAR(b[0], b[1])) + return 2; /* Double byte character */ + +#ifdef IS_MB3_CHAR + if (b + 3 > e) + { +#ifdef IS_MB_PREFIX2 + if (!IS_MB_PREFIX2(b[0], b[1])) + return MY_CS_ILSEQ; +#endif + return MY_CS_TOOSMALLN(3); + } + if (IS_MB3_CHAR(b[0], b[1], b[2])) + return 3; /* Three-byte character */ +#endif + +#ifdef IS_MB4_CHAR + if (b + 4 > e) + return MY_CS_TOOSMALLN(4); + if (IS_MB4_CHAR(b[0], b[1], b[2], b[3])) + return 4; /* Four-byte character */ +#endif + + /* Wrong byte sequence */ + return MY_CS_ILSEQ; +} +#endif /* DEFINE_CHARLEN */ + + +#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH +/** + Returns well formed length of a string + measured in characters (rather than in bytes). + Version for character sets that define IS_MB?_CHAR(), e.g. big5. +*/ +static size_t +MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, + size_t nchars, + MY_STRCOPY_STATUS *status) +{ + size_t nchars0= nchars; + for ( ; b < e && nchars ; nchars--) + { + if ((uchar) b[0] < 128) + { + b++; /* Single byte ASCII character */ + continue; + } + + if (b + 2 <= e && IS_MB2_CHAR(b[0], b[1])) + { + b+= 2; /* Double byte character */ + continue; + } + +#ifdef IS_MB3_CHAR + if (b + 3 <= e && IS_MB3_CHAR(b[0], b[1], b[2])) + { + b+= 3; /* Three-byte character */ + continue; + } +#endif + +#ifdef IS_MB4_CHAR + if (b + 4 <= e && IS_MB4_CHAR(b[0], b[1], b[2], b[3])) + { + b+= 4; /* Four-byte character */ + continue; + } +#endif + +#ifdef IS_8BIT_CHAR + if (IS_8BIT_CHAR(b[0])) + { + b++; /* Single byte non-ASCII character, e.g. half width kana in sjis */ + continue; + } +#endif + + /* Wrong byte sequence */ + status->m_source_end_pos= status->m_well_formed_error_pos= b; + return nchars0 - nchars; + } + status->m_source_end_pos= b; + status->m_well_formed_error_pos= NULL; + return nchars0 - nchars; +} +#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH */ + + +#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#ifndef CHARLEN +#error CHARLEN is not defined +#endif +/** + Returns well formed length of a string + measured in characters (rather than in bytes). + Version for character sets that define CHARLEN(), e.g. utf8mb3. + CHARLEN(cs,b,e) must use the same return code convension that mb_wc() does: + - a positive number in the range [1-mbmaxlen] if a valid + single-byte or multi-byte character was found + - MY_CS_ILSEQ (0) on a bad byte sequence + - MY_CS_TOOSMALLxx if the incoming sequence is incomplete +*/ +static size_t +MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, + size_t nchars, + MY_STRCOPY_STATUS *status) +{ + size_t nchars0= nchars; + int chlen; + for ( ; nchars ; nchars--, b+= chlen) + { + if ((chlen= CHARLEN(cs, (uchar*) b, (uchar*) e)) <= 0) + { + status->m_well_formed_error_pos= b < e ? b : NULL; + status->m_source_end_pos= b; + return nchars0 - nchars; + } + } + status->m_well_formed_error_pos= NULL; + status->m_source_end_pos= b; + return nchars0 - nchars; +} +#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN */ + + +#ifdef DEFINE_NATIVE_TO_MB_VARLEN +/* + Write a native 2-byte character. + If the full character does not fit, only the first byte is written. +*/ +static inline int +my_native_to_mb_fixed2(my_wc_t wc, uchar *s, uchar *e) +{ + /* The caller must insure there is a space for at least one byte */ + DBUG_ASSERT(s < e); + s[0]= (uchar) (wc >> 8); + if (s + 2 > e) + return MY_CS_TOOSMALL2; + s[1]= wc & 0xFF; + return 2; +} + + +/* + Write a native 3-byte character. + If the full character does not fit, only the leading bytes are written. +*/ +static inline int +my_native_to_mb_fixed3(my_wc_t wc, uchar *s, uchar *e) +{ + /* The caller must insure there is a space for at least one byte */ + DBUG_ASSERT(s < e); + s[0]= (uchar) (wc >> 16); + if (s + 2 > e) + return MY_CS_TOOSMALL2; + s[1]= (wc >> 8) & 0xFF; + if (s + 3 > e) + return MY_CS_TOOSMALL3; + s[2]= wc & 0xFF; + return 3; +} + + +/* + Write a native 1-byte or 2-byte or 3-byte character. +*/ + +static int +MY_FUNCTION_NAME(native_to_mb)(CHARSET_INFO *cs __attribute__((unused)), + my_wc_t wc, uchar *s, uchar *e) +{ + if (s >= e) + return MY_CS_TOOSMALL; + if ((int) wc <= 0xFF) + { + s[0]= (uchar) wc; + return 1; + } +#ifdef IS_MB3_HEAD + if (wc > 0xFFFF) + return my_native_to_mb_fixed3(wc, s, e); +#endif + return my_native_to_mb_fixed2(wc, s, e); +} +#endif /* DEFINE_NATIVE_TO_MB_VARLEN */ + + +#undef MY_FUNCTION_NAME |