diff options
Diffstat (limited to 'contrib/fastutf8/fastutf8.c')
-rw-r--r-- | contrib/fastutf8/fastutf8.c | 160 |
1 files changed, 160 insertions, 0 deletions
diff --git a/contrib/fastutf8/fastutf8.c b/contrib/fastutf8/fastutf8.c new file mode 100644 index 0000000..89becaf --- /dev/null +++ b/contrib/fastutf8/fastutf8.c @@ -0,0 +1,160 @@ +/* + * MIT License + * + * Copyright (c) 2019 Yibo Cai + * Copyright (c) 2019 Vsevolod Stakhov + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "fastutf8.h" +#include "libcryptobox/platform_config.h" + + +/* + * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 + * + * Table 3-7. Well-Formed UTF-8 Byte Sequences + * + * +--------------------+------------+-------------+------------+-------------+ + * | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | + * +--------------------+------------+-------------+------------+-------------+ + * | U+0000..U+007F | 00..7F | | | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+0080..U+07FF | C2..DF | 80..BF | | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | + * +--------------------+------------+-------------+------------+-------------+ + * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | + * +--------------------+------------+-------------+------------+-------------+ + * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | + * +--------------------+------------+-------------+------------+-------------+ + */ + +/* Return 0 - success, >0 - index (1 based) of first error char */ +off_t +rspamd_fast_utf8_validate_ref (const unsigned char *data, size_t len) +{ + off_t err_pos = 1; + + while (len) { + int bytes; + const unsigned char byte1 = data[0]; + + /* 00..7F */ + if (byte1 <= 0x7F) { + bytes = 1; + /* C2..DF, 80..BF */ + } + else if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF && + (signed char) data[1] <= (signed char) 0xBF) { + bytes = 2; + } + else if (len >= 3) { + const unsigned char byte2 = data[1]; + + /* Is byte2, byte3 between 0x80 ~ 0xBF */ + const int byte2_ok = (signed char) byte2 <= (signed char) 0xBF; + const int byte3_ok = (signed char) data[2] <= (signed char) 0xBF; + + if (byte2_ok && byte3_ok && + /* E0, A0..BF, 80..BF */ + ((byte1 == 0xE0 && byte2 >= 0xA0) || + /* E1..EC, 80..BF, 80..BF */ + (byte1 >= 0xE1 && byte1 <= 0xEC) || + /* ED, 80..9F, 80..BF */ + (byte1 == 0xED && byte2 <= 0x9F) || + /* EE..EF, 80..BF, 80..BF */ + (byte1 >= 0xEE && byte1 <= 0xEF))) { + bytes = 3; + } + else if (len >= 4) { + /* Is byte4 between 0x80 ~ 0xBF */ + const int byte4_ok = (signed char) data[3] <= (signed char) 0xBF; + + if (byte2_ok && byte3_ok && byte4_ok && + /* F0, 90..BF, 80..BF, 80..BF */ + ((byte1 == 0xF0 && byte2 >= 0x90) || + /* F1..F3, 80..BF, 80..BF, 80..BF */ + (byte1 >= 0xF1 && byte1 <= 0xF3) || + /* F4, 80..8F, 80..BF, 80..BF */ + (byte1 == 0xF4 && byte2 <= 0x8F))) { + bytes = 4; + } + else { + return err_pos; + } + } + else { + return err_pos; + } + } + else { + return err_pos; + } + + len -= bytes; + err_pos += bytes; + data += bytes; + } + + return 0; +} + +/* Prototypes */ +#if defined(HAVE_SSE41) && defined(__x86_64__) +extern off_t rspamd_fast_utf8_validate_sse41 (const unsigned char *data, size_t len); +#endif +#if defined(HAVE_AVX2) && defined(__x86_64__) +extern off_t rspamd_fast_utf8_validate_avx2 (const unsigned char *data, size_t len); +#endif + +static off_t (*validate_func) (const unsigned char *data, size_t len) = + rspamd_fast_utf8_validate_ref; + + +void +rspamd_fast_utf8_library_init (unsigned flags) +{ +#if defined(HAVE_SSE41) && defined(__x86_64__) + if (flags & RSPAMD_FAST_UTF8_FLAG_SSE41) { + validate_func = rspamd_fast_utf8_validate_sse41; + } +#endif +#if defined(HAVE_AVX2) && defined(__x86_64__) + if (flags & RSPAMD_FAST_UTF8_FLAG_AVX2) { + validate_func = rspamd_fast_utf8_validate_avx2; + } +#endif +} + +off_t +rspamd_fast_utf8_validate (const unsigned char *data, size_t len) +{ + return len >= 64 ? + validate_func (data, len) : + rspamd_fast_utf8_validate_ref (data, len); +}
\ No newline at end of file |