From 5da14042f70711ea5cf66e034699730335462f66 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 5 May 2024 14:08:03 +0200 Subject: Merging upstream version 1.45.3+dfsg. Signed-off-by: Daniel Baumann --- src/fluent-bit/lib/onigmo/test_enc_utf8.c | 291 ++++++++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 src/fluent-bit/lib/onigmo/test_enc_utf8.c (limited to 'src/fluent-bit/lib/onigmo/test_enc_utf8.c') diff --git a/src/fluent-bit/lib/onigmo/test_enc_utf8.c b/src/fluent-bit/lib/onigmo/test_enc_utf8.c new file mode 100644 index 000000000..6f99fa1d2 --- /dev/null +++ b/src/fluent-bit/lib/onigmo/test_enc_utf8.c @@ -0,0 +1,291 @@ +#include "config.h" +#ifdef ONIG_ESCAPE_UCHAR_COLLISION +#undef ONIG_ESCAPE_UCHAR_COLLISION +#endif + +#include +#include +#include + +#include "onigmo.h" + +/* #define USE_UTF8_31BITS */ + +#define SLEN(s) strlen(s) + +static int nsucc = 0; +static int nfail = 0; +static int nerror = 0; + +static FILE* err_file; + +static OnigRegion* region; + +static void xx(char* pattern, char* str, int from, int to, int mem, int not) +{ + int r; + + regex_t* reg; + OnigErrorInfo einfo; + OnigSyntaxType syn = *ONIG_SYNTAX_DEFAULT; + + r = onig_new(®, (UChar* )pattern, (UChar* )(pattern + SLEN(pattern)), + ONIG_OPTION_NONE, ONIG_ENCODING_UTF8, &syn, &einfo); + if (r) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(err_file, "ERROR: %s\n", s); + nerror++; + return ; + } + + r = onig_search(reg, (UChar* )str, (UChar* )(str + SLEN(str)), + (UChar* )str, (UChar* )(str + SLEN(str)), + region, ONIG_OPTION_NONE); + if (r < ONIG_MISMATCH) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r); + fprintf(err_file, "ERROR: %s\n", s); + nerror++; + return ; + } + + if (r == ONIG_MISMATCH) { + if (not) { + fprintf(stdout, "OK(N): /%s/ '%s'\n", pattern, str); + nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s'\n", pattern, str); + nfail++; + } + } + else { + if (not) { + fprintf(stdout, "FAIL(N): /%s/ '%s'\n", pattern, str); + nfail++; + } + else { + if (region->beg[mem] == from && region->end[mem] == to) { + fprintf(stdout, "OK: /%s/ '%s'\n", pattern, str); + nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s' %d-%d : %d-%d\n", pattern, str, + (int)from, (int)to, (int)region->beg[mem], (int)region->end[mem]); + nfail++; + } + } + } + onig_free(reg); +} + +static void x2(char* pattern, char* str, int from, int to) +{ + xx(pattern, str, from, to, 0, 0); +} + +#if 0 +static void x3(char* pattern, char* str, int from, int to, int mem) +{ + xx(pattern, str, from, to, mem, 0); +} + +static void n(char* pattern, char* str) +{ + xx(pattern, str, 0, 0, 0, 1); +} +#endif + +const OnigEncodingType * target_encoding = ONIG_ENCODING_UTF8; + +static void test_mbc_enc_len(const char * str, int expect) { + const OnigEncodingType * enc = target_encoding; + size_t len = strlen(str); + int actual = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (const UChar *)str, (const UChar *)str + len); + if (actual == expect) { + fprintf(stdout, "OK: mbc_enc_len(%s)=%d\n", str, expect); + nsucc++; + } else { + fprintf(stdout, "FAIL: mbc_enc_len(%s)=%d\n", str, expect); + nfail++; + } +} + +static void test_code_to_mbclen(OnigCodePoint code, int expect) { + const OnigEncodingType * enc = target_encoding; + int actual = ONIGENC_CODE_TO_MBCLEN(enc, code); + if (actual == expect) { + fprintf(stdout, "OK: code_to_mbclen(U+%04X)=%d\n", code, expect); + nsucc++; + } else { + fprintf(stdout, "FAIL: code_to_mbclen(U+%04X)=%d\n", code, expect); + nfail++; + } +} + +static void test_mbc_to_code(const char * str, int expect) { + const OnigEncodingType * enc = target_encoding; + size_t len = strlen(str); + int actual = ONIGENC_MBC_TO_CODE(enc, (const UChar *)str, (const UChar *)str + len); + if (actual == expect) { + fprintf(stdout, "OK: mbc_to_code(%s)=U+%04X\n", str, expect); + nsucc++; + } else { + fprintf(stdout, "FAIL: mbc_to_code(%s)=U+%04X\n", str, expect); + nfail++; + } +} + +static void test_code_to_mbc(OnigCodePoint code, const char * expect, int exp_error) { + const OnigEncodingType * enc = target_encoding; + UChar * buf = (UChar *)malloc(ONIGENC_MBC_MAXLEN(enc) + 1); + int len = ONIGENC_CODE_TO_MBC(enc, code, buf); + if (len < 0) { + int err = len; + if (err == exp_error) { + fprintf(stdout, "OK: code_to_mbc(U+%04X)=(error %d)\n", code, len); + nsucc++; + } else { + fprintf(stdout, "FAIL: code_to_mbc(U+%04X)=(error %d)\n", code, len); + nfail++; + } + goto exit; + } + + buf[len] = '\0'; + if (strcmp((const char *)buf, expect) == 0) { + fprintf(stdout, "OK: code_to_mbc(U+%04X)=%s\n", code, expect); + nsucc++; + } else { + fprintf(stdout, "FAIL: code_to_mbc(U+%04X)=%s\n", code, expect); + nfail++; + } + +exit: + free(buf); +} + +extern int main(int argc, char* argv[]) +{ + err_file = stdout; + + region = onig_region_new(); + + test_mbc_enc_len("\xC2\x80", 2); // S0, S1, A + test_code_to_mbclen(0x0080, 2); + test_code_to_mbc(0x0080, "\xC2\x80", 0); + x2("\\x{0080}", "\xC2\x80", 0, 2); + x2("\xC2\x80", "\xC2\x80", 0, 2); // min 2 bytes + + test_mbc_enc_len("\xC2\xC0", -1); // S0, S1, F + + test_mbc_enc_len("\xDF\xBF", 2); // S0, S1, A + test_code_to_mbclen(0x07FF, 2); + test_code_to_mbc(0x07FF, "\xDF\xBF", 0); + x2("\\x{07FF}", "\xDF\xBF", 0, 2); + x2("\xDF\xBF", "\xDF\xBF", 0, 2); // max 2 bytes + + test_mbc_enc_len("\xE0\xA0\x80", 3); // S0, S2, S1, A + test_code_to_mbclen(0x0800, 3); + test_code_to_mbc(0x0800, "\xE0\xA0\x80", 0); + x2("\xE0\xA0\x80", "\xE0\xA0\x80", 0, 3); + x2("\\x{0800}", "\xE0\xA0\x80", 0, 3); // min 3 bytes + + test_mbc_enc_len("\xE0\xC0\x80", -1); // S0, S2, F + + test_mbc_enc_len("\xEF\xBF\xBF", 3); // S0, S3, S1, A + test_code_to_mbclen(0xFFFF, 3); + test_code_to_mbc(0xFFFF, "\xEF\xBF\xBF", 0); + x2("\xEF\xBF\xBF", "\xEF\xBF\xBF", 0, 3); + x2("\\x{FFFF}", "\xEF\xBF\xBF", 0, 3); // max 3 bytes + + test_mbc_enc_len("\xEF\xC0\xBF", -1); // S0, S3, F + + test_mbc_enc_len("\xED\x80\x80", 3); // S0, S4, S1, A + test_code_to_mbclen(0xD000, 3); + test_code_to_mbc(0xD000, "\xED\x80\x80", 0); + x2("\xED\x80\x80", "\xED\x80\x80", 0, 3); + x2("\\x{D000}", "\xED\x80\x80", 0, 3); + + test_mbc_enc_len("\xED\xA0\xA0", -1); // S0, S4, F + + test_mbc_enc_len("\xF0\x90\x80\x80", 4); // S0, S5, S3, S1, A + test_code_to_mbclen(0x00010000, 4); + test_code_to_mbc(0x00010000, "\xF0\x90\x80\x80", 0); + x2("\xF0\x90\x80\x80", "\xF0\x90\x80\x80", 0, 4); + x2("\\x{00010000}", "\xF0\x90\x80\x80", 0, 4); // min 4 bytes + + test_mbc_enc_len("\xF0\x80\x80\x80", -1); // S0, S5, F + + test_mbc_enc_len("\xF4\x8F\xBF\xBF", 4); // S0, S7, S3, S1, A + test_code_to_mbclen(0x0010FFFF, 4); + test_code_to_mbc(0x0010FFFF, "\xF4\x8F\xBF\xBF", 0); + x2("\xF4\x8F\xBF\xBF", "\xF4\x8F\xBF\xBF", 0, 4); + x2("\\x{0010FFFF}", "\xF4\x8F\xBF\xBF", 0, 4); // max Unicode + +#ifndef USE_UTF8_31BITS + test_mbc_enc_len("\xF7\xBF\xBF\xBF", -1); // S0, F +#else + test_mbc_enc_len("\xF7\xBF\xBF\xBF", 4); // S0, S6, S3, S1, A + test_code_to_mbclen(0x001FFFFF, 4); + test_code_to_mbc(0x001FFFFF, "\xF7\xBF\xBF\xBF", 0); + x2("\xF7\xBF\xBF\xBF", "\xF7\xBF\xBF\xBF", 0, 4); + x2("\\x{001FFFFF}", "\xF7\xBF\xBF\xBF", 0, 4); // max 4 bytes (21bits) + + test_mbc_enc_len("\xF7\xC0\xBF\xBF", -1); // S0, S6, F + + test_mbc_enc_len("\xF8\x88\x80\x80\x80", 5); // S0, S8, S6, S3, S1, A + test_code_to_mbclen(0x00200000, 5); + test_code_to_mbc(0x00200000, "\xF8\x88\x80\x80\x80", 0); + x2("\xF8\x88\x80\x80\x80", "\xF8\x88\x80\x80\x80", 0, 5); + x2("\\x{00200000}", "\xF8\x88\x80\x80\x80", 0, 5); // min 5 bytes + + test_mbc_enc_len("\xF8\x80\x80\x80\x80", -1); // S0, S8, F + + test_mbc_enc_len("\xFB\xBF\xBF\xBF\xBF", 5); // S0, S9, S6, S3, S1, A + test_code_to_mbclen(0x03FFFFFF, 5); + test_code_to_mbc(0x03FFFFFF, "\xFB\xBF\xBF\xBF\xBF", 0); + x2("\xFB\xBF\xBF\xBF\xBF", "\xFB\xBF\xBF\xBF\xBF", 0, 5); + x2("\\x{03FFFFFF}", "\xFB\xBF\xBF\xBF\xBF", 0, 5); // max 5 bytes + + test_mbc_enc_len("\xFB\xC0\xBF\xBF\xBF", -1); // S0, S9, F + + test_mbc_enc_len("\xFC\x84\x80\x80\x80\x80", 6); // S0, S10, S9, S6, S3, S1, A + test_code_to_mbclen(0x04000000, 6); + test_code_to_mbc(0x04000000, "\xFC\x84\x80\x80\x80\x80", 0); + x2("\xFC\x84\x80\x80\x80\x80", "\xFC\x84\x80\x80\x80\x80", 0, 6); + x2("\\x{04000000}", "\xFC\x84\x80\x80\x80\x80", 0, 6); // min 6 bytes + + test_mbc_enc_len("\xFC\x80\x80\x80\x80\x80", -1); // S0, S10, F + + test_mbc_enc_len("\xFD\xBF\xBF\xBF\xBF\xBF", 6); // S0, S11, S9, S6, S3, S1, A + test_code_to_mbclen(0x7FFFFFFF, 6); + test_code_to_mbc(0x7FFFFFFF, "\xFD\xBF\xBF\xBF\xBF\xBF", 0); + x2("\xFD\xBF\xBF\xBF\xBF\xBF", "\xFD\xBF\xBF\xBF\xBF\xBF", 0, 6); + x2("\\x{7FFFFFFF}", "\xFD\xBF\xBF\xBF\xBF\xBF", 0, 6); // max 6 bytes + + test_mbc_enc_len("\xFD\xC0\xBF\xBF\xBF\xBF", -1); // S0, S11, F + + test_code_to_mbclen(0x80000000, ONIGERR_TOO_BIG_WIDE_CHAR_VALUE); + test_code_to_mbc(0x80000000, "", ONIGERR_TOO_BIG_WIDE_CHAR_VALUE); + test_code_to_mbclen(0xFFFFFFFD, ONIGERR_TOO_BIG_WIDE_CHAR_VALUE); + test_code_to_mbc(0xFFFFFFFD, "", ONIGERR_TOO_BIG_WIDE_CHAR_VALUE); + test_code_to_mbclen(0xFFFFFFFE, 1); + test_code_to_mbc(0xFFFFFFFE, "\xFE", 0); + test_code_to_mbclen(0xFFFFFFFF, 1); + test_code_to_mbc(0xFFFFFFFF, "\xFF", 0); +#endif + + test_mbc_to_code("\xFE", 0xFFFFFFFE); + test_mbc_to_code("\xFF", 0xFFFFFFFF); + + fprintf(stdout, + "\nRESULT SUCC: %d, FAIL: %d, ERROR: %d (by Onigmo %s)\n", + nsucc, nfail, nerror, onig_version()); + + onig_region_free(region, 1); + onig_end(); + + return ((nfail == 0 && nerror == 0) ? 0 : -1); +} -- cgit v1.2.3