diff options
Diffstat (limited to 'test/internal/testucs.c')
-rw-r--r-- | test/internal/testucs.c | 348 |
1 files changed, 348 insertions, 0 deletions
diff --git a/test/internal/testucs.c b/test/internal/testucs.c new file mode 100644 index 0000000..bf8874e --- /dev/null +++ b/test/internal/testucs.c @@ -0,0 +1,348 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "apr.h" +#include "arch/win32/apr_arch_utf8.h" +#include <wchar.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +struct testval { + unsigned char n[8]; + apr_size_t nl; + wchar_t w[4]; + apr_size_t wl; +}; + +#ifdef FOR_REFERENCE +/* For reference; a table of invalid utf-8 encoded ucs-2/ucs-4 sequences. + * The table consists of start, end pairs for all invalid ranges. + * NO_UCS2_PAIRS will pass the reservered D800-DFFF values, halting at FFFF + * FULL_UCS4_MAPPER represents all 31 bit values to 7FFF FFFF + * + * We already tested these, because we ensure there is a 1:1 mapping across + * the entire range of byte values in each position of 1 to 6 byte sequences. + */ +struct testval malformed[] = [ + [[0x80,], 1,], /* 10000000 64 invalid leading continuation values */ + [[0xBF,], 1,], /* 10111111 64 invalid leading continuation values */ + [[0xC0,0x80], 2,], /* overshort mapping of 0000 */ + [[0xC1,0xBF], 2,], /* overshort mapping of 007F */ + [[0xE0,0x80,0x80,], 3,], /* overshort mapping of 0000 */ + [[0xE0,0x9F,0xBF,], 3,], /* overshort mapping of 07FF */ +#ifndef NO_UCS2_PAIRS + [[0xED,0xA0,0x80,], 3,], /* unexpected mapping of UCS-2 literal D800 */ + [[0xED,0xBF,0xBF,], 3,], /* unexpected mapping of UCS-2 literal DFFF */ +#endif + [[0xF0,0x80,0x80,0x80,], 4,], /* overshort mapping of 0000 */ + [[0xF0,0x8F,0xBF,0xBF,], 4,], /* overshort mapping of FFFF */ +#ifdef NO_UCS2_PAIRS + [[0xF0,0x90,0x80,0x80,], 4,], /* invalid too large value 0001 0000 */ + [[0xF4,0x8F,0xBF,0xBF,], 4,], /* invalid too large value 0010 FFFF */ +#endif +#ifndef FULL_UCS4_MAPPER + [[0xF4,0x90,0x80,0x80,], 4,], /* invalid too large value 0011 0000 */ + [[0xF7,0xBF,0xBF,0xBF,], 4,], /* invalid too large value 001F FFFF */ +#endif + [[0xF8,0x80,0x80,0x80,0x80,], 5,], /* overshort mapping of 0000 0000 */ + [[0xF8,0x87,0xBF,0xBF,0xBF,], 5,], /* overshort mapping of 001F FFFF */ +#ifndef FULL_UCS4_MAPPER + [[0xF8,0x88,0x80,0x80,0x80,], 5,], /* invalid too large value 0020 0000 */ + [[0xFB,0xBF,0xBF,0xBF,0xBF,], 5,], /* invalid too large value 03FF FFFF */ +#endif + [[0xFC,0x80,0x80,0x80,0x80,0x80,], 6,], /* overshort mapping 0000 0000 */ + [[0xFC,0x83,0xBF,0xBF,0xBF,0xBF,], 6,], /* overshort mapping 03FF FFFF */ +#ifndef FULL_UCS4_MAPPER + [[0xFC,0x84,0x80,0x80,0x80,0x80,], 6,], /* overshort mapping 0400 0000 */ + [[0xFD,0xBF,0xBF,0xBF,0xBF,0xBF,], 6,], /* overshort mapping 7FFF FFFF */ +#endif + [[0xFE,], 1,], /* 11111110 invalid "too large" value, no 7 byte seq */ + [[0xFF,], 1,], /* 11111111 invalid "too large" value, no 8 byte seq */ +]; +#endif /* FOR_REFERENCE */ + +void displaynw(struct testval *f, struct testval *l) +{ + char x[80], *t = x; + int i; + for (i = 0; i < f->nl; ++i) + t += sprintf(t, "%02X ", f->n[i]); + *(t++) = '-'; + for (i = 0; i < l->nl; ++i) + t += sprintf(t, " %02X", l->n[i]); + *(t++) = ' '; + *(t++) = '='; + *(t++) = ' '; + for (i = 0; i < f->wl; ++i) + t += sprintf(t, "%04X ", f->w[i]); + *(t++) = '-'; + for (i = 0; i < l->wl; ++i) + t += sprintf(t, " %04X", l->w[i]); + *t = '\0'; + puts(x); +} + +/* + * Test every possible byte value. + * If the test passes or fails at this byte value we are done. + * Otherwise iterate test_nrange again, appending another byte. + */ +void test_nrange(struct testval *p) +{ + struct testval f, l, s; + apr_status_t rc; + int success = 0; + + memcpy (&s, p, sizeof(s)); + ++s.nl; + + do { + apr_size_t nl = s.nl, wl = sizeof(s.w) / 2; + rc = apr_conv_utf8_to_ucs2(s.n, &nl, s.w, &wl); + s.wl = (sizeof(s.w) / 2) - wl; + if (!nl && rc == APR_SUCCESS) { + if (!success) { + memcpy(&f, &s, sizeof(s)); + success = -1; + } + else { + if (s.wl != l.wl + || memcmp(s.w, l.w, (s.wl - 1) * 2) != 0 + || s.w[s.wl - 1] != l.w[l.wl - 1] + 1) { + displaynw(&f, &l); + memcpy(&f, &s, sizeof(s)); + } + } + memcpy(&l, &s, sizeof(s)); + } + else { + if (success) { + displaynw(&f, &l); + success = 0; + } + if (rc == APR_INCOMPLETE) { + test_nrange(&s); + } + } + } while (++s.n[s.nl - 1]); + + if (success) { + displaynw(&f, &l); + success = 0; + } +} + +/* + * Test every possible word value. + * Once we are finished, retest every possible word value. + * if the test fails on the following null word, iterate test_nrange + * again, appending another word. + * This assures the output order of the two tests are in sync. + */ +void test_wrange(struct testval *p) +{ + struct testval f, l, s; + apr_status_t rc; + int success = 0; + + memcpy (&s, p, sizeof(s)); + ++s.wl; + + do { + apr_size_t nl = sizeof(s.n), wl = s.wl; + rc = apr_conv_ucs2_to_utf8(s.w, &wl, s.n, &nl); + s.nl = sizeof(s.n) - nl; + if (!wl && rc == APR_SUCCESS) { + if (!success) { + memcpy(&f, &s, sizeof(s)); + success = -1; + } + else { + if (s.nl != l.nl + || memcmp(s.n, l.n, s.nl - 1) != 0 + || s.n[s.nl - 1] != l.n[l.nl - 1] + 1) { + displaynw(&f, &l); + memcpy(&f, &s, sizeof(s)); + } + } + memcpy(&l, &s, sizeof(s)); + } + else { + if (success) { + displaynw(&f, &l); + success = 0; + } + } + } while (++s.w[s.wl - 1]); + + if (success) { + displaynw(&f, &l); + success = 0; + } + + do { + apr_size_t wl = s.wl, nl = sizeof(s.n); + rc = apr_conv_ucs2_to_utf8(s.w, &wl, s.n, &nl); + s.nl = sizeof(s.n) - s.nl; + if (rc == APR_INCOMPLETE) { + test_wrange(&s); + } + } while (++s.w[s.wl - 1]); +} + +/* + * Test every possible byte value. + * If the test passes or fails at this byte value we are done. + * Otherwise iterate test_nrange again, appending another byte. + */ +void test_ranges() +{ + struct testval ntest, wtest; + apr_status_t nrc, wrc; + apr_size_t inlen; + unsigned long matches = 0; + + memset(&ntest, 0, sizeof(ntest)); + ++ntest.nl; + + memset(&wtest, 0, sizeof(wtest)); + ++wtest.wl; + + do { + do { + inlen = ntest.nl; + ntest.wl = sizeof(ntest.w) / 2; + nrc = apr_conv_utf8_to_ucs2(ntest.n, &inlen, ntest.w, &ntest.wl); + if (nrc == APR_SUCCESS) { + ntest.wl = (sizeof(ntest.w) / 2) - ntest.wl; + break; + } + if (nrc == APR_INCOMPLETE) { + ++ntest.nl; + if (ntest.nl > 6) { + printf ("\n\nUnexpected utf8 sequence of >6 bytes;\n"); + exit(255); + } + continue; + } + else { + while (!(++ntest.n[ntest.nl - 1])) { + if (!(--ntest.nl)) + break; + } + } + } while (ntest.nl); + + do { + inlen = wtest.wl; + wtest.nl = sizeof(wtest.n); + wrc = apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl); + if (wrc == APR_SUCCESS) { + wtest.nl = sizeof(wtest.n) - wtest.nl; + break; + } + else { + if (!(++wtest.w[wtest.wl - 1])) { + if (wtest.wl == 1) + ++wtest.wl; + else + ++wtest.w[0]; + + /* On the second pass, ensure lead word is incomplete */ + do { + inlen = 1; + wtest.nl = sizeof(wtest.n); + if (apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl) + == APR_INCOMPLETE) + break; + if (!(++wtest.w[0])) { + wtest.wl = 0; + break; + } + } while (1); + } + } + } while (wtest.wl); + + if (!ntest.nl && !wtest.wl) + break; + + /* Identical? */ + if ((wtest.nl != ntest.nl) + || (memcmp(wtest.n, ntest.n, ntest.nl) != 0) + || (wtest.wl != ntest.wl) + || (memcmp(ntest.w, wtest.w, wtest.wl * 2) != 0)) { + printf ("\n\nMismatch of w/n conversion at;\n"); + displaynw(&ntest, &wtest); + exit(255); + } + ++matches; + + while (!(++ntest.n[ntest.nl - 1])) { + if (!(--ntest.nl)) + break; + } + + if (!(++wtest.w[wtest.wl - 1])) { + if (wtest.wl == 1) + ++wtest.wl; + else + ++wtest.w[0]; + + /* On the second pass, ensure lead word is incomplete */ + do { + inlen = 1; + wtest.nl = sizeof(wtest.n); + if (apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl) + == APR_INCOMPLETE) + break; + if (!(++wtest.w[0])) { + wtest.wl = 0; + break; + } + } while (1); + } + } while (wtest.wl || ntest.nl); + + printf ("\n\nutf8 and ucs2 sequences of %lu transformations matched OK.\n", + matches); +} + +/* + * Syntax: testucs [w|n] + * + * If no arg or arg is not recognized, run equality sequence test. + */ +int main(int argc, char **argv) +{ + struct testval s; + memset (&s, 0, sizeof(s)); + + if (argc >= 2 && apr_tolower(*argv[1]) != 'w') { + printf ("\n\nTesting Narrow Char Ranges\n"); + test_nrange(&s); + } + else if (argc >= 2 && apr_tolower(*argv[1]) != 'n') { + printf ("\n\nTesting Wide Char Ranges\n"); + test_wrange(&s); + } + else { + test_ranges(); + } + return 0; +} |