diff options
Diffstat (limited to '')
-rw-r--r-- | src/utf8proc/test/case.c | 73 | ||||
-rw-r--r-- | src/utf8proc/test/charwidth.c | 77 | ||||
-rw-r--r-- | src/utf8proc/test/custom.c | 28 | ||||
-rw-r--r-- | src/utf8proc/test/graphemetest.c | 74 | ||||
-rw-r--r-- | src/utf8proc/test/iterate.c | 166 | ||||
-rw-r--r-- | src/utf8proc/test/misc.c | 46 | ||||
-rw-r--r-- | src/utf8proc/test/normtest.c | 64 | ||||
-rw-r--r-- | src/utf8proc/test/printproperty.c | 60 | ||||
-rw-r--r-- | src/utf8proc/test/tests.c | 46 | ||||
-rw-r--r-- | src/utf8proc/test/tests.h | 23 | ||||
-rw-r--r-- | src/utf8proc/test/valid.c | 41 |
11 files changed, 698 insertions, 0 deletions
diff --git a/src/utf8proc/test/case.c b/src/utf8proc/test/case.c
new file mode 100644
index 000000000..b3947e2ad
--- /dev/null
+++ b/src/utf8proc/test/case.c
@@ -0,0 +1,73 @@
+#include "tests.h"
+#include <wctype.h>
+
+int main(int argc, char **argv)
+{
+    int error = 0, better = 0; /* hard failures / mappings the OS tables lack */
+    utf8proc_int32_t c;
+
+    (void) argc; /* unused */
+    (void) argv; /* unused */
+
+    /* some simple sanity tests of the case mappings */
+    for (c = 0; c <= 0x110000; ++c) {
+        utf8proc_int32_t l = utf8proc_tolower(c);
+        utf8proc_int32_t u = utf8proc_toupper(c);
+        utf8proc_int32_t t = utf8proc_totitle(c);
+
+        check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
+        check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
+        check(t == c || utf8proc_codepoint_valid(t), "invalid totitle");
+
+        if (utf8proc_codepoint_valid(c) && (l == u) != (l == t) &&
+            /* Unicode 11: Georgian Mkhedruli chars have uppercase but no titlecase. */
+            !(((c >= 0x10d0 && c <= 0x10fa) || (c >= 0x10fd && c <= 0x10ff)) && l != u)) {
+            fprintf(stderr, "unexpected titlecase %x for lowercase %x / uppercase %x\n", t, l, u);
+            ++error;
+        }
+
+        if (sizeof(wint_t) > 2 || c < (1<<16)) { /* skip codepoints a 16-bit wint_t cannot hold */
+            wint_t l0 = towlower(c), u0 = towupper(c);
+
+            /* OS unicode tables may be out of date.  But if they
+               do have a lower/uppercase mapping, hopefully it
+               is correct? */
+            if (l0 != c && l0 != l) {
+                fprintf(stderr, "MISMATCH %x != towlower(%x) == %x\n",
+                        l, c, l0);
+                ++error;
+            }
+            else if (l0 != l) { /* often true for out-of-date OS unicode */
+                ++better;
+                /* printf("%x != towlower(%x) == %x\n", l, c, l0); */
+            }
+            if (u0 != c && u0 != u) {
+                fprintf(stderr, "MISMATCH %x != towupper(%x) == %x\n",
+                        u, c, u0);
+                ++error;
+            }
+            else if (u0 != u) { /* often true for out-of-date OS unicode */
+                ++better;
+                /* printf("%x != towupper(%x) == %x\n", u, c, u0); */
+            }
+        }
+    }
+    check(!error, "utf8proc case conversion FAILED %d tests.", error);
+
+    /* issue #130 */
+    check(utf8proc_toupper(0x00df) == 0x1e9e &&
+          utf8proc_totitle(0x00df) == 0x1e9e &&
+          utf8proc_tolower(0x00df) == 0x00df &&
+          utf8proc_tolower(0x1e9e) == 0x00df &&
+          utf8proc_toupper(0x1e9e) == 0x1e9e,
+          "incorrect 0x00df/0x1e9e case conversions");
+    utf8proc_uint8_t str_00df[] = {0xc3, 0x9f, 0x00};
+    utf8proc_uint8_t str_1e9e[] = {0xe1, 0xba, 0x9e, 0x00};
+    check(!strcmp((char*)utf8proc_NFKC_Casefold(str_00df), "ss") &&
+          !strcmp((char*)utf8proc_NFKC_Casefold(str_1e9e), "ss"),
+          "incorrect 0x00df/0x1e9e casefold normalization");
+
+    printf("More up-to-date than OS unicode tables for %d tests.\n", better);
+    printf("utf8proc case conversion tests SUCCEEDED.\n");
+    return 0;
+}
diff --git a/src/utf8proc/test/charwidth.c b/src/utf8proc/test/charwidth.c
new file mode 100644
index 000000000..c5cbbd7cd
--- /dev/null
+++ b/src/utf8proc/test/charwidth.c
@@ -0,0 +1,77 @@
+#include "tests.h"
+#include <ctype.h>
+#include <wchar.h>
+
+static int my_unassigned(int c) { /* nonzero if c has category Cn or Co */
+    int cat = utf8proc_get_property(c)->category;
+    return (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
+}
+
+static int my_isprint(int c) { /* nonzero if c is expected to have a positive width */
+    int cat = utf8proc_get_property(c)->category;
+    return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
+           (c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd || c == 0x00ad) ||
+           (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
+}
+
+int main(int argc, char **argv)
+{
+    int c, error = 0, updates = 0;
+
+    (void) argc; /* unused */
+    (void) argv; /* unused */
+
+    /* some simple sanity tests of the character widths */
+    for (c = 0; c <= 0x110000; ++c) {
+        int cat = utf8proc_get_property(c)->category;
+        int w = utf8proc_charwidth(c);
+        if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
+            fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
+            error += 1;
+        }
+        if (w == 0 &&
+            ((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
+             (cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
+             (cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
+            fprintf(stderr, "zero width for symbol-like char %x\n", c);
+            error += 1;
+        }
+        if (c <= 127 && ((!isprint(c) && w > 0) || (isprint(c) && wcwidth(c) != w))) {
+            fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
+                    wcwidth(c), w,
+                    isprint(c) ? "printable" : "non-printable", c);
+            error += 1;
+        }
+        if (!my_isprint(c) && w > 0) {
+            fprintf(stderr, "non-printing %x had width %d\n", c, w);
+            error += 1;
+        }
+        if (my_unassigned(c) && w != 1) {
+            fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
+            error += 1;
+        }
+    }
+    check(!error, "utf8proc_charwidth FAILED %d tests.", error);
+
+    check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
+    check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
+
+    /* print some other information by comparing with system wcwidth */
+    printf("Mismatches with system wcwidth (not necessarily errors):\n");
+    for (c = 0; c <= 0x110000; ++c) {
+        int w = utf8proc_charwidth(c);
+        int wc = wcwidth(c);
+        if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
+        /* lots of these errors for out-of-date system unicode tables */
+        if (wc == -1 && my_isprint(c) && !my_unassigned(c) && w > 0)
+            updates += 1;
+        if (wc == -1 && !my_isprint(c) && w > 0)
+            printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
+        if (wc >= 0 && wc != w)
+            printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
+    }
+    printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n", updates);
+    printf("Character-width tests SUCCEEDED.\n");
+
+    return 0;
+}
diff --git a/src/utf8proc/test/custom.c b/src/utf8proc/test/custom.c
new file mode 100644
index 000000000..fe4239d91
--- /dev/null
+++ b/src/utf8proc/test/custom.c
@@ -0,0 +1,28 @@
+#include "tests.h"
+
+static int thunk_test = 1; /* its address is the thunk that custom() must receive */
+
+static utf8proc_int32_t custom(utf8proc_int32_t codepoint, void *thunk)
+{
+    check(((int *) thunk) == &thunk_test, "unexpected thunk passed");
+    if (codepoint == 'a')
+        return 'b';
+    if (codepoint == 'S')
+        return 0x00df; /* ß */
+    return codepoint;
+}
+
+int main(void)
+{
+    utf8proc_uint8_t input[] = {0x41,0x61,0x53,0x62,0xef,0xbd,0x81,0x00}; /* "AaSb\uff41" */
+    utf8proc_uint8_t correct[] = {0x61,0x62,0x73,0x73,0x62,0x61,0x00}; /* "abssba" */
+    utf8proc_uint8_t *output;
+    check(utf8proc_map_custom(input, 0, &output, UTF8PROC_CASEFOLD | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_NULLTERM,
+                              custom, &thunk_test) >= 0, "utf8proc_map_custom failed"); /* never touch output after a failed map */
+    printf("mapped \"%s\" -> \"%s\"\n", (char*)input, (char*)output);
+    check(strlen((char*) output) == 6, "incorrect output length");
+    check(!memcmp(correct, output, 7), "incorrect output data");
+    free(output);
+    printf("map_custom tests SUCCEEDED.\n");
+    return 0;
+}
diff --git a/src/utf8proc/test/graphemetest.c b/src/utf8proc/test/graphemetest.c
new file mode 100644
index 000000000..eb3645b9a
--- /dev/null
+++ b/src/utf8proc/test/graphemetest.c
@@ -0,0 +1,74 @@
+#include "tests.h"
+
+int main(int argc, char **argv)
+{
+    char *buf = NULL;
+    size_t bufsize = 0;
+    FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
+    utf8proc_uint8_t src[1024]; /* expected UTF-8 with '/' marking each grapheme break */
+    int len;
+
+    check(f != NULL, "error opening GraphemeBreakTest.txt");
+    while (getline(&buf, &bufsize, f) > 0) {
+        size_t bi = 0, si = 0;
+        lineno += 1;
+
+        if (lineno % 100 == 0)
+            printf("checking line %zu...\n", lineno);
+
+        if (buf[0] == '#') continue;
+
+        while (buf[bi]) {
+            bi = skipspaces(buf, bi);
+            if (buf[bi] == '/') { /* grapheme break */
+                src[si++] = '/';
+                bi++;
+            }
+            else if (buf[bi] == '+') { /* no break */
+                bi++;
+            }
+            else if (buf[bi] == '#') { /* start of comments */
+                break;
+            }
+            else { /* hex-encoded codepoint */
+                len = encode((char*) (src + si), buf + bi) - 1; /* -1: stay on the char that ended the hex run */
+                while (src[si]) ++si; /* advance to NUL termination */
+                bi += len;
+            }
+        }
+        if (si && src[si-1] == '/')
+            --si; /* no break after final grapheme */
+        src[si] = 0; /* NUL-terminate */
+
+        if (si) {
+            utf8proc_uint8_t utf8[1024]; /* copy src without the '/' grapheme separators */
+            size_t i = 0, j = 0;
+            utf8proc_ssize_t glen;
+            utf8proc_uint8_t *g = NULL; /* utf8proc_map grapheme results */
+            while (i < si) {
+                if (src[i] != '/')
+                    utf8[j++] = src[i++];
+                else
+                    i++;
+            }
+            glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
+            if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
+                /* the test file contains surrogate codepoints, which are only for UTF-16 */
+                printf("line %zu: ignoring invalid UTF-8 codepoints\n", lineno);
+            }
+            else {
+                check(glen >= 0, "utf8proc_map error = %s",
+                      utf8proc_errmsg(glen));
+                for (i = 0; i <= (size_t) glen; ++i) /* glen >= 0 was checked above */
+                    if (g[i] == 0xff)
+                        g[i] = '/'; /* easier-to-read output (/ is not in test strings) */
+                check(!strcmp((char*)g, (char*)src),
+                      "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
+            }
+            free(g);
+        }
+    }
+    fclose(f);
+    printf("Passed tests after %zu lines!\n", lineno);
+    return 0;
+}
diff --git a/src/utf8proc/test/iterate.c b/src/utf8proc/test/iterate.c
new file mode 100644
index 000000000..c1674b799
--- /dev/null
+++ b/src/utf8proc/test/iterate.c
@@ -0,0 +1,166 @@
+#include "tests.h"
+#include <ctype.h>
+#include <wchar.h>
+
+static int tests; /* number of testbytes() calls made */
+static int error; /* number of testbytes() calls whose result was unexpected */
+
+#define CHECKVALID(pos, val, len) do { buf[pos] = (val); testbytes(buf,len,len,__LINE__); } while (0)
+#define CHECKINVALID(pos, val, len) do { buf[pos] = (val); testbytes(buf,len,UTF8PROC_ERROR_INVALIDUTF8,__LINE__); } while (0)
+/* Runs utf8proc_iterate on the first len bytes of buf; counts an error unless it returns retval. */
+static void testbytes(unsigned char *buf, int len, utf8proc_ssize_t retval, int line)
+{
+    utf8proc_int32_t out[16];
+    utf8proc_ssize_t ret;
+
+    /* Make a copy to ensure that memory is left uninitialized after "len"
+     * bytes. This way, Valgrind can detect overreads.
+     */
+    unsigned char tmp[16];
+    memcpy(tmp, buf, len);
+
+    tests++;
+    if ((ret = utf8proc_iterate(tmp, len, out)) != retval) {
+        fprintf(stderr, "Failed (%d):", line);
+        for (int i = 0; i < len ; i++) {
+            fprintf(stderr, " 0x%02x", tmp[i]);
+        }
+        fprintf(stderr, " -> %zd\n", ret);
+        error++;
+    }
+}
+
+int main(void)
+{
+    uint32_t byt;
+    unsigned char buf[16];
+
+    tests = error = 0;
+
+    // Check valid sequences that were considered invalid erroneously before
+    buf[0] = 0xef;
+    buf[1] = 0xb7;
+    for (byt = 0x90; byt < 0xa0; byt++) {
+        CHECKVALID(2, byt, 3);
+    }
+    // Check 0xfffe and 0xffff
+    buf[1] = 0xbf;
+    CHECKVALID(2, 0xbe, 3);
+    CHECKVALID(2, 0xbf, 3);
+    // Check 0x??fffe & 0x??ffff
+    for (byt = 0x1fffe; byt < 0x110000; byt += 0x10000) {
+        buf[0] = 0xf0 | (byt >> 18);
+        buf[1] = 0x80 | ((byt >> 12) & 0x3f);
+        CHECKVALID(3, 0xbe, 4);
+        CHECKVALID(3, 0xbf, 4);
+    }
+
+    // Continuation byte not after lead
+    for (byt = 0x80; byt < 0xc0; byt++) {
+        CHECKINVALID(0, byt, 1);
+    }
+
+    // Continuation byte not after lead (NOTE(review): exact duplicate of the loop above)
+    for (byt = 0x80; byt < 0xc0; byt++) {
+        CHECKINVALID(0, byt, 1);
+    }
+
+    // Test lead bytes
+    for (byt = 0xc0; byt <= 0xff; byt++) {
+        // Single lead byte at end of string
+        CHECKINVALID(0, byt, 1);
+        // Lead followed by non-continuation character < 0x80
+        CHECKINVALID(1, 65, 2);
+        // Lead followed by non-continuation character > 0xbf
+        CHECKINVALID(1, 0xc0, 2);
+    }
+
+    // Test overlong 2-byte
+    buf[0] = 0xc0;
+    for (byt = 0x81; byt <= 0xbf; byt++) {
+        CHECKINVALID(1, byt, 2);
+    }
+    buf[0] = 0xc1;
+    for (byt = 0x80; byt <= 0xbf; byt++) {
+        CHECKINVALID(1, byt, 2);
+    }
+
+    // Test overlong 3-byte
+    buf[0] = 0xe0;
+    buf[2] = 0x80;
+    for (byt = 0x80; byt <= 0x9f; byt++) {
+        CHECKINVALID(1, byt, 3);
+    }
+
+    // Test overlong 4-byte
+    buf[0] = 0xf0;
+    buf[2] = 0x80;
+    buf[3] = 0x80;
+    for (byt = 0x80; byt <= 0x8f; byt++) {
+        CHECKINVALID(1, byt, 4);
+    }
+
+    // Test 4-byte > 0x10ffff
+    buf[0] = 0xf4;
+    buf[2] = 0x80;
+    buf[3] = 0x80;
+    for (byt = 0x90; byt <= 0xbf; byt++) {
+        CHECKINVALID(1, byt, 4);
+    }
+    buf[1] = 0x80;
+    for (byt = 0xf5; byt <= 0xf7; byt++) {
+        CHECKINVALID(0, byt, 4);
+    }
+
+    // Test 5-byte
+    buf[4] = 0x80;
+    for (byt = 0xf8; byt <= 0xfb; byt++) {
+        CHECKINVALID(0, byt, 5);
+    }
+
+    // Test 6-byte
+    buf[5] = 0x80;
+    for (byt = 0xfc; byt <= 0xfd; byt++) {
+        CHECKINVALID(0, byt, 6);
+    }
+
+    // Test 7-byte
+    buf[6] = 0x80;
+    CHECKINVALID(0, 0xfe, 7);
+
+    // Three and above byte sequences
+    for (byt = 0xe0; byt < 0xf0; byt++) {
+        // Lead followed by only 1 continuation byte
+        CHECKINVALID(0, byt, 2);
+        // Lead ended by non-continuation character < 0x80
+        CHECKINVALID(2, 65, 3);
+        // Lead ended by non-continuation character > 0xbf
+        CHECKINVALID(2, 0xc0, 3);
+    }
+
+    // 3-byte encoded surrogate character(s)
+    buf[0] = 0xed; buf[2] = 0x80;
+    // Single surrogate
+    CHECKINVALID(1, 0xa0, 3);
+    // Trailing surrogate first
+    CHECKINVALID(1, 0xb0, 3);
+
+    // Four byte sequences
+    buf[1] = 0x80;
+    for (byt = 0xf0; byt < 0xf5; byt++) {
+        // Lead followed by only 1 continuation bytes
+        CHECKINVALID(0, byt, 2);
+        // Lead followed by only 2 continuation bytes
+        CHECKINVALID(0, byt, 3);
+        // Lead followed by non-continuation character < 0x80
+        CHECKINVALID(3, 65, 4);
+        // Lead followed by non-continuation character > 0xbf
+        CHECKINVALID(3, 0xc0, 4);
+
+    }
+
+    check(!error, "utf8proc_iterate FAILED %d tests out of %d", error, tests);
+    printf("utf8proc_iterate tests SUCCEEDED, (%d) tests passed.\n", tests);
+
+    return 0;
+}
diff --git a/src/utf8proc/test/misc.c b/src/utf8proc/test/misc.c
new file mode 100644
index 000000000..56d81d549
--- /dev/null
+++ b/src/utf8proc/test/misc.c
@@ -0,0 +1,46 @@
+/* Miscellaneous tests, e.g. regression tests */
+
+#include "tests.h"
+
+static void issue128(void) /* #128 */
+{
+    utf8proc_uint8_t input[] = {0x72, 0xcc, 0x87, 0xcc, 0xa3, 0x00}; /* "r\u0307\u0323" */
+    utf8proc_uint8_t nfc[] = {0xe1, 0xb9, 0x9b, 0xcc, 0x87, 0x00}; /* "\u1E5B\u0307" */
+    utf8proc_uint8_t nfd[] = {0x72, 0xcc, 0xa3, 0xcc, 0x87, 0x00}; /* "r\u0323\u0307" */
+    utf8proc_uint8_t *nfc_out, *nfd_out;
+    nfc_out = utf8proc_NFC(input);
+    printf("NFC \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)nfc_out, (char*)nfc);
+    check(strlen((char*) nfc_out) == 5, "incorrect nfc length");
+    check(!memcmp(nfc, nfc_out, 6), "incorrect nfc data");
+    nfd_out = utf8proc_NFD(input);
+    printf("NFD \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)nfd_out, (char*)nfd);
+    check(strlen((char*) nfd_out) == 5, "incorrect nfd length");
+    check(!memcmp(nfd, nfd_out, 6), "incorrect nfd data");
+    free(nfd_out); free(nfc_out);
+}
+
+static void issue102(void) /* #102 */
+{
+    utf8proc_uint8_t input[] = {0x58, 0xe2, 0x81, 0xa5, 0x45, 0xcc, 0x80, 0xc2, 0xad, 0xe1, 0xb4, 0xac, 0x00}; /* "X\u2065E\u0300\u00ad\u1d2c" */
+    utf8proc_uint8_t stripna[] = {0x78, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u00e8a" */
+    utf8proc_uint8_t correct[] = {0x78, 0xe2, 0x81, 0xa5, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u2065\u00e8a" */
+    utf8proc_uint8_t *output;
+    utf8proc_map(input, 0, &output, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
+        UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE | UTF8PROC_STRIPNA);
+    printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)stripna);
+    check(strlen((char*) output) == 4, "incorrect NFKC_Casefold+stripna length");
+    check(!memcmp(stripna, output, 5), "incorrect NFKC_Casefold+stripna data");
+    free(output);
+    output = utf8proc_NFKC_Casefold(input);
+    printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)correct);
+    check(strlen((char*) output) == 7, "incorrect NFKC_Casefold length");
+    check(!memcmp(correct, output, 8), "incorrect NFKC_Casefold data");
+}
+
+int main(void)
+{
+    issue128();
+    issue102();
+    printf("Misc tests SUCCEEDED.\n");
+    return 0;
+}
diff --git a/src/utf8proc/test/normtest.c b/src/utf8proc/test/normtest.c
new file mode 100644
index 000000000..555c14c84
--- /dev/null
+++ b/src/utf8proc/test/normtest.c
@@ -0,0 +1,64 @@
+#include "tests.h"
+
+#define CHECK_NORM(NRM, norm, src) do {                                 \
+    char *src_norm = (char*) utf8proc_ ## NRM((utf8proc_uint8_t*) src); \
+    check(src_norm && !strcmp(norm, src_norm),                          \
+          "normalization failed for %s -> %s", src, norm);              \
+    free(src_norm);                                                     \
+} while (0)
+
+int main(int argc, char **argv)
+{
+    char *buf = NULL;
+    size_t bufsize = 0;
+    FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
+    char source[1024], NFC[1024], NFD[1024], NFKC[1024], NFKD[1024];
+
+    check(f != NULL, "error opening NormalizationTest.txt");
+    while (getline(&buf, &bufsize, f) > 0) {
+        size_t offset;
+        lineno += 1;
+
+        if (buf[0] == '@') {
+            printf("line %zu: %s", lineno, buf + 1);
+            continue;
+        }
+        else if (lineno % 1000 == 0)
+            printf("checking line %zu...\n", lineno);
+
+        if (buf[0] == '#') continue;
+
+        offset = encode(source, buf); /* each encode() also consumes the char that ended its field */
+        offset += encode(NFC, buf + offset);
+        offset += encode(NFD, buf + offset);
+        offset += encode(NFKC, buf + offset);
+        offset += encode(NFKD, buf + offset);
+
+        CHECK_NORM(NFC, NFC, source);
+        CHECK_NORM(NFC, NFC, NFC);
+        CHECK_NORM(NFC, NFC, NFD);
+        CHECK_NORM(NFC, NFKC, NFKC);
+        CHECK_NORM(NFC, NFKC, NFKD);
+
+        CHECK_NORM(NFD, NFD, source);
+        CHECK_NORM(NFD, NFD, NFC);
+        CHECK_NORM(NFD, NFD, NFD);
+        CHECK_NORM(NFD, NFKD, NFKC);
+        CHECK_NORM(NFD, NFKD, NFKD);
+
+        CHECK_NORM(NFKC, NFKC, source);
+        CHECK_NORM(NFKC, NFKC, NFC);
+        CHECK_NORM(NFKC, NFKC, NFD);
+        CHECK_NORM(NFKC, NFKC, NFKC);
+        CHECK_NORM(NFKC, NFKC, NFKD);
+
+        CHECK_NORM(NFKD, NFKD, source);
+        CHECK_NORM(NFKD, NFKD, NFC);
+        CHECK_NORM(NFKD, NFKD, NFD);
+        CHECK_NORM(NFKD, NFKD, NFKC);
+        CHECK_NORM(NFKD, NFKD, NFKD);
+    }
+    fclose(f);
+    printf("Passed tests after %zu lines!\n", lineno);
+    return 0;
+}
diff --git a/src/utf8proc/test/printproperty.c b/src/utf8proc/test/printproperty.c
new file mode 100644
index 000000000..4017eac4c
--- /dev/null
+++ b/src/utf8proc/test/printproperty.c
@@ -0,0 +1,60 @@
+/* simple test program to print out the utf8proc properties for a codepoint */
+
+#include "tests.h"
+
+int main(int argc, char **argv)
+{
+    int i;
+
+    for (i = 1; i < argc; ++i) {
+        utf8proc_uint8_t cstr[16], *map;
+        unsigned int c;
+        if (!strcmp(argv[i], "-V")) {
+            printf("utf8proc version %s\n", utf8proc_version());
+            continue;
+        }
+        check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
+        const utf8proc_property_t *p = utf8proc_get_property(c);
+
+        if (utf8proc_codepoint_valid(c))
+            cstr[utf8proc_encode_char(c, cstr)] = 0;
+        else
+            strcpy((char*)cstr, "N/A"); /* cstr is uninitialized here, so it must be overwritten, not appended to */
+        utf8proc_map(cstr, 0, &map, UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD);
+
+        printf("U+%s: %s\n"
+               " category = %s\n"
+               " combining_class = %d\n"
+               " bidi_class = %d\n"
+               " decomp_type = %d\n"
+               " uppercase_mapping = %x\n"
+               " lowercase_mapping = %x\n"
+               " titlecase_mapping = %x\n"
+               " casefold = %s\n"
+               " comb_index = %d\n"
+               " bidi_mirrored = %d\n"
+               " comp_exclusion = %d\n"
+               " ignorable = %d\n"
+               " control_boundary = %d\n"
+               " boundclass = %d\n"
+               " charwidth = %d\n",
+               argv[i], (char*) cstr,
+               utf8proc_category_string(c),
+               p->combining_class,
+               p->bidi_class,
+               p->decomp_type,
+               utf8proc_toupper(c),
+               utf8proc_tolower(c),
+               utf8proc_totitle(c),
+               (char *) map,
+               p->comb_index,
+               p->bidi_mirrored,
+               p->comp_exclusion,
+               p->ignorable,
+               p->control_boundary,
+               p->boundclass,
+               utf8proc_charwidth(c));
+        free(map);
+    }
+    return 0;
+}
diff --git a/src/utf8proc/test/tests.c b/src/utf8proc/test/tests.c
new file mode 100644
index 000000000..0fb0da363
--- /dev/null
+++ b/src/utf8proc/test/tests.c
@@ -0,0 +1,46 @@
+/* Common functions for our test programs. */
+
+#include "tests.h"
+
+size_t lineno = 0; /* printed as a prefix by check() when it fails */
+/* if cond is false, prints the printf-style message to stderr and exits with status 1 */
+void check(int cond, const char *format, ...)
+{
+    if (!cond) {
+        va_list args;
+        fprintf(stderr, "line %zu: ", lineno);
+        va_start(args, format);
+        vfprintf(stderr, format, args);
+        va_end(args);
+        fprintf(stderr, "\n");
+        exit(1);
+    }
+}
+/* returns the index of the first non-whitespace char of buf at or after index i */
+size_t skipspaces(const char *buf, size_t i)
+{
+    while (isspace((unsigned char) buf[i])) ++i; /* <ctype.h> needs an unsigned char value */
+    return i;
+}
+
+/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
+   separated by whitespace, and terminated by any character not in
+   [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
+   in dest, returning the number of bytes read from buf (terminator included) */
+size_t encode(char *dest, const char *buf)
+{
+    size_t i = 0, j, d = 0;
+    for (;;) {
+        int c;
+        i = skipspaces(buf, i);
+        for (j=i; buf[j] && strchr("0123456789abcdef", tolower((unsigned char) buf[j])); ++j)
+            ; /* find end of hex input */
+        if (j == i) { /* no codepoint found */
+            dest[d] = 0; /* NUL-terminate destination string */
+            return i + 1;
+        }
+        check(sscanf(buf + i, "%x", (unsigned int *)&c) == 1, "invalid hex input %s", buf+i);
+        i = j; /* skip to char after hex input */
+        d += utf8proc_encode_char(c, (utf8proc_uint8_t *) (dest + d));
+    }
+}
diff --git a/src/utf8proc/test/tests.h b/src/utf8proc/test/tests.h
new file mode 100644
index 000000000..1811a734a
--- /dev/null
+++ b/src/utf8proc/test/tests.h
@@ -0,0 +1,23 @@
+/* Common functions and includes for our test programs. */
+
+/*
+ * Set feature macro to enable getline() and wcwidth().
+ *
+ * Please refer to section 2.2.1 of POSIX.1-2008:
+ * http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_02_01_02
+ */
+#define _XOPEN_SOURCE 700
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "../utf8proc.h"
+
+extern size_t lineno;
+
+void check(int cond, const char *format, ...);
+size_t skipspaces(const char *buf, size_t i);
+size_t encode(char *dest, const char *buf);
diff --git a/src/utf8proc/test/valid.c b/src/utf8proc/test/valid.c
new file mode 100644
index 000000000..eadfb8520
--- /dev/null
+++ b/src/utf8proc/test/valid.c
@@ -0,0 +1,41 @@
+#include "tests.h"
+#include <ctype.h>
+#include <wchar.h>
+
+/* Walks four consecutive codepoint ranges and checks utf8proc_codepoint_valid() on each value. */
+int main(int argc, char **argv)
+{
+    int cp, failures = 0;
+
+    (void) argc; /* unused */
+    (void) argv; /* unused */
+
+    /* 0x0000..0xd7ff must be reported as valid */
+    for (cp = 0; cp < 0xd800; ++cp) {
+        if (utf8proc_codepoint_valid(cp)) continue;
+        fprintf(stderr, "Failed: codepoint_valid(%04x) -> false\n", cp);
+        ++failures;
+    }
+    /* 0xd800..0xdfff must be reported as invalid */
+    for (; cp < 0xe000; ++cp) {
+        if (!utf8proc_codepoint_valid(cp)) continue;
+        fprintf(stderr, "Failed: codepoint_valid(%04x) -> true\n", cp);
+        ++failures;
+    }
+    /* 0xe000..0x10ffff must be reported as valid */
+    for (; cp < 0x110000; ++cp) {
+        if (utf8proc_codepoint_valid(cp)) continue;
+        fprintf(stderr, "Failed: codepoint_valid(%06x) -> false\n", cp);
+        ++failures;
+    }
+    /* 0x110000..0x11000f must be reported as invalid */
+    for (; cp < 0x110010; ++cp) {
+        if (!utf8proc_codepoint_valid(cp)) continue;
+        fprintf(stderr, "Failed: codepoint_valid(%06x) -> true\n", cp);
+        ++failures;
+    }
+    check(!failures, "utf8proc_codepoint_valid FAILED %d tests.", failures);
+    printf("Validity tests SUCCEEDED.\n");
+
+    return 0;
+}