From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/utf8proc/bench/Makefile | 39 +++++++++++++++++++++++++++ src/utf8proc/bench/bench.c | 56 ++++++++++++++++++++++++++++++++++++++ src/utf8proc/bench/icu.c | 61 ++++++++++++++++++++++++++++++++++++++++++ src/utf8proc/bench/unistring.c | 60 +++++++++++++++++++++++++++++++++++++++++ src/utf8proc/bench/util.c | 39 +++++++++++++++++++++++++++ src/utf8proc/bench/util.h | 22 +++++++++++++++ 6 files changed, 277 insertions(+) create mode 100644 src/utf8proc/bench/Makefile create mode 100644 src/utf8proc/bench/bench.c create mode 100644 src/utf8proc/bench/icu.c create mode 100644 src/utf8proc/bench/unistring.c create mode 100644 src/utf8proc/bench/util.c create mode 100644 src/utf8proc/bench/util.h (limited to 'src/utf8proc/bench') diff --git a/src/utf8proc/bench/Makefile b/src/utf8proc/bench/Makefile new file mode 100644 index 000000000..ea12dcbba --- /dev/null +++ b/src/utf8proc/bench/Makefile @@ -0,0 +1,39 @@ +CURL=curl + +CC = cc +CFLAGS = -O2 -std=c99 -pedantic -Wall + +all: bench + +LIBUTF8PROC = ../utf8proc.o + +bench: bench.o util.o $(LIBUTF8PROC) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ bench.o util.o $(LIBUTF8PROC) + +DATAURL = https://raw.githubusercontent.com/duerst/eprun/master/benchmark +DATAFILES = Deutsch_.txt Japanese_.txt Korean_.txt Vietnamese_.txt + +$(DATAFILES): + $(CURL) -O $(DATAURL)/$@ + +bench.out: $(DATAFILES) bench + ./bench -nfkc $(DATAFILES) > $@ + +# you may need make CPPFLAGS=... LDFLAGS=... to help it find ICU +icu: icu.o util.o + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ icu.o util.o -licuuc + +icu.out: $(DATAFILES) icu + ./icu $(DATAFILES) > $@ + +unistring: unistring.o util.o + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ unistring.o util.o -lunistring + +unistring.out: $(DATAFILES) unistring + ./unistring $(DATAFILES) > $@ + +.c.o: + $(CC) $(CPPFLAGS) -I.. 
$(CFLAGS) -c -o $@ $< + +clean: + rm -rf *.o *.txt bench *.out icu unistring diff --git a/src/utf8proc/bench/bench.c b/src/utf8proc/bench/bench.c new file mode 100644 index 000000000..4932c6d44 --- /dev/null +++ b/src/utf8proc/bench/bench.c @@ -0,0 +1,56 @@ +#include +#include +#include + +#include "utf8proc.h" +#include "util.h" + +int main(int argc, char **argv) +{ + int i, j; + int options = 0; + + for (i = 1; i < argc; ++i) { + if (!strcmp(argv[i], "-nfkc")) { + options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE|UTF8PROC_COMPAT; + continue; + } + if (!strcmp(argv[i], "-nfkd")) { + options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE|UTF8PROC_COMPAT; + continue; + } + if (!strcmp(argv[i], "-nfc")) { + options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE; + continue; + } + if (!strcmp(argv[i], "-nfd")) { + options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE; + continue; + } + if (!strcmp(argv[i], "-casefold")) { + options |= UTF8PROC_CASEFOLD; + continue; + } + if (argv[i][0] == '-') { + fprintf(stderr, "unrecognized option: %s\n", argv[i]); + return EXIT_FAILURE; + } + + size_t len; + uint8_t *src = readfile(argv[i], &len); + if (!src) { + fprintf(stderr, "error reading %s\n", argv[i]); + return EXIT_FAILURE; + } + uint8_t *dest; + mytime start = gettime(); + for (j = 0; j < 100; ++j) { + utf8proc_map(src, len, &dest, options); + free(dest); + } + printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); + free(src); + } + + return EXIT_SUCCESS; +} diff --git a/src/utf8proc/bench/icu.c b/src/utf8proc/bench/icu.c new file mode 100644 index 000000000..3ac351415 --- /dev/null +++ b/src/utf8proc/bench/icu.c @@ -0,0 +1,61 @@ +#include +#include + +/* ICU4C */ +#include +#include +#include +#include + +#include "util.h" + +int main(int argc, char **argv) +{ + int i; + + UErrorCode err; + UConverter *uc = ucnv_open("UTF8", &err); + if (U_FAILURE(err)) return EXIT_FAILURE; + + const UNormalizer2 *NFKC = unorm2_getNFKCInstance(&err); + if (U_FAILURE(err)) return EXIT_FAILURE; + + for (i = 1; i < 
argc; ++i) { + if (argv[i][0] == '-') { + fprintf(stderr, "unrecognized option: %s\n", argv[i]); + return EXIT_FAILURE; + } + + size_t len; + uint8_t *src = readfile(argv[i], &len); + if (!src) { + fprintf(stderr, "error reading %s\n", argv[i]); + return EXIT_FAILURE; + } + + /* convert UTF8 data to ICU's UTF16 */ + UChar *usrc = (UChar*) malloc(2*len * sizeof(UChar)); + ucnv_toUChars(uc, usrc, 2*len, (char*) src, len, &err); + if (U_FAILURE(err)) return EXIT_FAILURE; + size_t ulen = u_strlen(usrc); + + /* ICU's insane normalization API requires you to + know the size of the destination buffer in advance, + or alternatively to repeatly try normalizing and + double the buffer size until it succeeds. Here, I just + allocate a huge destination buffer to avoid the issue. */ + UChar *udest = (UChar*) malloc(10*ulen * sizeof(UChar)); + + mytime start = gettime(); + for (int i = 0; i < 100; ++i) { + unorm2_normalize(NFKC, usrc, ulen, udest, 10*ulen, &err); + if (U_FAILURE(err)) return EXIT_FAILURE; + } + printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); + free(udest); + free(usrc); + free(src); + } + + return EXIT_SUCCESS; +} diff --git a/src/utf8proc/bench/unistring.c b/src/utf8proc/bench/unistring.c new file mode 100644 index 000000000..2cc5ce8d9 --- /dev/null +++ b/src/utf8proc/bench/unistring.c @@ -0,0 +1,60 @@ +/* comparitive benchmark of GNU libunistring */ + +#include +#include +#include + +/* libunistring */ +#include +#include + +#include "util.h" + +int main(int argc, char **argv) +{ + int i; + uninorm_t nf = UNINORM_NFKC; + + for (i = 1; i < argc; ++i) { + if (!strcmp(argv[i], "-nfkc")) { + nf = UNINORM_NFKC; + continue; + } + if (!strcmp(argv[i], "-nfkd")) { + nf = UNINORM_NFKD; + continue; + } + if (!strcmp(argv[i], "-nfc")) { + nf = UNINORM_NFC; + continue; + } + if (!strcmp(argv[i], "-nfd")) { + nf = UNINORM_NFD; + continue; + } + if (argv[i][0] == '-') { + fprintf(stderr, "unrecognized option: %s\n", argv[i]); + return EXIT_FAILURE; + } + + 
/* Timestamp type used by gettime() and elapsed().  util.h declares the
   same alias; the fallback keeps this translation unit self-contained
   when that header has not been included. */
#ifndef UTIL_H
typedef struct timeval mytime;
#endif

/* Read the file named FILENAME into a newly allocated byte array.
 *
 * On success returns the buffer and stores its size in *len; the caller
 * owns the buffer and releases it with free().  An empty file yields a
 * non-NULL buffer with *len == 0.
 *
 * On any error (stat, open, allocation, or short read) returns NULL and
 * leaves *len set to 0. */
uint8_t *readfile(const char *filename, size_t *len)
{
    struct stat st;
    FILE *f;
    uint8_t *buf;
    size_t size;

    *len = 0;
    if (stat(filename, &st) != 0 || st.st_size < 0) return NULL;
    size = (size_t) st.st_size;

    /* binary mode: the data must not go through newline translation */
    f = fopen(filename, "rb");
    if (!f) return NULL;

    /* malloc(0) may legitimately return NULL, so never request 0 bytes */
    buf = malloc(size ? size : 1);
    if (buf && fread(buf, 1, size, f) != size) {
        free(buf);
        buf = NULL;
    }

    /* single exit past this point, so the stream is closed on every path */
    fclose(f);
    if (buf) *len = size;
    return buf;
}

/* Return the current wall-clock time as reported by gettimeofday(). */
mytime gettime(void)
{
    mytime now;
    gettimeofday(&now, NULL);
    return now;
}

/* Return the time difference t1 - t0 in seconds. */
double elapsed(mytime t1, mytime t0)
{
    double whole = (double) (t1.tv_sec - t0.tv_sec);
    double micro = (double) (t1.tv_usec - t0.tv_usec);
    return whole + micro * 1.0E-6;
}