summaryrefslogtreecommitdiffstats
path: root/src/utf8proc/bench
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/utf8proc/bench
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/utf8proc/bench')
-rw-r--r--src/utf8proc/bench/Makefile39
-rw-r--r--src/utf8proc/bench/bench.c56
-rw-r--r--src/utf8proc/bench/icu.c61
-rw-r--r--src/utf8proc/bench/unistring.c60
-rw-r--r--src/utf8proc/bench/util.c39
-rw-r--r--src/utf8proc/bench/util.h22
6 files changed, 277 insertions, 0 deletions
diff --git a/src/utf8proc/bench/Makefile b/src/utf8proc/bench/Makefile
new file mode 100644
index 000000000..ea12dcbba
--- /dev/null
+++ b/src/utf8proc/bench/Makefile
@@ -0,0 +1,39 @@
+CURL=curl
+# compiler and flags shared by all benchmark programs
+CC = cc
+CFLAGS = -O2 -std=c99 -pedantic -Wall
+
+all: bench
+# utf8proc object from the parent directory, linked directly into bench
+LIBUTF8PROC = ../utf8proc.o
+
+bench: bench.o util.o $(LIBUTF8PROC)
+ $(CC) $(CFLAGS) $(LDFLAGS) -o $@ bench.o util.o $(LIBUTF8PROC)
+# benchmark input texts; each one is downloaded from DATAURL with $(CURL)
+DATAURL = https://raw.githubusercontent.com/duerst/eprun/master/benchmark
+DATAFILES = Deutsch_.txt Japanese_.txt Korean_.txt Vietnamese_.txt
+
+$(DATAFILES):
+ $(CURL) -O $(DATAURL)/$@
+# bench.out: utf8proc NFKC timings for every data file
+bench.out: $(DATAFILES) bench
+ ./bench -nfkc $(DATAFILES) > $@
+
+# ICU comparison benchmark; you may need make CPPFLAGS=... LDFLAGS=... to help it find ICU
+icu: icu.o util.o
+ $(CC) $(CFLAGS) $(LDFLAGS) -o $@ icu.o util.o -licuuc
+
+icu.out: $(DATAFILES) icu
+ ./icu $(DATAFILES) > $@
+# GNU libunistring comparison benchmark (links against -lunistring)
+unistring: unistring.o util.o
+ $(CC) $(CFLAGS) $(LDFLAGS) -o $@ unistring.o util.o -lunistring
+
+unistring.out: $(DATAFILES) unistring
+ ./unistring $(DATAFILES) > $@
+# suffix rule: build any .o from its .c, adding the parent directory (..) to the include path
+.c.o:
+ $(CC) $(CPPFLAGS) -I.. $(CFLAGS) -c -o $@ $<
+# clean removes objects, binaries, results and every *.txt here, including the downloaded data files
+clean:
+ rm -rf *.o *.txt bench *.out icu unistring
diff --git a/src/utf8proc/bench/bench.c b/src/utf8proc/bench/bench.c
new file mode 100644
index 000000000..4932c6d44
--- /dev/null
+++ b/src/utf8proc/bench/bench.c
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "utf8proc.h"
+#include "util.h"
+/* Benchmark utf8proc_map: every non-option argument is a file that is read and mapped 100 times; exits with failure if a file cannot be read or mapped. */
+int main(int argc, char **argv)
+{
+ int i, j;
+ int options = 0;
+ /* flags accumulate left to right, so an option only affects the files named after it */
+ for (i = 1; i < argc; ++i) {
+ if (!strcmp(argv[i], "-nfkc")) {
+ options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE|UTF8PROC_COMPAT;
+ continue;
+ }
+ if (!strcmp(argv[i], "-nfkd")) {
+ options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE|UTF8PROC_COMPAT;
+ continue;
+ }
+ if (!strcmp(argv[i], "-nfc")) {
+ options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE;
+ continue;
+ }
+ if (!strcmp(argv[i], "-nfd")) {
+ options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE;
+ continue;
+ }
+ if (!strcmp(argv[i], "-casefold")) {
+ options |= UTF8PROC_CASEFOLD;
+ continue;
+ }
+ if (argv[i][0] == '-') {
+ fprintf(stderr, "unrecognized option: %s\n", argv[i]);
+ return EXIT_FAILURE;
+ }
+ /* anything else is a file name: load it completely before starting the clock */
+ size_t len;
+ uint8_t *src = readfile(argv[i], &len);
+ if (!src) {
+ fprintf(stderr, "error reading %s\n", argv[i]);
+ return EXIT_FAILURE;
+ }
+ uint8_t *dest;
+ mytime start = gettime();
+ for (j = 0; j < 100; ++j) {
+ if (utf8proc_map(src, len, &dest, options) < 0) { fprintf(stderr, "error mapping %s\n", argv[i]); return EXIT_FAILURE; } /* negative = error; don't time failed runs */
+ free(dest);
+ }
+ printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); /* mean seconds per utf8proc_map call */
+ free(src);
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/src/utf8proc/bench/icu.c b/src/utf8proc/bench/icu.c
new file mode 100644
index 000000000..3ac351415
--- /dev/null
+++ b/src/utf8proc/bench/icu.c
@@ -0,0 +1,61 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+/* ICU4C */
+#include <unicode/utypes.h>
+#include <unicode/ustring.h>
+#include <unicode/ucnv.h>
+#include <unicode/unorm2.h>
+
+#include "util.h"
+/* Benchmark ICU's NFKC normalization (unorm2_normalize): every argument is a file that is read, converted to UTF-16 and normalized 100 times. */
+int main(int argc, char **argv)
+{
+ int i;
+ /* ICU functions return immediately when the error code already signals failure on entry, so it must start out as U_ZERO_ERROR */
+ UErrorCode err = U_ZERO_ERROR;
+ UConverter *uc = ucnv_open("UTF8", &err);
+ if (U_FAILURE(err)) return EXIT_FAILURE;
+
+ const UNormalizer2 *NFKC = unorm2_getNFKCInstance(&err);
+ if (U_FAILURE(err)) return EXIT_FAILURE;
+
+ for (i = 1; i < argc; ++i) {
+ if (argv[i][0] == '-') {
+ fprintf(stderr, "unrecognized option: %s\n", argv[i]);
+ return EXIT_FAILURE;
+ }
+
+ size_t len;
+ uint8_t *src = readfile(argv[i], &len);
+ if (!src) {
+ fprintf(stderr, "error reading %s\n", argv[i]);
+ return EXIT_FAILURE;
+ }
+
+ /* convert UTF8 data to ICU's UTF16 */
+ UChar *usrc = (UChar*) malloc(2*len * sizeof(UChar));
+ ucnv_toUChars(uc, usrc, 2*len, (char*) src, len, &err);
+ if (U_FAILURE(err)) return EXIT_FAILURE;
+ size_t ulen = u_strlen(usrc);
+
+ /* ICU's insane normalization API requires you to
+ know the size of the destination buffer in advance,
+ or alternatively to repeatedly try normalizing and
+ double the buffer size until it succeeds. Here, I just
+ allocate a huge destination buffer to avoid the issue. */
+ UChar *udest = (UChar*) malloc(10*ulen * sizeof(UChar));
+
+ mytime start = gettime();
+ for (int k = 0; k < 100; ++k) { /* own counter: `i` is still needed for argv[i] below */
+ unorm2_normalize(NFKC, usrc, ulen, udest, 10*ulen, &err);
+ if (U_FAILURE(err)) return EXIT_FAILURE;
+ }
+ printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); /* mean seconds per normalization */
+ free(udest);
+ free(usrc);
+ free(src);
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/src/utf8proc/bench/unistring.c b/src/utf8proc/bench/unistring.c
new file mode 100644
index 000000000..2cc5ce8d9
--- /dev/null
+++ b/src/utf8proc/bench/unistring.c
@@ -0,0 +1,60 @@
+/* comparative benchmark of GNU libunistring */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* libunistring */
+#include <unistr.h>
+#include <uninorm.h>
+
+#include "util.h"
+/* Benchmark libunistring's u8_normalize: every non-option argument is a file that is read and normalized 100 times. */
+int main(int argc, char **argv)
+{
+ int i;
+ uninorm_t nf = UNINORM_NFKC;
+ /* NFKC is the default; -nfkc/-nfkd/-nfc/-nfd replace the form used for the files named after them */
+ for (i = 1; i < argc; ++i) {
+ if (!strcmp(argv[i], "-nfkc")) {
+ nf = UNINORM_NFKC;
+ continue;
+ }
+ if (!strcmp(argv[i], "-nfkd")) {
+ nf = UNINORM_NFKD;
+ continue;
+ }
+ if (!strcmp(argv[i], "-nfc")) {
+ nf = UNINORM_NFC;
+ continue;
+ }
+ if (!strcmp(argv[i], "-nfd")) {
+ nf = UNINORM_NFD;
+ continue;
+ }
+ if (argv[i][0] == '-') {
+ fprintf(stderr, "unrecognized option: %s\n", argv[i]);
+ return EXIT_FAILURE;
+ }
+ /* anything else is a file name: load it completely before starting the clock */
+ size_t len;
+ uint8_t *src = readfile(argv[i], &len);
+ if (!src) {
+ fprintf(stderr, "error reading %s\n", argv[i]);
+ return EXIT_FAILURE;
+ }
+ /* NULL result buffer: u8_normalize hands back a buffer that is freed on every iteration (presumably freshly allocated; confirm in the libunistring docs) */
+ size_t destlen;
+ uint8_t *dest;
+ mytime start = gettime();
+ for (int i = 0; i < 100; ++i) { /* NOTE(review): this `i` shadows the argv index; argv[i] below uses the outer one */
+ dest = u8_normalize(nf, src, len, NULL, &destlen);
+ if (!dest) return EXIT_FAILURE;
+ free(dest);
+ }
+ printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); /* mean seconds per normalization */
+ free(src);
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/src/utf8proc/bench/util.c b/src/utf8proc/bench/util.c
new file mode 100644
index 000000000..6130af57f
--- /dev/null
+++ b/src/utf8proc/bench/util.c
@@ -0,0 +1,39 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+
+#include "util.h"
+
+/* Read the whole file FILENAME into a malloc'd buffer of *len bytes that the
+ caller must free; returns NULL on error (*len stays 0 if stat itself fails). */
+uint8_t *readfile(const char *filename, size_t *len)
+{
+ *len = 0;
+ struct stat st;
+ if (0 != stat(filename, &st)) return NULL;
+ *len = st.st_size;
+ FILE *f = fopen(filename, "rb"); /* binary mode: fread must deliver exactly st_size raw bytes */
+ if (!f) return NULL;
+ uint8_t *s = (uint8_t *) malloc(sizeof(uint8_t) * *len);
+ if (!s) { fclose(f); return NULL; } /* don't leak the stream when allocation fails */
+ if (fread(s, 1, *len, f) != *len) {
+ free(s);
+ s = NULL;
+ }
+ fclose(f);
+ return s;
+}
+/* current time of day as reported by gettimeofday */
+mytime gettime(void) {
+ mytime now;
+ gettimeofday(&now, NULL);
+ return now;
+}
+
+/* seconds from t0 to t1: whole-second difference plus the microsecond difference scaled by 1e-6 */
+double elapsed(mytime t1, mytime t0)
+{
+ const double whole = (double)(t1.tv_sec - t0.tv_sec);
+ return whole + (double)(t1.tv_usec - t0.tv_usec) * 1.0E-6;
+}
+
diff --git a/src/utf8proc/bench/util.h b/src/utf8proc/bench/util.h
new file mode 100644
index 000000000..b178d3bb1
--- /dev/null
+++ b/src/utf8proc/bench/util.h
@@ -0,0 +1,22 @@
+#ifndef UTIL_H
+#define UTIL_H 1
+
+#include <inttypes.h>
+#include <sys/time.h>
+#include <time.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* read all of FILENAME into a malloc'd buffer; *len receives the byte count; returns NULL on error */
+uint8_t *readfile(const char *filename, size_t *len);
+/* mytime is a struct timeval timestamp; gettime() samples it and elapsed(t1, t0) is t1 - t0 in seconds */
+typedef struct timeval mytime;
+mytime gettime(void);
+double elapsed(mytime t1, mytime t0);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* UTIL_H */