summaryrefslogtreecommitdiffstats
path: root/src/util/strcasecmp_utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/util/strcasecmp_utf8.c')
-rw-r--r--src/util/strcasecmp_utf8.c216
1 files changed, 216 insertions, 0 deletions
diff --git a/src/util/strcasecmp_utf8.c b/src/util/strcasecmp_utf8.c
new file mode 100644
index 0000000..e3f20df
--- /dev/null
+++ b/src/util/strcasecmp_utf8.c
@@ -0,0 +1,216 @@
+/*++
+/* NAME
+/* strcasecmp_utf8 3
+/* SUMMARY
+/* caseless string comparison
+/* SYNOPSIS
+/* #include <stringops.h>
+/*
+/* int strcasecmp_utf8(
+/* const char *s1,
+/* const char *s2)
+/*
+/* int strncasecmp_utf8(
+/* const char *s1,
+/* const char *s2,
+/* ssize_t len)
+/* AUXILIARY FUNCTIONS
+/* int strcasecmp_utf8x(
+/* int flags,
+/* const char *s1,
+/* const char *s2)
+/*
+/* int strncasecmp_utf8x(
+/* int flags,
+/* const char *s1,
+/* const char *s2,
+/* ssize_t len)
+/* DESCRIPTION
+/* strcasecmp_utf8() implements caseless string comparison for
+/* UTF-8 text, with an API similar to strcasecmp(). Only ASCII
+/* characters are casefolded when the code is compiled without
+/* EAI support or when util_utf8_enable is zero.
+/*
+/* strncasecmp_utf8() implements caseless string comparison
+/* for UTF-8 text, with an API similar to strncasecmp(). Only
+/* ASCII characters are casefolded when the code is compiled
+/* without EAI support or when util_utf8_enable is zero.
+/*
+/* strcasecmp_utf8x() and strncasecmp_utf8x() implement a more
+/* complex API that provides the above functionality and more.
+/*
+/* Arguments:
+/* .IP "s1, s2"
+/* Null-terminated strings to be compared.
+/* .IP len
+/* String length before casefolding.
+/* .IP flags
+/* Zero or CASEF_FLAG_UTF8. The latter flag enables UTF-8 case
+/* folding instead of folding only ASCII characters. This flag
+/* is ignored when compiled without EAI support.
+/* SEE ALSO
+/* casefold(), casefold text for caseless comparison.
+/* LICENSE
+/* .ad
+/* .fi
+/* The Secure Mailer license must be distributed with this software.
+/* AUTHOR(S)
+/* Wietse Venema
+/* IBM T.J. Watson Research
+/* P.O. Box 704
+/* Yorktown Heights, NY 10598, USA
+/*
+/* Wietse Venema
+/* Google, Inc.
+/* 111 8th Avenue
+/* New York, NY 10011, USA
+/*--*/
+
+ /*
+ * System library.
+ */
+#include <sys_defs.h>
+#include <string.h>
+
+#ifdef STRCASECMP_IN_STRINGS_H
+#include <strings.h>
+#endif
+
+ /*
+ * Utility library.
+ */
+#include <stringops.h>
+
+#define STR(x) vstring_str(x)
+
+static VSTRING *f1; /* casefold result for s1 */
+static VSTRING *f2; /* casefold result for s2 */
+
+/* strcasecmp_utf8_init - initialize */
+
+static void strcasecmp_utf8_init(void)
+{
+ f1 = vstring_alloc(100);
+ f2 = vstring_alloc(100);
+}
+
+/* strcasecmp_utf8x - caseless string comparison */
+
+int strcasecmp_utf8x(int flags, const char *s1, const char *s2)
+{
+
+ /*
+ * Short-circuit optimization for ASCII-only text. This may be slower
+ * than using a cache for all results. We must not expose strcasecmp(3)
+ * to non-ASCII text.
+ */
+ if (allascii(s1) && allascii(s2))
+ return (strcasecmp(s1, s2));
+
+ if (f1 == 0)
+ strcasecmp_utf8_init();
+
+ /*
+ * Cross our fingers and hope that strcmp() remains agnostic of
+ * charactersets and locales.
+ */
+ flags &= CASEF_FLAG_UTF8;
+ casefoldx(flags, f1, s1, -1);
+ casefoldx(flags, f2, s2, -1);
+ return (strcmp(STR(f1), STR(f2)));
+}
+
+/* strncasecmp_utf8x - caseless string comparison */
+
+int strncasecmp_utf8x(int flags, const char *s1, const char *s2,
+ ssize_t len)
+{
+
+ /*
+ * Consider using a cache for all results.
+ */
+ if (f1 == 0)
+ strcasecmp_utf8_init();
+
+ /*
+ * Short-circuit optimization for ASCII-only text. This may be slower
+ * than using a cache for all results. See comments above for limitations
+ * of strcasecmp().
+ */
+ if (allascii_len(s1, len) && allascii_len(s2, len))
+ return (strncasecmp(s1, s2, len));
+
+ /*
+ * Caution: casefolding may change the number of bytes. See comments
+ * above for concerns about strcmp().
+ */
+ flags &= CASEF_FLAG_UTF8;
+ casefoldx(flags, f1, s1, len);
+ casefoldx(flags, f2, s2, len);
+ return (strcmp(STR(f1), STR(f2)));
+}
+
+#ifdef TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include <vstream.h>
+#include <vstring_vstream.h>
+#include <msg_vstream.h>
+#include <argv.h>
+
+int main(int argc, char **argv)
+{
+ VSTRING *buffer = vstring_alloc(1);
+ ARGV *cmd;
+ char **args;
+ int len;
+ int flags;
+ int res;
+
+ msg_vstream_init(argv[0], VSTREAM_ERR);
+ flags = CASEF_FLAG_UTF8;
+ util_utf8_enable = 1;
+ while (vstring_fgets_nonl(buffer, VSTREAM_IN)) {
+ vstream_printf("> %s\n", STR(buffer));
+ cmd = argv_split(STR(buffer), CHARS_SPACE);
+ if (cmd->argc == 0 || cmd->argv[0][0] == '#')
+ continue;
+ args = cmd->argv;
+
+ /*
+ * Compare two strings.
+ */
+ if (strcmp(args[0], "compare") == 0 && cmd->argc == 3) {
+ res = strcasecmp_utf8x(flags, args[1], args[2]);
+ vstream_printf("\"%s\" %s \"%s\"\n",
+ args[1],
+ res < 0 ? "<" : res == 0 ? "==" : ">",
+ args[2]);
+ }
+
+ /*
+ * Compare two substrings.
+ */
+ else if (strcmp(args[0], "compare-len") == 0 && cmd->argc == 4
+ && sscanf(args[3], "%d", &len) == 1 && len >= 0) {
+ res = strncasecmp_utf8x(flags, args[1], args[2], len);
+ vstream_printf("\"%.*s\" %s \"%.*s\"\n",
+ len, args[1],
+ res < 0 ? "<" : res == 0 ? "==" : ">",
+ len, args[2]);
+ }
+
+ /*
+ * Usage.
+ */
+ else {
+ vstream_printf("Usage: %s compare <s1> <s2> | compare-len <s1> <s2> <len>\n",
+ argv[0]);
+ }
+ vstream_fflush(VSTREAM_OUT);
+ argv_free(cmd);
+ }
+ exit(0);
+}
+
+#endif /* TEST */