diff options
Diffstat (limited to 'src/util/strcasecmp_utf8.c')
-rw-r--r-- | src/util/strcasecmp_utf8.c | 216 |
1 files changed, 216 insertions, 0 deletions
diff --git a/src/util/strcasecmp_utf8.c b/src/util/strcasecmp_utf8.c new file mode 100644 index 0000000..e3f20df --- /dev/null +++ b/src/util/strcasecmp_utf8.c @@ -0,0 +1,216 @@ +/*++ +/* NAME +/* strcasecmp_utf8 3 +/* SUMMARY +/* caseless string comparison +/* SYNOPSIS +/* #include <stringops.h> +/* +/* int strcasecmp_utf8( +/* const char *s1, +/* const char *s2) +/* +/* int strncasecmp_utf8( +/* const char *s1, +/* const char *s2, +/* ssize_t len) +/* AUXILIARY FUNCTIONS +/* int strcasecmp_utf8x( +/* int flags, +/* const char *s1, +/* const char *s2) +/* +/* int strncasecmp_utf8x( +/* int flags, +/* const char *s1, +/* const char *s2, +/* ssize_t len) +/* DESCRIPTION +/* strcasecmp_utf8() implements caseless string comparison for +/* UTF-8 text, with an API similar to strcasecmp(). Only ASCII +/* characters are casefolded when the code is compiled without +/* EAI support or when util_utf8_enable is zero. +/* +/* strncasecmp_utf8() implements caseless string comparison +/* for UTF-8 text, with an API similar to strncasecmp(). Only +/* ASCII characters are casefolded when the code is compiled +/* without EAI support or when util_utf8_enable is zero. +/* +/* strcasecmp_utf8x() and strncasecmp_utf8x() implement a more +/* complex API that provides the above functionality and more. +/* +/* Arguments: +/* .IP "s1, s2" +/* Null-terminated strings to be compared. +/* .IP len +/* String length before casefolding. +/* .IP flags +/* Zero or CASEF_FLAG_UTF8. The latter flag enables UTF-8 case +/* folding instead of folding only ASCII characters. This flag +/* is ignored when compiled without EAI support. +/* SEE ALSO +/* casefold(), casefold text for caseless comparison. +/* LICENSE +/* .ad +/* .fi +/* The Secure Mailer license must be distributed with this software. +/* AUTHOR(S) +/* Wietse Venema +/* IBM T.J. Watson Research +/* P.O. Box 704 +/* Yorktown Heights, NY 10598, USA +/* +/* Wietse Venema +/* Google, Inc. +/* 111 8th Avenue +/* New York, NY 10011, USA +/*--*/ + + /* + * System library. + */ +#include <sys_defs.h> +#include <string.h> + +#ifdef STRCASECMP_IN_STRINGS_H +#include <strings.h> +#endif + + /* + * Utility library. + */ +#include <stringops.h> + +#define STR(x) vstring_str(x) + +static VSTRING *f1; /* casefold result for s1 */ +static VSTRING *f2; /* casefold result for s2 */ + +/* strcasecmp_utf8_init - initialize */ + +static void strcasecmp_utf8_init(void) +{ + f1 = vstring_alloc(100); + f2 = vstring_alloc(100); +} + +/* strcasecmp_utf8x - caseless string comparison */ + +int strcasecmp_utf8x(int flags, const char *s1, const char *s2) +{ + + /* + * Short-circuit optimization for ASCII-only text. This may be slower + * than using a cache for all results. We must not expose strcasecmp(3) + * to non-ASCII text. + */ + if (allascii(s1) && allascii(s2)) + return (strcasecmp(s1, s2)); + + if (f1 == 0) + strcasecmp_utf8_init(); + + /* + * Cross our fingers and hope that strcmp() remains agnostic of + * charactersets and locales. + */ + flags &= CASEF_FLAG_UTF8; + casefoldx(flags, f1, s1, -1); + casefoldx(flags, f2, s2, -1); + return (strcmp(STR(f1), STR(f2))); +} + +/* strncasecmp_utf8x - caseless string comparison */ + +int strncasecmp_utf8x(int flags, const char *s1, const char *s2, + ssize_t len) +{ + + /* + * Consider using a cache for all results. + */ + if (f1 == 0) + strcasecmp_utf8_init(); + + /* + * Short-circuit optimization for ASCII-only text. This may be slower + * than using a cache for all results. See comments above for limitations + * of strcasecmp(). + */ + if (allascii_len(s1, len) && allascii_len(s2, len)) + return (strncasecmp(s1, s2, len)); + + /* + * Caution: casefolding may change the number of bytes. See comments + * above for concerns about strcmp(). + */ + flags &= CASEF_FLAG_UTF8; + casefoldx(flags, f1, s1, len); + casefoldx(flags, f2, s2, len); + return (strcmp(STR(f1), STR(f2))); +} + +#ifdef TEST +#include <stdio.h> +#include <stdlib.h> +#include <vstream.h> +#include <vstring_vstream.h> +#include <msg_vstream.h> +#include <argv.h> + +int main(int argc, char **argv) +{ + VSTRING *buffer = vstring_alloc(1); + ARGV *cmd; + char **args; + int len; + int flags; + int res; + + msg_vstream_init(argv[0], VSTREAM_ERR); + flags = CASEF_FLAG_UTF8; + util_utf8_enable = 1; + while (vstring_fgets_nonl(buffer, VSTREAM_IN)) { + vstream_printf("> %s\n", STR(buffer)); + cmd = argv_split(STR(buffer), CHARS_SPACE); + if (cmd->argc == 0 || cmd->argv[0][0] == '#') + continue; + args = cmd->argv; + + /* + * Compare two strings. + */ + if (strcmp(args[0], "compare") == 0 && cmd->argc == 3) { + res = strcasecmp_utf8x(flags, args[1], args[2]); + vstream_printf("\"%s\" %s \"%s\"\n", + args[1], + res < 0 ? "<" : res == 0 ? "==" : ">", + args[2]); + } + + /* + * Compare two substrings. + */ + else if (strcmp(args[0], "compare-len") == 0 && cmd->argc == 4 + && sscanf(args[3], "%d", &len) == 1 && len >= 0) { + res = strncasecmp_utf8x(flags, args[1], args[2], len); + vstream_printf("\"%.*s\" %s \"%.*s\"\n", + len, args[1], + res < 0 ? "<" : res == 0 ? "==" : ">", + len, args[2]); + } + + /* + * Usage. + */ + else { + vstream_printf("Usage: %s compare <s1> <s2> | compare-len <s1> <s2> <len>\n", + argv[0]); + } + vstream_fflush(VSTREAM_OUT); + argv_free(cmd); + } + exit(0); +} + +#endif /* TEST */ |