diff options
Diffstat (limited to 'src/util/casefold.c')
-rw-r--r-- | src/util/casefold.c | 359 |
1 files changed, 359 insertions, 0 deletions
diff --git a/src/util/casefold.c b/src/util/casefold.c new file mode 100644 index 0000000..d3ebd4b --- /dev/null +++ b/src/util/casefold.c @@ -0,0 +1,359 @@ +/*++ +/* NAME +/* casefold 3 +/* SUMMARY +/* casefold text for caseless comparison +/* SYNOPSIS +/* #include <stringops.h> +/* +/* char *casefold( +/* VSTRING *dst, +/* const char *src) +/* +/* char *casefold_append( +/* VSTRING *dst, +/* const char *src) +/* +/* char *casefold_len( +/* VSTRING *dst, +/* const char *src, +/* ssize_t src_len) +/* AUXILIARY FUNCTIONS +/* char *casefoldx( +/* int flags, +/* VSTRING *dst, +/* const char *src, +/* ssize_t src_len) +/* DESCRIPTION +/* casefold() converts text to a form that is suitable for +/* caseless comparison, rather than presentation to humans. +/* +/* When compiled without EAI support or util_utf8_enable is +/* zero, casefold() implements ASCII case folding, leaving +/* non-ASCII byte values unchanged. +/* +/* When compiled with EAI support and util_utf8_enable is +/* non-zero, casefold() implements UTF-8 case folding using +/* the en_US locale, as recommended when the conversion result +/* is not meant to be presented to humans. +/* +/* casefold_len() implements casefold() with a source length +/* argument. +/* +/* casefold_append() implements casefold() without overwriting +/* the result. +/* +/* casefoldx() implements a more complex API that implements +/* all of the above and more. +/* +/* Arguments: +/* .IP src +/* Null-terminated input string. +/* .IP dest +/* Output buffer, null-terminated. Specify a null pointer to +/* use an internal buffer that is overwritten upon each call. +/* .IP src_len +/* The string length, -1 to determine the length dynamically. +/* .IP flags +/* Bitwise OR of zero or more of the following: +/* .RS +/* .IP CASEF_FLAG_UTF8 +/* Enable UTF-8 support. This flag has no effect when compiled +/* without EAI support. +/* .IP CASEF_FLAG_APPEND +/* Append the result to the buffer, instead of overwriting it. +/* DIAGNOSTICS +/* All errors are fatal. There appear to be no input-dependent +/* errors. +/* +/* With the ICU 4.8 library, there is no casefold error for +/* UTF-8 code points U+0000..U+10FFFF (including surrogate +/* range), not even when running inside an empty chroot jail. +/* Nor does malformed UTF-8 trigger errors; non-UTF-8 bytes +/* are copied verbatim. Based on ICU 4.8 source-code review +/* and experimentation(!) we conclude that UTF-8 casefolding +/* has no data-dependent error cases, and that it is safe to +/* treat all casefolding errors as fatal runtime errors. +/* LICENSE +/* .ad +/* .fi +/* The Secure Mailer license must be distributed with this software. +/* AUTHOR(S) +/* Wietse Venema +/* IBM T.J. Watson Research +/* P.O. Box 704 +/* Yorktown Heights, NY 10598, USA +/* +/* Wietse Venema +/* Google, Inc. +/* 111 8th Avenue +/* New York, NY 10011, USA +/*--*/ + +/* System library. */ + +#include <sys_defs.h> +#include <string.h> +#include <ctype.h> +#ifndef NO_EAI +#include <unicode/ucasemap.h> +#include <unicode/ustring.h> +#include <unicode/uchar.h> +#endif + +/* Utility library. */ + +#include <msg.h> +#include <stringops.h> + +#define STR(x) vstring_str(x) +#define LEN(x) VSTRING_LEN(x) + +/* casefoldx - casefold an UTF-8 string */ + +char *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len) +{ + size_t old_len; + +#ifdef NO_EAI + + /* + * ASCII mode only. + */ + if (len < 0) + len = strlen(src); + if ((flags & CASEF_FLAG_APPEND) == 0) + VSTRING_RESET(dest); + old_len = VSTRING_LEN(dest); + vstring_strncat(dest, src, len); + lowercase(STR(dest) + old_len); + return (STR(dest)); +#else + + /* + * Unicode mode. + */ + const char myname[] = "casefold"; + static VSTRING *fold_buf = 0; + static UCaseMap *csm = 0; + UErrorCode error; + ssize_t space_needed; + int n; + + /* + * Handle special cases. + */ + if (len < 0) + len = strlen(src); + if (dest == 0) + dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100))); + if ((flags & CASEF_FLAG_APPEND) == 0) + VSTRING_RESET(dest); + old_len = VSTRING_LEN(dest); + + /* + * All-ASCII input, or ASCII mode only. + */ + if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) { + vstring_strncat(dest, src, len); + lowercase(STR(dest) + old_len); + return (STR(dest)); + } + + /* + * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax + * errors. XXX Based on source-code review we conclude that non-UTF-8 + * bytes are copied verbatim, and experiments confirm this. Given that + * this behavior is intentional, we assume that it will stay that way. + */ +#if 0 + if (valid_utf8_string(src, len) == 0) { + if (err) + *err = "malformed UTF-8 or invalid codepoint"; + return (0); + } +#endif + + /* + * One-time initialization. With ICU 4.8 this works while chrooted. + */ + if (csm == 0) { + error = U_ZERO_ERROR; + csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error); + if (U_SUCCESS(error) == 0) + msg_fatal("ucasemap_open error: %s", u_errorName(error)); + } + + /* + * Fold the input, adjusting the buffer size if needed. Safety: don't + * loop forever. + * + * Note: the requested amount of space for casemapped output (as reported + * with space_needed below) does not include storage for the null + * terminator. The terminator is written only when the output buffer is + * large enough. This is why we overallocate space when the output does + * not fit. But if the output fits exactly, then the output will be + * unterminated, and we have to terminate the output ourselves. + */ + for (n = 0; n < 3; n++) { + error = U_ZERO_ERROR; + space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len, + vstring_avail(dest), src, len, &error); + if (U_SUCCESS(error)) { + vstring_set_payload_size(dest, old_len + space_needed); + if (vstring_avail(dest) == 0) /* exact fit, no terminator */ + VSTRING_TERMINATE(dest); /* add terminator */ + break; + } else if (error == U_BUFFER_OVERFLOW_ERROR) { + VSTRING_SPACE(dest, space_needed + 1); /* for terminator */ + } else { + msg_fatal("%s: conversion error for \"%s\": %s", + myname, src, u_errorName(error)); + } + } + return (STR(dest)); +#endif /* NO_EAI */ +} + +#ifdef TEST + +static void encode_utf8(VSTRING *buffer, int codepoint) +{ + const char myname[] = "encode_utf8"; + + VSTRING_RESET(buffer); + if (codepoint < 0x80) { + VSTRING_ADDCH(buffer, codepoint); + } else if (codepoint < 0x800) { + VSTRING_ADDCH(buffer, 0xc0 | (codepoint >> 6)); + VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); + } else if (codepoint < 0x10000) { + VSTRING_ADDCH(buffer, 0xe0 | (codepoint >> 12)); + VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f)); + VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); + } else if (codepoint <= 0x10FFFF) { + VSTRING_ADDCH(buffer, 0xf0 | (codepoint >> 18)); + VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 12) & 0x3f)); + VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f)); + VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); + } else { + msg_panic("%s: out-of-range codepoint U+%X", myname, codepoint); + } + VSTRING_TERMINATE(buffer); +} + +#include <stdlib.h> +#include <stdio.h> +#include <locale.h> + +#include <vstream.h> +#include <vstring_vstream.h> +#include <msg_vstream.h> + +int main(int argc, char **argv) +{ + VSTRING *buffer = vstring_alloc(1); + VSTRING *dest = vstring_alloc(1); + char *bp; + char *conv_res; + char *cmd; + int codepoint, first, last; + VSTREAM *fp; + + if (setlocale(LC_ALL, "C") == 0) + msg_fatal("setlocale(LC_ALL, C) failed: %m"); + + msg_vstream_init(argv[0], VSTREAM_ERR); + + util_utf8_enable = 1; + + VSTRING_SPACE(buffer, 256); /* chroot/file pathname */ + + while (vstring_fgets_nonl(buffer, VSTREAM_IN)) { + bp = STR(buffer); + vstream_printf("> %s\n", bp); + cmd = mystrtok(&bp, CHARS_SPACE); + if (cmd == 0 || *cmd == '#') + continue; + while (ISSPACE(*bp)) + bp++; + + /* + * Null-terminated string. + */ + if (strcmp(cmd, "fold") == 0) { + conv_res = casefold(dest, bp); + vstream_printf("\"%s\" ->fold \"%s\"\n", bp, conv_res); + } + + /* + * Codepoint range. + */ + else if (strcmp(cmd, "range") == 0 + && sscanf(bp, "%i %i", &first, &last) == 2 + && first <= last) { + for (codepoint = first; codepoint <= last; codepoint++) { + if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { + vstream_printf("skipping surrogate range\n"); + codepoint = 0xDFFF; + } else { + encode_utf8(buffer, codepoint); + if (msg_verbose) + vstream_printf("U+%X -> %s\n", codepoint, STR(buffer)); + if (valid_utf8_string(STR(buffer), LEN(buffer)) == 0) + msg_fatal("bad utf-8 encoding for U+%X\n", codepoint); + casefold(dest, STR(buffer)); + } + } + vstream_printf("range completed: 0x%x..0x%x\n", first, last); + } + + /* + * Chroot directory. + */ + else if (strcmp(cmd, "chroot") == 0 + && sscanf(bp, "%255s", STR(buffer)) == 1) { + if (geteuid() == 0) { + if (chdir(STR(buffer)) < 0) + msg_fatal("chdir(%s): %m", STR(buffer)); + if (chroot(STR(buffer)) < 0) + msg_fatal("chroot(%s): %m", STR(buffer)); + vstream_printf("chroot %s completed\n", STR(buffer)); + } + } + + /* + * File. + */ + else if (strcmp(cmd, "file") == 0 + && sscanf(bp, "%255s", STR(buffer)) == 1) { + if ((fp = vstream_fopen(STR(buffer), O_RDONLY, 0)) == 0) + msg_fatal("open(%s): %m", STR(buffer)); + while (vstring_fgets_nonl(buffer, fp)) + vstream_printf("%s\n", casefold(dest, STR(buffer))); + vstream_fclose(fp); + } + + /* + * Verbose. + */ + else if (strcmp(cmd, "verbose") == 0 + && sscanf(bp, "%i", &msg_verbose) == 1) { + /* void */ ; + } + + /* + * Usage + */ + else { + vstream_printf("Usage: %s chroot <path> | file <path> | fold <text> | range <first> <last> | verbose <int>\n", + argv[0]); + } + vstream_fflush(VSTREAM_OUT); + } + vstring_free(buffer); + vstring_free(dest); + exit(0); +} + +#endif /* TEST */ |