/*++ /* NAME /* casefold 3 /* SUMMARY /* casefold text for caseless comparison /* SYNOPSIS /* #include /* /* char *casefold( /* VSTRING *dst, /* const char *src) /* /* char *casefold_append( /* VSTRING *dst, /* const char *src) /* /* char *casefold_len( /* VSTRING *dst, /* const char *src, /* ssize_t src_len) /* AUXILIARY FUNCTIONS /* char *casefoldx( /* int flags, /* VSTRING *dst, /* const char *src, /* ssize_t src_len) /* DESCRIPTION /* casefold() converts text to a form that is suitable for /* caseless comparison, rather than presentation to humans. /* /* When compiled without EAI support or util_utf8_enable is /* zero, casefold() implements ASCII case folding, leaving /* non-ASCII byte values unchanged. /* /* When compiled with EAI support and util_utf8_enable is /* non-zero, casefold() implements UTF-8 case folding using /* the en_US locale, as recommended when the conversion result /* is not meant to be presented to humans. /* /* casefold_len() implements casefold() with a source length /* argument. /* /* casefold_append() implements casefold() without overwriting /* the result. /* /* casefoldx() implements a more complex API that implements /* all of the above and more. /* /* Arguments: /* .IP src /* Null-terminated input string. /* .IP dest /* Output buffer, null-terminated. Specify a null pointer to /* use an internal buffer that is overwritten upon each call. /* .IP src_len /* The string length, -1 to determine the length dynamically. /* .IP flags /* Bitwise OR of zero or more of the following: /* .RS /* .IP CASEF_FLAG_UTF8 /* Enable UTF-8 support. This flag has no effect when compiled /* without EAI support. /* .IP CASEF_FLAG_APPEND /* Append the result to the buffer, instead of overwriting it. /* DIAGNOSTICS /* All errors are fatal. There appear to be no input-dependent /* errors. /* /* With the ICU 4.8 library, there is no casefold error for /* UTF-8 code points U+0000..U+10FFFF (including surrogate /* range), not even when running inside an empty chroot jail. /* Nor does malformed UTF-8 trigger errors; non-UTF-8 bytes /* are copied verbatim. Based on ICU 4.8 source-code review /* and experimentation(!) we conclude that UTF-8 casefolding /* has no data-dependent error cases, and that it is safe to /* treat all casefolding errors as fatal runtime errors. /* LICENSE /* .ad /* .fi /* The Secure Mailer license must be distributed with this software. /* AUTHOR(S) /* Wietse Venema /* IBM T.J. Watson Research /* P.O. Box 704 /* Yorktown Heights, NY 10598, USA /* /* Wietse Venema /* Google, Inc. /* 111 8th Avenue /* New York, NY 10011, USA /*--*/ /* System library. */ #include #include #include #ifndef NO_EAI #include #include #include #endif /* Utility library. */ #include #include #define STR(x) vstring_str(x) #define LEN(x) VSTRING_LEN(x) /* casefoldx - casefold an UTF-8 string */ char *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len) { size_t old_len; #ifdef NO_EAI /* * ASCII mode only. */ if (len < 0) len = strlen(src); if ((flags & CASEF_FLAG_APPEND) == 0) VSTRING_RESET(dest); old_len = VSTRING_LEN(dest); vstring_strncat(dest, src, len); lowercase(STR(dest) + old_len); return (STR(dest)); #else /* * Unicode mode. */ const char myname[] = "casefold"; static VSTRING *fold_buf = 0; static UCaseMap *csm = 0; UErrorCode error; ssize_t space_needed; int n; /* * Handle special cases. */ if (len < 0) len = strlen(src); if (dest == 0) dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100))); if ((flags & CASEF_FLAG_APPEND) == 0) VSTRING_RESET(dest); old_len = VSTRING_LEN(dest); /* * All-ASCII input, or ASCII mode only. */ if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) { vstring_strncat(dest, src, len); lowercase(STR(dest) + old_len); return (STR(dest)); } /* * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax * errors. XXX Based on source-code review we conclude that non-UTF-8 * bytes are copied verbatim, and experiments confirm this. Given that * this behavior is intentional, we assume that it will stay that way. */ #if 0 if (valid_utf8_string(src, len) == 0) { if (err) *err = "malformed UTF-8 or invalid codepoint"; return (0); } #endif /* * One-time initialization. With ICU 4.8 this works while chrooted. */ if (csm == 0) { error = U_ZERO_ERROR; csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error); if (U_SUCCESS(error) == 0) msg_fatal("ucasemap_open error: %s", u_errorName(error)); } /* * Fold the input, adjusting the buffer size if needed. Safety: don't * loop forever. * * Note: the requested amount of space for casemapped output (as reported * with space_needed below) does not include storage for the null * terminator. The terminator is written only when the output buffer is * large enough. This is why we overallocate space when the output does * not fit. But if the output fits exactly, then the output will be * unterminated, and we have to terminate the output ourselves. */ for (n = 0; n < 3; n++) { error = U_ZERO_ERROR; space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len, vstring_avail(dest), src, len, &error); if (U_SUCCESS(error)) { vstring_set_payload_size(dest, old_len + space_needed); if (vstring_avail(dest) == 0) /* exact fit, no terminator */ VSTRING_TERMINATE(dest); /* add terminator */ break; } else if (error == U_BUFFER_OVERFLOW_ERROR) { VSTRING_SPACE(dest, space_needed + 1); /* for terminator */ } else { msg_fatal("%s: conversion error for \"%s\": %s", myname, src, u_errorName(error)); } } return (STR(dest)); #endif /* NO_EAI */ } #ifdef TEST static void encode_utf8(VSTRING *buffer, int codepoint) { const char myname[] = "encode_utf8"; VSTRING_RESET(buffer); if (codepoint < 0x80) { VSTRING_ADDCH(buffer, codepoint); } else if (codepoint < 0x800) { VSTRING_ADDCH(buffer, 0xc0 | (codepoint >> 6)); VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); } else if (codepoint < 0x10000) { VSTRING_ADDCH(buffer, 0xe0 | (codepoint >> 12)); VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f)); VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); } else if (codepoint <= 0x10FFFF) { VSTRING_ADDCH(buffer, 0xf0 | (codepoint >> 18)); VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 12) & 0x3f)); VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f)); VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); } else { msg_panic("%s: out-of-range codepoint U+%X", myname, codepoint); } VSTRING_TERMINATE(buffer); } #include #include #include #include #include #include int main(int argc, char **argv) { VSTRING *buffer = vstring_alloc(1); VSTRING *dest = vstring_alloc(1); char *bp; char *conv_res; char *cmd; int codepoint, first, last; VSTREAM *fp; if (setlocale(LC_ALL, "C") == 0) msg_fatal("setlocale(LC_ALL, C) failed: %m"); msg_vstream_init(argv[0], VSTREAM_ERR); util_utf8_enable = 1; VSTRING_SPACE(buffer, 256); /* chroot/file pathname */ while (vstring_fgets_nonl(buffer, VSTREAM_IN)) { bp = STR(buffer); vstream_printf("> %s\n", bp); cmd = mystrtok(&bp, CHARS_SPACE); if (cmd == 0 || *cmd == '#') continue; while (ISSPACE(*bp)) bp++; /* * Null-terminated string. */ if (strcmp(cmd, "fold") == 0) { conv_res = casefold(dest, bp); vstream_printf("\"%s\" ->fold \"%s\"\n", bp, conv_res); } /* * Codepoint range. */ else if (strcmp(cmd, "range") == 0 && sscanf(bp, "%i %i", &first, &last) == 2 && first <= last) { for (codepoint = first; codepoint <= last; codepoint++) { if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { vstream_printf("skipping surrogate range\n"); codepoint = 0xDFFF; } else { encode_utf8(buffer, codepoint); if (msg_verbose) vstream_printf("U+%X -> %s\n", codepoint, STR(buffer)); if (valid_utf8_string(STR(buffer), LEN(buffer)) == 0) msg_fatal("bad utf-8 encoding for U+%X\n", codepoint); casefold(dest, STR(buffer)); } } vstream_printf("range completed: 0x%x..0x%x\n", first, last); } /* * Chroot directory. */ else if (strcmp(cmd, "chroot") == 0 && sscanf(bp, "%255s", STR(buffer)) == 1) { if (geteuid() == 0) { if (chdir(STR(buffer)) < 0) msg_fatal("chdir(%s): %m", STR(buffer)); if (chroot(STR(buffer)) < 0) msg_fatal("chroot(%s): %m", STR(buffer)); vstream_printf("chroot %s completed\n", STR(buffer)); } } /* * File. */ else if (strcmp(cmd, "file") == 0 && sscanf(bp, "%255s", STR(buffer)) == 1) { if ((fp = vstream_fopen(STR(buffer), O_RDONLY, 0)) == 0) msg_fatal("open(%s): %m", STR(buffer)); while (vstring_fgets_nonl(buffer, fp)) vstream_printf("%s\n", casefold(dest, STR(buffer))); vstream_fclose(fp); } /* * Verbose. */ else if (strcmp(cmd, "verbose") == 0 && sscanf(bp, "%i", &msg_verbose) == 1) { /* void */ ; } /* * Usage */ else { vstream_printf("Usage: %s chroot | file | fold | range | verbose \n", argv[0]); } vstream_fflush(VSTREAM_OUT); } vstring_free(buffer); vstring_free(dest); exit(0); } #endif /* TEST */