summaryrefslogtreecommitdiffstats
path: root/src/util/casefold.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/util/casefold.c')
-rw-r--r--src/util/casefold.c359
1 files changed, 359 insertions, 0 deletions
diff --git a/src/util/casefold.c b/src/util/casefold.c
new file mode 100644
index 0000000..d3ebd4b
--- /dev/null
+++ b/src/util/casefold.c
@@ -0,0 +1,359 @@
+/*++
+/* NAME
+/* casefold 3
+/* SUMMARY
+/* casefold text for caseless comparison
+/* SYNOPSIS
+/* #include <stringops.h>
+/*
+/* char *casefold(
+/* VSTRING *dst,
+/* const char *src)
+/*
+/* char *casefold_append(
+/* VSTRING *dst,
+/* const char *src)
+/*
+/* char *casefold_len(
+/* VSTRING *dst,
+/* const char *src,
+/* ssize_t src_len)
+/* AUXILIARY FUNCTIONS
+/* char *casefoldx(
+/* int flags,
+/* VSTRING *dst,
+/* const char *src,
+/* ssize_t src_len)
+/* DESCRIPTION
+/* casefold() converts text to a form that is suitable for
+/* caseless comparison, rather than presentation to humans.
+/*
+/* When compiled without EAI support or util_utf8_enable is
+/* zero, casefold() implements ASCII case folding, leaving
+/* non-ASCII byte values unchanged.
+/*
+/* When compiled with EAI support and util_utf8_enable is
+/* non-zero, casefold() implements UTF-8 case folding using
+/* the en_US locale, as recommended when the conversion result
+/* is not meant to be presented to humans.
+/*
+/* casefold_len() implements casefold() with a source length
+/* argument.
+/*
+/* casefold_append() implements casefold() without overwriting
+/* the result.
+/*
+/* casefoldx() implements a more complex API that implements
+/* all of the above and more.
+/*
+/* Arguments:
+/* .IP src
+/* Null-terminated input string.
+/* .IP dest
+/* Output buffer, null-terminated. Specify a null pointer to
+/* use an internal buffer that is overwritten upon each call.
+/* .IP src_len
+/* The string length, -1 to determine the length dynamically.
+/* .IP flags
+/* Bitwise OR of zero or more of the following:
+/* .RS
+/* .IP CASEF_FLAG_UTF8
+/* Enable UTF-8 support. This flag has no effect when compiled
+/* without EAI support.
+/* .IP CASEF_FLAG_APPEND
+/* Append the result to the buffer, instead of overwriting it.
+/* DIAGNOSTICS
+/* All errors are fatal. There appear to be no input-dependent
+/* errors.
+/*
+/* With the ICU 4.8 library, there is no casefold error for
+/* UTF-8 code points U+0000..U+10FFFF (including surrogate
+/* range), not even when running inside an empty chroot jail.
+/* Nor does malformed UTF-8 trigger errors; non-UTF-8 bytes
+/* are copied verbatim. Based on ICU 4.8 source-code review
+/* and experimentation(!) we conclude that UTF-8 casefolding
+/* has no data-dependent error cases, and that it is safe to
+/* treat all casefolding errors as fatal runtime errors.
+/* LICENSE
+/* .ad
+/* .fi
+/* The Secure Mailer license must be distributed with this software.
+/* AUTHOR(S)
+/* Wietse Venema
+/* IBM T.J. Watson Research
+/* P.O. Box 704
+/* Yorktown Heights, NY 10598, USA
+/*
+/* Wietse Venema
+/* Google, Inc.
+/* 111 8th Avenue
+/* New York, NY 10011, USA
+/*--*/
+
+/* System library. */
+
+#include <sys_defs.h>
+#include <string.h>
+#include <ctype.h>
+#ifndef NO_EAI
+#include <unicode/ucasemap.h>
+#include <unicode/ustring.h>
+#include <unicode/uchar.h>
+#endif
+
+/* Utility library. */
+
+#include <msg.h>
+#include <stringops.h>
+
+#define STR(x) vstring_str(x)
+#define LEN(x) VSTRING_LEN(x)
+
+/* casefoldx - casefold an UTF-8 string */
+
+char *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len)
+{
+ size_t old_len;
+
+#ifdef NO_EAI
+
+ /*
+ * ASCII mode only.
+ */
+ if (len < 0)
+ len = strlen(src);
+ if ((flags & CASEF_FLAG_APPEND) == 0)
+ VSTRING_RESET(dest);
+ old_len = VSTRING_LEN(dest);
+ vstring_strncat(dest, src, len);
+ lowercase(STR(dest) + old_len);
+ return (STR(dest));
+#else
+
+ /*
+ * Unicode mode.
+ */
+ const char myname[] = "casefold";
+ static VSTRING *fold_buf = 0;
+ static UCaseMap *csm = 0;
+ UErrorCode error;
+ ssize_t space_needed;
+ int n;
+
+ /*
+ * Handle special cases.
+ */
+ if (len < 0)
+ len = strlen(src);
+ if (dest == 0)
+ dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100)));
+ if ((flags & CASEF_FLAG_APPEND) == 0)
+ VSTRING_RESET(dest);
+ old_len = VSTRING_LEN(dest);
+
+ /*
+ * All-ASCII input, or ASCII mode only.
+ */
+ if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) {
+ vstring_strncat(dest, src, len);
+ lowercase(STR(dest) + old_len);
+ return (STR(dest));
+ }
+
+ /*
+ * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax
+ * errors. XXX Based on source-code review we conclude that non-UTF-8
+ * bytes are copied verbatim, and experiments confirm this. Given that
+ * this behavior is intentional, we assume that it will stay that way.
+ */
+#if 0
+ if (valid_utf8_string(src, len) == 0) {
+ if (err)
+ *err = "malformed UTF-8 or invalid codepoint";
+ return (0);
+ }
+#endif
+
+ /*
+ * One-time initialization. With ICU 4.8 this works while chrooted.
+ */
+ if (csm == 0) {
+ error = U_ZERO_ERROR;
+ csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error);
+ if (U_SUCCESS(error) == 0)
+ msg_fatal("ucasemap_open error: %s", u_errorName(error));
+ }
+
+ /*
+ * Fold the input, adjusting the buffer size if needed. Safety: don't
+ * loop forever.
+ *
+ * Note: the requested amount of space for casemapped output (as reported
+ * with space_needed below) does not include storage for the null
+ * terminator. The terminator is written only when the output buffer is
+ * large enough. This is why we overallocate space when the output does
+ * not fit. But if the output fits exactly, then the output will be
+ * unterminated, and we have to terminate the output ourselves.
+ */
+ for (n = 0; n < 3; n++) {
+ error = U_ZERO_ERROR;
+ space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len,
+ vstring_avail(dest), src, len, &error);
+ if (U_SUCCESS(error)) {
+ vstring_set_payload_size(dest, old_len + space_needed);
+ if (vstring_avail(dest) == 0) /* exact fit, no terminator */
+ VSTRING_TERMINATE(dest); /* add terminator */
+ break;
+ } else if (error == U_BUFFER_OVERFLOW_ERROR) {
+ VSTRING_SPACE(dest, space_needed + 1); /* for terminator */
+ } else {
+ msg_fatal("%s: conversion error for \"%s\": %s",
+ myname, src, u_errorName(error));
+ }
+ }
+ return (STR(dest));
+#endif /* NO_EAI */
+}
+
+#ifdef TEST
+
+static void encode_utf8(VSTRING *buffer, int codepoint)
+{
+ const char myname[] = "encode_utf8";
+
+ VSTRING_RESET(buffer);
+ if (codepoint < 0x80) {
+ VSTRING_ADDCH(buffer, codepoint);
+ } else if (codepoint < 0x800) {
+ VSTRING_ADDCH(buffer, 0xc0 | (codepoint >> 6));
+ VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
+ } else if (codepoint < 0x10000) {
+ VSTRING_ADDCH(buffer, 0xe0 | (codepoint >> 12));
+ VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f));
+ VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
+ } else if (codepoint <= 0x10FFFF) {
+ VSTRING_ADDCH(buffer, 0xf0 | (codepoint >> 18));
+ VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 12) & 0x3f));
+ VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f));
+ VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
+ } else {
+ msg_panic("%s: out-of-range codepoint U+%X", myname, codepoint);
+ }
+ VSTRING_TERMINATE(buffer);
+}
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <locale.h>
+
+#include <vstream.h>
+#include <vstring_vstream.h>
+#include <msg_vstream.h>
+
+int main(int argc, char **argv)
+{
+ VSTRING *buffer = vstring_alloc(1);
+ VSTRING *dest = vstring_alloc(1);
+ char *bp;
+ char *conv_res;
+ char *cmd;
+ int codepoint, first, last;
+ VSTREAM *fp;
+
+ if (setlocale(LC_ALL, "C") == 0)
+ msg_fatal("setlocale(LC_ALL, C) failed: %m");
+
+ msg_vstream_init(argv[0], VSTREAM_ERR);
+
+ util_utf8_enable = 1;
+
+ VSTRING_SPACE(buffer, 256); /* chroot/file pathname */
+
+ while (vstring_fgets_nonl(buffer, VSTREAM_IN)) {
+ bp = STR(buffer);
+ vstream_printf("> %s\n", bp);
+ cmd = mystrtok(&bp, CHARS_SPACE);
+ if (cmd == 0 || *cmd == '#')
+ continue;
+ while (ISSPACE(*bp))
+ bp++;
+
+ /*
+ * Null-terminated string.
+ */
+ if (strcmp(cmd, "fold") == 0) {
+ conv_res = casefold(dest, bp);
+ vstream_printf("\"%s\" ->fold \"%s\"\n", bp, conv_res);
+ }
+
+ /*
+ * Codepoint range.
+ */
+ else if (strcmp(cmd, "range") == 0
+ && sscanf(bp, "%i %i", &first, &last) == 2
+ && first <= last) {
+ for (codepoint = first; codepoint <= last; codepoint++) {
+ if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
+ vstream_printf("skipping surrogate range\n");
+ codepoint = 0xDFFF;
+ } else {
+ encode_utf8(buffer, codepoint);
+ if (msg_verbose)
+ vstream_printf("U+%X -> %s\n", codepoint, STR(buffer));
+ if (valid_utf8_string(STR(buffer), LEN(buffer)) == 0)
+ msg_fatal("bad utf-8 encoding for U+%X\n", codepoint);
+ casefold(dest, STR(buffer));
+ }
+ }
+ vstream_printf("range completed: 0x%x..0x%x\n", first, last);
+ }
+
+ /*
+ * Chroot directory.
+ */
+ else if (strcmp(cmd, "chroot") == 0
+ && sscanf(bp, "%255s", STR(buffer)) == 1) {
+ if (geteuid() == 0) {
+ if (chdir(STR(buffer)) < 0)
+ msg_fatal("chdir(%s): %m", STR(buffer));
+ if (chroot(STR(buffer)) < 0)
+ msg_fatal("chroot(%s): %m", STR(buffer));
+ vstream_printf("chroot %s completed\n", STR(buffer));
+ }
+ }
+
+ /*
+ * File.
+ */
+ else if (strcmp(cmd, "file") == 0
+ && sscanf(bp, "%255s", STR(buffer)) == 1) {
+ if ((fp = vstream_fopen(STR(buffer), O_RDONLY, 0)) == 0)
+ msg_fatal("open(%s): %m", STR(buffer));
+ while (vstring_fgets_nonl(buffer, fp))
+ vstream_printf("%s\n", casefold(dest, STR(buffer)));
+ vstream_fclose(fp);
+ }
+
+ /*
+ * Verbose.
+ */
+ else if (strcmp(cmd, "verbose") == 0
+ && sscanf(bp, "%i", &msg_verbose) == 1) {
+ /* void */ ;
+ }
+
+ /*
+ * Usage
+ */
+ else {
+ vstream_printf("Usage: %s chroot <path> | file <path> | fold <text> | range <first> <last> | verbose <int>\n",
+ argv[0]);
+ }
+ vstream_fflush(VSTREAM_OUT);
+ }
+ vstring_free(buffer);
+ vstring_free(dest);
+ exit(0);
+}
+
+#endif /* TEST */