summaryrefslogtreecommitdiffstats
path: root/src/util/strcasecmp_utf8.c
blob: e3f20dfb4af54455d5dac4853ef13eb93db6d336 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/*++
/* NAME
/*	strcasecmp_utf8 3
/* SUMMARY
/*	caseless string comparison
/* SYNOPSIS
/*	#include <stringops.h>
/*
/*	int	strcasecmp_utf8(
/*	const char *s1,
/*	const char *s2)
/*
/*	int	strncasecmp_utf8(
/*	const char *s1,
/*	const char *s2,
/*	ssize_t	len)
/* AUXILIARY FUNCTIONS
/*	int	strcasecmp_utf8x(
/*	int	flags,
/*	const char *s1,
/*	const char *s2)
/*
/*	int	strncasecmp_utf8x(
/*	int	flags,
/*	const char *s1,
/*	const char *s2,
/*	ssize_t	len)
/* DESCRIPTION
/*	strcasecmp_utf8() implements caseless string comparison for
/*	UTF-8 text, with an API similar to strcasecmp(). Only ASCII
/*	characters are casefolded when the code is compiled without
/*	EAI support or when util_utf8_enable is zero.
/*
/*	strncasecmp_utf8() implements caseless string comparison
/*	for UTF-8 text, with an API similar to strncasecmp(). Only
/*	ASCII characters are casefolded when the code is compiled
/*	without EAI support or when util_utf8_enable is zero.
/*
/*	strcasecmp_utf8x() and strncasecmp_utf8x() implement a more
/*	complex API that provides the above functionality and more.
/*
/*	Arguments:
/* .IP "s1, s2"
/*	Null-terminated strings to be compared.
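/* .IP len
/*	Upper bound on the number of bytes of s1 and s2 that are
/*	compared, counted before casefolding (casefolding may change
/*	the number of bytes).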
/* .IP flags
/*	Zero or CASEF_FLAG_UTF8. The latter flag enables UTF-8 case
/*	folding instead of folding only ASCII characters. This flag
/*	is ignored when compiled without EAI support.
/* SEE ALSO
/*	casefold(), casefold text for caseless comparison.
/* LICENSE
/* .ad
/* .fi
/*	The Secure Mailer license must be distributed with this software.
/* AUTHOR(S)
/*	Wietse Venema
/*	IBM T.J. Watson Research
/*	P.O. Box 704
/*	Yorktown Heights, NY 10598, USA
/*
/*	Wietse Venema
/*	Google, Inc.
/*	111 8th Avenue
/*	New York, NY 10011, USA
/*--*/

 /*
  * System library.
  */
#include <sys_defs.h>
#include <string.h>

#ifdef STRCASECMP_IN_STRINGS_H
#include <strings.h>
#endif

 /*
  * Utility library.
  */
#include <stringops.h>

 /*
  * Shorthand for vstring_str(). Used below to pass VSTRING content to
  * strcmp() and, in the TEST driver, to vstream_printf() and argv_split().
  */
#define STR(x)	vstring_str(x)

 /*
  * Scratch buffers that receive the casefolded copies of the two inputs.
  * They start out as null pointers, are allocated on first use by
  * strcasecmp_utf8_init(), and are then reused by every later comparison;
  * nothing in this file frees them.
  */
static VSTRING *f1;			/* casefold result for s1 */
static VSTRING *f2;			/* casefold result for s2 */

/* strcasecmp_utf8_init - allocate the two casefold scratch buffers */

static void strcasecmp_utf8_init(void)
{
    const int initial_size = 100;	/* size passed to vstring_alloc() */

    /*
     * Callers invoke this once, when f1 is still a null pointer. Both
     * buffers are created together so that testing f1 alone is sufficient.
     */
    f1 = vstring_alloc(initial_size);
    f2 = vstring_alloc(initial_size);
}

/* strcasecmp_utf8x - compare two null-terminated strings, ignoring case */

int     strcasecmp_utf8x(int flags, const char *s1, const char *s2)
{
    int     fold_flags;

    /*
     * Slow path: at least one input is not reported as all-ASCII by
     * allascii(). Casefold both inputs into the scratch buffers (allocating
     * them on first use) and compare the folded results byte by byte. Only
     * the CASEF_FLAG_UTF8 bit of the caller's flags is passed on. This
     * relies on strcmp() staying agnostic of character sets and locales.
     */
    if (!allascii(s1) || !allascii(s2)) {
	if (f1 == 0)
	    strcasecmp_utf8_init();
	fold_flags = flags & CASEF_FLAG_UTF8;
	casefoldx(fold_flags, f1, s1, -1);
	casefoldx(fold_flags, f2, s2, -1);
	return (strcmp(STR(f1), STR(f2)));
    }

    /*
     * Fast path: both inputs are ASCII-only, so strcasecmp(3) is safe to
     * use; it must never be exposed to non-ASCII text. This shortcut may be
     * slower than a cache for all results.
     */
    return (strcasecmp(s1, s2));
}

/* strncasecmp_utf8x - length-limited caseless string comparison */

int     strncasecmp_utf8x(int flags, const char *s1, const char *s2,
			          ssize_t len)
{
    int     fold_flags = flags & CASEF_FLAG_UTF8;

    /*
     * The scratch buffers are set up before the ASCII test, so they exist
     * after the first call regardless of which path is taken. A cache for
     * all results could be considered instead.
     */
    if (f1 == 0)
	strcasecmp_utf8_init();

    /*
     * Slow path: at least one input is not reported as all-ASCII within
     * len. Casefold each input with the same len and compare the complete
     * folded results; casefolding may change the number of bytes, so len is
     * not applied again after folding. As in strcasecmp_utf8x(), this
     * relies on strcmp() being agnostic of character sets and locales.
     */
    if (!allascii_len(s1, len) || !allascii_len(s2, len)) {
	casefoldx(fold_flags, f1, s1, len);
	casefoldx(fold_flags, f2, s2, len);
	return (strcmp(STR(f1), STR(f2)));
    }

    /*
     * Fast path: ASCII-only text can go straight to strncasecmp(3), which
     * must not see non-ASCII text. This shortcut may be slower than a cache
     * for all results.
     */
    return (strncasecmp(s1, s2, len));
}

#ifdef TEST
#include <stdio.h>
#include <stdlib.h>
#include <vstream.h>
#include <vstring_vstream.h>
#include <msg_vstream.h>
#include <argv.h>

/* main - interactive test driver for the caseless comparison functions */

int     main(int argc, char **argv)
{
    VSTRING *buffer = vstring_alloc(1);
    ARGV   *cmd;
    char  **args;
    int     len;
    int     flags;
    int     res;

    /*
     * Always exercise the UTF-8 casefolding code path.
     */
    msg_vstream_init(argv[0], VSTREAM_ERR);
    flags = CASEF_FLAG_UTF8;
    util_utf8_enable = 1;

    /*
     * Read one command per line from VSTREAM_IN, echo it, and execute it.
     */
    while (vstring_fgets_nonl(buffer, VSTREAM_IN)) {
	vstream_printf("> %s\n", STR(buffer));
	cmd = argv_split(STR(buffer), CHARS_SPACE);

	/*
	 * Skip empty lines and comments. The split result must be released
	 * here, because "continue" bypasses the argv_free() call at the end
	 * of the loop body.
	 */
	if (cmd->argc == 0 || cmd->argv[0][0] == '#') {
	    argv_free(cmd);
	    continue;
	}
	args = cmd->argv;

	/*
	 * Compare two strings.
	 */
	if (strcmp(args[0], "compare") == 0 && cmd->argc == 3) {
	    res = strcasecmp_utf8x(flags, args[1], args[2]);
	    vstream_printf("\"%s\" %s \"%s\"\n",
			   args[1],
			   res < 0 ? "<" : res == 0 ? "==" : ">",
			   args[2]);
	}

	/*
	 * Compare two substrings. The length must parse as a non-negative
	 * decimal integer; otherwise the usage message is printed.
	 */
	else if (strcmp(args[0], "compare-len") == 0 && cmd->argc == 4
		 && sscanf(args[3], "%d", &len) == 1 && len >= 0) {
	    res = strncasecmp_utf8x(flags, args[1], args[2], len);
	    vstream_printf("\"%.*s\" %s \"%.*s\"\n",
			   len, args[1],
			   res < 0 ? "<" : res == 0 ? "==" : ">",
			   len, args[2]);
	}

	/*
	 * Usage.
	 */
	else {
	    vstream_printf("Usage: %s compare <s1> <s2> | compare-len <s1> <s2> <len>\n",
			   argv[0]);
	}
	vstream_fflush(VSTREAM_OUT);
	argv_free(cmd);
    }
    exit(0);
}

#endif					/* TEST */