/*
 * path: root/src/util/printable.c
 * blob: 0e1ae19543b76d97b97544e77d7c4ba0036cbc40 (plain)
 */
/*++
/* NAME
/*	printable 3
/* SUMMARY
/*	mask non-printable characters
/* SYNOPSIS
/*	#include <stringops.h>
/*
/*	int	util_utf8_enable;
/*
/*	char	*printable(buffer, replacement)
/*	char	*buffer;
/*	int	replacement;
/*
/*	char	*printable_except(buffer, replacement, except)
/*	char	*buffer;
/*	int	replacement;
/*	const char *except;
/* DESCRIPTION
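/*	printable() replaces, in place, the non-printable characters
/*	in its input with the given replacement, and returns its input.
/*
/*	printable_except() does the same, but does not replace the
/*	ASCII characters that are listed in \fIexcept\fR.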
/*
/*	util_utf8_enable controls whether UTF8 is considered printable.
/*	With util_utf8_enable equal to zero, non-ASCII text is replaced.
/*
/*	Arguments:
/* .IP buffer
/*	The null-terminated input string.
/* .IP replacement
/*	Replacement value for characters in \fIbuffer\fR that do not
/*	pass the ASCII isprint(3) test or that are not valid UTF8.
/* .IP except
/*	Null-terminated sequence of non-replaced ASCII characters.
/* LICENSE
/* .ad
/* .fi
/*	The Secure Mailer license must be distributed with this software.
/* AUTHOR(S)
/*	Wietse Venema
/*	IBM T.J. Watson Research
/*	P.O. Box 704
/*	Yorktown Heights, NY 10598, USA
/*
/*	Wietse Venema
/*	Google, Inc.
/*	111 8th Avenue
/*	New York, NY 10011, USA
/*
/*	Wietse Venema
/*	porcupine.org
/*	Amawalk, NY 10501, USA
/*--*/

/* System library. */

#include "sys_defs.h"
#include <ctype.h>
#include <string.h>

/* Utility library. */

#include "stringops.h"
#include "parse_utf8_char.h"

/*
 * Run-time switch that is read by printable_except(). While it is zero,
 * every non-ASCII byte is replaced. While it is non-zero, the input is
 * examined with parse_utf8_char() and the bytes that it accepts are kept.
 */
int     util_utf8_enable = 0;

/* printable - function form of printable(), for binary compatibility */

 /*
  * NOTE(review): the #undef suggests that a header (presumably stringops.h)
  * may define printable as a macro; confirm there. The prototype is repeated
  * here so that the definition below has a declaration in scope.
  */
#undef printable

char   *printable(char *, int);

char   *printable(char *string, int replacement)
{
    const char *no_exceptions = 0;

    /* Same as printable_except() with nothing exempted from replacement. */
    return (printable_except(string, replacement, no_exceptions));
}

/* printable_except - mask non-printable text, sparing listed characters */

char   *printable_except(char *string, int replacement, const char *except)
{
    char   *pos = string;
    char   *tail;
    int     byte;
    int     keep;

    /*
     * The input is edited in place and returned. A null "except" pointer
     * means that no additional characters are preserved.
     * 
     * When the input is not UTF8 at some position (bad leader byte, bad
     * non-leader byte, over-long encoding, out-of-range code point, etc.),
     * only the byte at that position is overwritten, and scanning resumes
     * with the byte that follows it.
     */
#define PRINT_OR_EXCEPT(ch) (ISPRINT(ch) || (except && strchr(except, ch)))

    while ((byte = *(unsigned char *) pos) != 0) {
	if (util_utf8_enable == 0) {
	    /* UTF8 support is off: only ASCII can be kept. */
	    keep = (ISASCII(byte) && PRINT_OR_EXCEPT(byte));
	} else {
	    tail = parse_utf8_char(pos, 0);
	    if (tail == pos) {
		/* ASCII: keep only if printable or listed in "except". */
		keep = PRINT_OR_EXCEPT(byte);
	    } else if (tail != 0) {
		/* Other UTF8: continue after the position returned. */
		pos = tail;
		keep = 1;
	    } else {
		/* Null result: not acceptable, replace this byte. */
		keep = 0;
	    }
	}
	if (keep == 0)
	    *pos = replacement;
	pos++;
    }
    return (string);
}

#ifdef TEST

#include <stdlib.h>
#include <string.h>
#include <msg.h>
#include <msg_vstream.h>
#include <mymalloc.h>
#include <vstream.h>

 /*
  * Test cases for 1-, 2-, and 3-byte encodings. Originally contributed by
  * Viktor Dukhovni, and annotated using translate.google.com.
  * 
  * See valid_utf8_string.c for single-error tests.
  * 
  * XXX Need a test for 4-byte encodings, preferably with strings that can be
  * displayed.
  */
/*
 * One table-driven test for printable(). The test driver passes '?' as the
 * replacement character, so that is what "expected" uses for replaced bytes.
 */
struct testcase {
    const char *name;			/* label in RUN/PASS/FAIL output */
    const char *input;			/* bytes handed to printable() */
    const char *expected;		/* result that printable() must give */
};
/*
 * Test vectors, one {name, input, expected} entry per case. The expected
 * strings assume '?' as the replacement character, which is what main()
 * passes to printable(), and main() sets util_utf8_enable to 1 before it
 * runs them. Non-ASCII bytes are written as octal escapes.
 */
static const struct testcase testcases[] = {
    {"Printable ASCII",
	"printable", "printable"
    },
    /* The backspace (\b) is expected to come back as '?'. */
    {"ASCII with control character",
	"non\bn-printable", "non?n-printable"
    },
    {"Latin accented text, no error",
	"na\303\257ve", "na\303\257ve"
    },
    /* Same word with the \257 byte removed: the lone \303 becomes '?'. */
    {"Latin text, with error",
	"na\303ve", "na?ve"
    },
    {"Viktor, Cyrillic, no error",
	"\320\262\320\270\320\272\321\202\320\276\321\200",
	"\320\262\320\270\320\272\321\202\320\276\321\200"
    },
    /*
     * A \320 that is directly followed by another \320, and a \272 that
     * directly follows a complete \320\272 pair, each become '?'.
     */
    {"Viktor, Cyrillic, two errors",
	"\320\262\320\320\272\272\321\202\320\276\321\200",
	"\320\262?\320\272?\321\202\320\276\321\200"
    },
    {"Viktor, Hebrew, no error",
	"\327\225\327\231\327\247\327\230\327\225\326\274\327\250",
	"\327\225\327\231\327\247\327\230\327\225\326\274\327\250"
    },
    /* The \231 that follows the complete \327\225 pair becomes '?'. */
    {"Viktor, Hebrew, with error",
	"\327\225\231\327\247\327\230\327\225\326\274\327\250",
	"\327\225?\327\247\327\230\327\225\326\274\327\250"
    },
    {"Chinese (Simplified), no error",
	"\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
	"\237\350\256\241\346\212\245\345\221\212",
	"\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
	"\237\350\256\241\346\212\245\345\221\212"
    },
    /*
     * Three incomplete sequences: a \345 directly followed by \344 becomes
     * '?', the two-byte remnant \350\224 becomes "??" (one '?' per byte),
     * and the \345 at the very end of the string becomes '?'.
     */
    {"Chinese (Simplified), with errors",
	"\344\270\255\345\344\272\222\350\224\347\275\221\347"
	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
	"\237\350\256\241\346\212\245\345",
	"\344\270\255?\344\272\222??\347\275\221\347"
	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
	"\237\350\256\241\346\212\245?"
    },
};

/* main - run every entry of testcases[] through printable() and report */

int     main(int argc, char **argv)
{
    const struct testcase *tp;
    const struct testcase *end;
    int     pass = 0;
    int     fail = 0;

#define NUM_TESTS	sizeof(testcases)/sizeof(testcases[0])

    msg_vstream_init(basename(argv[0]), VSTREAM_ERR);

    /* All test vectors are written for the UTF8-enabled code path. */
    util_utf8_enable = 1;

    end = testcases + NUM_TESTS;
    for (tp = testcases; tp < end; tp++) {
	char   *copy;
	char   *actual;

	/*
	 * printable() overwrites its argument, so it is given a private copy
	 * of the constant test input.
	 * 
	 * The msg(3) functions use printable() themselves, which would
	 * interfere with logging the raw inputs and outputs; those are
	 * therefore written with vstream_fprintf().
	 */
	vstream_fprintf(VSTREAM_ERR, "RUN  %s\n", tp->name);
	copy = mystrdup(tp->input);
	actual = printable(copy, '?');

	if (strcmp(actual, tp->expected) == 0) {
	    vstream_fprintf(VSTREAM_ERR, "input: >%s<, got and want: >%s<\n",
			    tp->input, actual);
	    vstream_fprintf(VSTREAM_ERR, "PASS %s\n", tp->name);
	    pass++;
	} else {
	    vstream_fprintf(VSTREAM_ERR, "input: >%s<, got: >%s<, want: >%s<\n",
			    tp->input, actual, tp->expected);
	    vstream_fprintf(VSTREAM_ERR, "FAIL %s\n", tp->name);
	    fail++;
	}
	myfree(copy);
    }
    msg_info("PASS=%d FAIL=%d", pass, fail);

    /* Exit status 1 when at least one test failed, 0 otherwise. */
    return (fail > 0);
}

#endif