summaryrefslogtreecommitdiffstats
path: root/src/utf8proc/test/iterate.c
blob: c1674b79952d66f8d678a68c647f6df9a891aca2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#include "tests.h"
#include <ctype.h>
#include <wchar.h>

/* Number of checks run so far; incremented once per testbytes() call. */
static int tests = 0;
/* Number of checks whose actual result differed from the expected one. */
static int error = 0;

/*
 * CHECKVALID(pos, val, len):   store `val` in buf[pos], then call testbytes()
 *                              on the first `len` bytes of buf, expecting a
 *                              return value of `len`.
 * CHECKINVALID(pos, val, len): same store, but expecting
 *                              UTF8PROC_ERROR_INVALIDUTF8.
 *
 * Both macros require an array named `buf` in the calling scope. They are
 * wrapped in do { ... } while (0) so that each expands to exactly one
 * statement and stays correct under an unbraced if/else or loop.
 * Note that CHECKVALID evaluates `len` twice.
 */
#define CHECKVALID(pos, val, len) \
    do { \
        buf[(pos)] = (val); \
        testbytes(buf, (len), (len), __LINE__); \
    } while (0)
#define CHECKINVALID(pos, val, len) \
    do { \
        buf[(pos)] = (val); \
        testbytes(buf, (len), UTF8PROC_ERROR_INVALIDUTF8, __LINE__); \
    } while (0)

/*
 * Run utf8proc_iterate() on the first `len` bytes of `buf` and record a
 * failure when its return value differs from `retval`.
 *
 *   buf    - bytes to hand to utf8proc_iterate(); only the first `len` are read.
 *   len    - number of bytes to test; must fit in the local scratch buffer.
 *   retval - return value expected from utf8proc_iterate().
 *   line   - source line of the check, printed in the failure report.
 *
 * Increments `tests` on every call and `error` on every failed check. On a
 * mismatch, the tested bytes and the actual return value go to stderr.
 */
static void testbytes(const unsigned char *buf, int len, utf8proc_ssize_t retval, int line)
{
    utf8proc_int32_t out[16];
    utf8proc_ssize_t ret;

    /* Scratch copy that is deliberately left uninitialized after "len"
     * bytes. This way, Valgrind can detect overreads.
     */
    unsigned char tmp[16];

    tests++;

    /* Refuse lengths that would make memcpy() write outside tmp. */
    if (len < 0 || (size_t)len > sizeof tmp) {
        fprintf(stderr, "Failed (%d): unsupported test length %d\n", line, len);
        error++;
        return;
    }
    memcpy(tmp, buf, (size_t)len);

    ret = utf8proc_iterate(tmp, len, out);
    if (ret != retval) {
        fprintf(stderr, "Failed (%d):", line);
        for (int i = 0; i < len; i++) {
            fprintf(stderr, " 0x%02x", tmp[i]);
        }
        /* utf8proc_ssize_t is declared outside this file; cast so the
         * conversion specifier is guaranteed to match the argument type.
         */
        fprintf(stderr, " -> %ld\n", (long)ret);
        error++;
    }
}

/*
 * Test driver for utf8proc_iterate().
 *
 * Builds byte sequences in a local buffer and checks each one through the
 * CHECKVALID / CHECKINVALID macros, which call testbytes(). Many checks
 * intentionally reuse bytes left in `buf` by earlier checks, so the order of
 * the sections below matters.
 *
 * The command-line arguments are not used. The final verdict is delegated to
 * check() from tests.h (presumably it reports the message and terminates the
 * program when its condition is false -- confirm against tests.h).
 */
int main(int argc, char **argv)
{
    uint32_t byt;
    unsigned char buf[16];

    (void)argc;
    (void)argv;

    tests = error = 0;

    // Check valid sequences that were erroneously considered invalid before
    // (EF B7 90..9F, i.e. U+FDD0..U+FDDF)
    buf[0] = 0xef;
    buf[1] = 0xb7;
    for (byt = 0x90; byt < 0xa0; byt++) {
        CHECKVALID(2, byt, 3);
    }
    // Check 0xfffe and 0xffff
    buf[1] = 0xbf;
    CHECKVALID(2, 0xbe, 3);
    CHECKVALID(2, 0xbf, 3);
    // Check 0x??fffe & 0x??ffff
    // (buf[2] is still 0xbf from the check just above)
    for (byt = 0x1fffe; byt < 0x110000; byt += 0x10000) {
        buf[0] = (unsigned char)(0xf0 | (byt >> 18));
        buf[1] = (unsigned char)(0x80 | ((byt >> 12) & 0x3f));
        CHECKVALID(3, 0xbe, 4);
        CHECKVALID(3, 0xbf, 4);
    }

    // Continuation byte not after lead
    for (byt = 0x80; byt < 0xc0; byt++) {
        CHECKINVALID(0, byt, 1);
    }

    // Test lead bytes
    for (byt = 0xc0; byt <= 0xff; byt++) {
        // Single lead byte at end of string
        CHECKINVALID(0, byt, 1);
        // Lead followed by non-continuation character < 0x80
        CHECKINVALID(1, 65, 2);
        // Lead followed by non-continuation character > 0xbf
        CHECKINVALID(1, 0xc0, 2);
    }

    // Test overlong 2-byte
    buf[0] = 0xc0;
    for (byt = 0x81; byt <= 0xbf; byt++) {
        CHECKINVALID(1, byt, 2);
    }
    buf[0] = 0xc1;
    for (byt = 0x80; byt <= 0xbf; byt++) {
        CHECKINVALID(1, byt, 2);
    }

    // Test overlong 3-byte
    buf[0] = 0xe0;
    buf[2] = 0x80;
    for (byt = 0x80; byt <= 0x9f; byt++) {
        CHECKINVALID(1, byt, 3);
    }

    // Test overlong 4-byte
    buf[0] = 0xf0;
    buf[2] = 0x80;
    buf[3] = 0x80;
    for (byt = 0x80; byt <= 0x8f; byt++) {
        CHECKINVALID(1, byt, 4);
    }

    // Test 4-byte > 0x10ffff
    buf[0] = 0xf4;
    buf[2] = 0x80;
    buf[3] = 0x80;
    for (byt = 0x90; byt <= 0xbf; byt++) {
        CHECKINVALID(1, byt, 4);
    }
    buf[1] = 0x80;
    for (byt = 0xf5; byt <= 0xf7; byt++) {
        CHECKINVALID(0, byt, 4);
    }

    // Test 5-byte
    buf[4] = 0x80;
    for (byt = 0xf8; byt <= 0xfb; byt++) {
        CHECKINVALID(0, byt, 5);
    }

    // Test 6-byte
    buf[5] = 0x80;
    for (byt = 0xfc; byt <= 0xfd; byt++) {
        CHECKINVALID(0, byt, 6);
    }

    // Test 7-byte
    buf[6] = 0x80;
    CHECKINVALID(0, 0xfe, 7);

    // Three-byte lead bytes (0xe0..0xef); buf[1] is still 0x80 here
    for (byt = 0xe0; byt < 0xf0; byt++) {
        // Lead followed by only 1 continuation byte
        CHECKINVALID(0, byt, 2);
        // Sequence ended by non-continuation character < 0x80
        CHECKINVALID(2, 65, 3);
        // Sequence ended by non-continuation character > 0xbf
        CHECKINVALID(2, 0xc0, 3);
    }

    // 3-byte encoded surrogate character(s)
    buf[0] = 0xed;
    buf[2] = 0x80;
    // Single surrogate
    CHECKINVALID(1, 0xa0, 3);
    // Trailing surrogate first
    CHECKINVALID(1, 0xb0, 3);

    // Four byte sequences
    buf[1] = 0x80;
    for (byt = 0xf0; byt < 0xf5; byt++) {
        // Lead followed by only 1 continuation byte
        CHECKINVALID(0, byt, 2);
        // Lead followed by only 2 continuation bytes
        CHECKINVALID(0, byt, 3);
        // Sequence ended by non-continuation character < 0x80
        CHECKINVALID(3, 65, 4);
        // Sequence ended by non-continuation character > 0xbf
        CHECKINVALID(3, 0xc0, 4);
    }

    check(!error, "utf8proc_iterate FAILED %d tests out of %d", error, tests);
    printf("utf8proc_iterate tests SUCCEEDED, (%d) tests passed.\n", tests);

    return 0;
}