src/libnetdata/sanitizers/utf8-sanitizer.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116

// SPDX-License-Identifier: GPL-3.0-or-later

#include "../libnetdata.h"

size_t text_sanitize(unsigned char *dst, const unsigned char *src, size_t dst_size, const unsigned char *char_map, bool utf, const char *empty, size_t *multibyte_length) {
    if(unlikely(!dst || !dst_size)) return 0;

    // skip leading spaces and invalid characters
    while(src && *src && !IS_UTF8_BYTE(*src) && (isspace(*src) || iscntrl(*src) || !isprint(*src)))
        src++;

    if(unlikely(!src || !*src)) {
        strncpyz((char *)dst, empty, dst_size);
        dst[dst_size - 1] = '\0';
        size_t len = strlen((char *)dst);
        if(multibyte_length) *multibyte_length = len;
        return len;
    }

    unsigned char *d = dst;

    // make room for the final string termination
    unsigned char *end = &dst[dst_size - 1];

    // copy while converting, but keep only one space
    // we start wil last_is_space = 1 to skip leading spaces
    int last_is_space = 1;

    size_t mblen = 0;

    while(*src && d < end) {
        unsigned char c = *src;

        if(IS_UTF8_STARTBYTE(c) && IS_UTF8_BYTE(src[1]) && d + 2 <= end) {
            // UTF-8 multi-byte encoded character

            // find how big this character is (2-4 bytes)
            size_t utf_character_size = 2;
            while(utf_character_size < 4 &&
                    d + utf_character_size <= end &&
                    IS_UTF8_BYTE(src[utf_character_size]) &&
                    !IS_UTF8_STARTBYTE(src[utf_character_size]))
                utf_character_size++;

            if(utf) {
                while(utf_character_size) {
                    utf_character_size--;
                    *d++ = *src++;
                }
            }
            else {
                // UTF-8 characters are not allowed.
                // Assume it is an underscore
                // and skip all except the first byte
                *d++ = '_';
                src += (utf_character_size - 1);
            }

            last_is_space = 0;
            mblen++;
            continue;
        }

        c = char_map[c];
        if(c == ' ') {
            // a space character

            if(!last_is_space) {
                // add one space
                *d++ = c;
                mblen++;
            }

            last_is_space++;
        }
        else {
            *d++ = c;
            last_is_space = 0;
            mblen++;
        }

        src++;
    }

    // remove trailing spaces
    while(d > dst && !IS_UTF8_BYTE(*(d - 1)) && *(d - 1) == ' ') {
        d--;
        mblen--;
    }

    // put a termination at the end of what we copied
    *d = '\0';

    // check if dst is all underscores and empty it if it is
    if(*dst == '_') {
        unsigned char *t = dst;
        while (*t == '_') t++;
        if (unlikely(*t == '\0')) {
            *dst = '\0';
            mblen = 0;
        }
    }

    // check if it is empty
    if(unlikely(*dst == '\0')) {
        strncpyz((char *)dst, empty, dst_size);
        dst[dst_size - 1] = '\0';
        mblen = strlen((char *)dst);
        if(multibyte_length) *multibyte_length = mblen;
        return mblen;
    }

    if(multibyte_length) *multibyte_length = mblen;

    return d - dst;
}