src/lib/unichar.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145

#ifndef UNICHAR_H
#define UNICHAR_H

/* Character used to replace invalid input. */
#define UNICODE_REPLACEMENT_CHAR 0xfffd
#define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD"
#define UNICODE_REPLACEMENT_CHAR_UTF8_LEN \
	(sizeof(UNICODE_REPLACEMENT_CHAR_UTF8) - 1);
/* Horizontal ellipsis character ('...') */
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR 0x2026
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8 "\xE2\x80\xA6"
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8_LEN \
	(sizeof(UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8) - 1);

/* Characters >= base require surrogates */
#define UTF16_SURROGATE_BASE 0x10000

#define UTF16_SURROGATE_SHIFT 10
#define UTF16_SURROGATE_MASK 0x03ff
#define UTF16_SURROGATE_HIGH_FIRST 0xd800
#define UTF16_SURROGATE_HIGH_LAST 0xdbff
#define UTF16_SURROGATE_HIGH_MAX 0xdfff
#define UTF16_SURROGATE_LOW_FIRST 0xdc00
#define UTF16_SURROGATE_LOW_LAST 0xdfff

#define UTF16_SURROGATE_HIGH(chr) \
	(UTF16_SURROGATE_HIGH_FIRST + \
	 (((chr) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT))
#define UTF16_SURROGATE_LOW(chr) \
	(UTF16_SURROGATE_LOW_FIRST + \
	 (((chr) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK))

/* Returns TRUE if given byte is ASCII character or the beginning of a
   multibyte UTF-8 sequence */
#define UTF8_IS_START_SEQ(b) \
	(((b) & 0x80) == 0 || ((b) & 0xC0) == 0xC0)

#define UTF8_REPLACEMENT_CHAR_LEN 3

#define UNICHAR_T_MAX 0x10ffff

#define UTF16_VALID_HIGH_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_HIGH_FIRST)
#define UTF16_VALID_LOW_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_LOW_FIRST)

typedef uint32_t unichar_t;
ARRAY_DEFINE_TYPE(unichars, unichar_t);

/* Normalize UTF8 input and append it to output buffer.
   Returns 0 if ok, -1 if input was invalid. Even if input was invalid,
   as much as possible should be added to output. */
typedef int normalizer_func_t(const void *input, size_t size,
			      buffer_t *output);

extern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN];
extern const uint8_t *const uni_utf8_non1_bytes;

static inline bool ATTR_PURE uni_is_valid_ucs4(unichar_t chr)
{
	return (!UTF16_VALID_HIGH_SURROGATE(chr) &&
		!UTF16_VALID_LOW_SURROGATE(chr) &&
		chr <= UNICHAR_T_MAX);
};

/* Returns number of characters in a NUL-terminated unicode string */
unsigned int uni_strlen(const unichar_t *str) ATTR_PURE;
/* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was
   invalid */
int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output);
int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
		       ARRAY_TYPE(unichars) *output);
/* Translates UCS-4 input to UTF-8 output. */
void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);

/* Returns char_bytes (>0) if *chr_r is set, 0 for incomplete trailing character,
   -1 for invalid input. */
int uni_utf8_get_char(const char *input, unichar_t *chr_r);
int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
/* Returns number of characters in UTF-8 string. */
unsigned int uni_utf8_strlen(const char *input) ATTR_PURE;
/* Returns number of characters in UTF-8 input of specified size. */
unsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE;
/* Same as uni_utf8_strlen_n(), but if input ends with a partial UTF-8
   character, don't include it in the return value and set partial_pos_r to
   where the character begins. Otherwise partial_pos_r is set to the end
   of the input. */
unsigned int uni_utf8_partial_strlen_n(const void *input, size_t size,
				       size_t *partial_pos_r);

/* Returns the number of bytes belonging to this UTF-8 character. The given
   parameter is the first byte of the UTF-8 sequence. Invalid input is
   returned with length 1. */
static inline unsigned int ATTR_CONST
uni_utf8_char_bytes(unsigned char chr)
{
	/* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
	if (chr < (192 + 2))
		return 1;
	return uni_utf8_non1_bytes[chr - (192 + 2)];
}

/* Return given character in titlecase. */
unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;

/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
   output buffer. Returns 0 if ok, -1 if input was invalid. This generates
   output that's compatible with i;unicode-casemap comparator. Invalid input
   is replaced with unicode replacement character (0xfffd). */
int uni_utf8_to_decomposed_titlecase(const void *input, size_t size,
				     buffer_t *output);

/* If input contains only valid UTF-8 characters, return TRUE without updating
   buf. If input contains invalid UTF-8 characters, replace them with unicode
   replacement character (0xfffd), write the output to buf and return FALSE. */
bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
			     buffer_t *buf) ATTR_WARN_UNUSED_RESULT;
/* Returns TRUE if string is valid UTF-8 input. */
bool uni_utf8_str_is_valid(const char *str);
/* Returns TRUE if data contains only valid UTF-8 input. */
bool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
/* Returns the size of the data when truncated to be less than or equal to
   max_new_size, making sure UTF-8 character boundaries are respected. This only
   looks at the last character at the new boundary. */
size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size,
			      size_t max_new_size);

/* surrogate handling */
static inline unichar_t uni_join_surrogate(unichar_t high, unichar_t low)
{
	i_assert(UTF16_VALID_HIGH_SURROGATE(high) &&
		 UTF16_VALID_LOW_SURROGATE(low));

	return ((high - UTF16_SURROGATE_HIGH_FIRST)<<10) +
		(low - UTF16_SURROGATE_LOW_FIRST) +
		UTF16_SURROGATE_BASE;
}

static inline void uni_split_surrogate(unichar_t chr, unichar_t *high_r, unichar_t *low_r)
{
	i_assert(chr >= UTF16_SURROGATE_BASE && chr <= UNICHAR_T_MAX);
	i_assert(high_r != NULL && low_r != NULL);
	*high_r = UTF16_SURROGATE_HIGH(chr);
	*low_r = UTF16_SURROGATE_LOW(chr);
}
#endif