/* src/libmime/mime_encoding.h */

/*-
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef SRC_LIBMIME_MIME_ENCODING_H_
#define SRC_LIBMIME_MIME_ENCODING_H_

#include "config.h"
#include "mem_pool.h"
#include "fstring.h"
#include <unicode/uchar.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Forward declarations: the prototypes in this header only refer to these
 * types through pointers, so the full definitions are not required here.
 * NOTE(review): `struct rspamd_mime_part` is not referenced by any prototype
 * visible in this header -- confirm whether the declaration is still needed.
 */
struct rspamd_task;
struct rspamd_mime_part;
struct rspamd_mime_text_part;
struct rspamd_charset_converter;

/**
 * Convert a charset alias to a canonical charset name
 * @param in charset name (possibly an alias) to canonicalise
 * @param pool pool to store temporary data
 * @return canonical charset name
 * (NOTE(review): presumably NULL if the charset is not recognised -- confirm
 * against the implementation)
 */
const gchar *rspamd_mime_detect_charset(const rspamd_ftok_t *in,
										rspamd_mempool_t *pool);

/**
 * Convert a text chunk to utf-8. The input encoding is substituted using
 * `rspamd_mime_detect_charset`.
 * If the input encoding is already utf, this function returns the input
 * pointer. Memory is allocated from `pool` if a conversion is needed.
 * @param pool memory pool used to allocate the converted text
 * @param input text to convert
 * @param len length of `input`
 * (NOTE(review): presumably in bytes -- confirm)
 * @param in_enc canon charset
 * @param olen output length
 * (NOTE(review): presumably receives the length of the returned text -- confirm)
 * @param err location to store a conversion error
 * @return converted text, or `input` itself if it is already utf
 * (NOTE(review): presumably NULL with `err` set on failure -- confirm)
 */
gchar *rspamd_mime_text_to_utf8(rspamd_mempool_t *pool,
								gchar *input, gsize len, const gchar *in_enc,
								gsize *olen, GError **err);

/**
 * Converts data from `in` to `out`,
 * returns `FALSE` if `enc` is not a valid iconv charset
 *
 * This function, in fact, copies `in` to `out`, replacing the content of
 * `out` in total.
 * @param in input data
 * @param out output array; its previous content is replaced
 * @param pool memory pool
 * (NOTE(review): presumably used for temporary data during conversion -- confirm)
 * @param enc validated canonical charset name. If NULL, then utf8 check is done only
 * @return `FALSE` if `enc` is not a valid charset
 * (NOTE(review): presumably `TRUE` on success -- confirm)
 */
gboolean rspamd_mime_to_utf8_byte_array(GByteArray *in,
										GByteArray *out,
										rspamd_mempool_t *pool,
										const gchar *enc);

/**
 * Maybe convert a text part to utf-8. The function returns no value.
 * @param task task that the text part belongs to
 * (NOTE(review): ownership relation is assumed from the names -- confirm)
 * @param text_part text part to convert if needed
 */
void rspamd_mime_text_part_maybe_convert(struct rspamd_task *task,
										 struct rspamd_mime_text_part *text_part);

/**
 * Checks utf8 charset and normalize/validate utf8 string
 * @param charset charset name to check
 * @param in string to normalize/validate
 * (NOTE(review): passed as non-const, so it may be modified in place -- confirm)
 * @param len length of `in`
 * @param content_check
 * (NOTE(review): undocumented in the original; presumably enables inspection of
 * the content of `in` in addition to the charset name -- confirm)
 * @return
 * (NOTE(review): presumably TRUE if the charset is utf8 and the string is
 * valid -- confirm)
 */
gboolean rspamd_mime_charset_utf_check(rspamd_ftok_t *charset,
									   gchar *in, gsize len,
									   gboolean content_check);

/**
 * Ensure that all characters in a string are valid utf8 chars or replace them
 * with '?'. The string is modified in place; nothing is returned.
 * @param in string to sanitise
 * @param len length of `in`
 */
void rspamd_mime_charset_utf_enforce(gchar *in, gsize len);

/**
 * Gets cached converter
 * @param enc input encoding
 * @param pool pool to use for temporary normalisation
 * @param is_canon TRUE if normalisation is needed
 * (NOTE(review): the parameter name suggests the opposite, i.e. TRUE when
 * `enc` is already canonical and needs no normalisation -- confirm against
 * the implementation)
 * @param err output error
 * @return converter
 */
struct rspamd_charset_converter *rspamd_mime_get_converter_cached(
	const gchar *enc,
	rspamd_mempool_t *pool,
	gboolean is_canon,
	UErrorCode *err);

/**
 * Performs charset->utf16 conversion
 * @param cnv converter to use
 * @param dest output buffer for the converted utf16 data
 * @param destCapacity capacity of `dest`
 * (NOTE(review): presumably counted in UChars, not bytes -- confirm)
 * @param src input data in the converter's charset
 * @param srcLength length of `src`
 * (NOTE(review): presumably in bytes -- confirm)
 * @param pErrorCode output error code
 * @return
 * (NOTE(review): presumably the number of UChars written to `dest`, mirroring
 * ICU's ucnv_toUChars -- confirm)
 */
gint32
rspamd_converter_to_uchars(struct rspamd_charset_converter *cnv,
						   UChar *dest,
						   gint32 destCapacity,
						   const char *src,
						   gint32 srcLength,
						   UErrorCode *pErrorCode);

/**
 * Detect charset in text
 * @param in text to inspect
 * @param inlen length of `in`
 * @param check_utf8
 * (NOTE(review): undocumented in the original; presumably controls whether the
 * text is first checked for being valid utf8 -- confirm)
 * @return detected charset name or NULL
 */
const char *rspamd_mime_charset_find_by_content(const gchar *in, gsize inlen,
												bool check_utf8);

#ifdef __cplusplus
}
#endif

#endif /* SRC_LIBMIME_MIME_ENCODING_H_ */