diff options
Diffstat (limited to '')
-rw-r--r-- | src/lib-mail/message-header-encode.c | 406 |
1 files changed, 406 insertions, 0 deletions
diff --git a/src/lib-mail/message-header-encode.c b/src/lib-mail/message-header-encode.c new file mode 100644 index 0000000..a9410a8 --- /dev/null +++ b/src/lib-mail/message-header-encode.c @@ -0,0 +1,406 @@ +/* Copyright (c) 2009-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "str.h" +#include "unichar.h" +#include "base64.h" +#include "message-header-encode.h" + +#define MIME_WRAPPER_LEN (strlen("=?utf-8?q?""?=")) +#define MIME_MAX_LINE_LEN 76 + +#define IS_LWSP(c) \ + ((c) == ' ' || (c) == '\t' || (c) == '\n') + +static bool +input_idx_need_encoding(const unsigned char *input, size_t i, size_t len) +{ + switch (input[i]) { + case '\r': + if (i+1 == len || input[i+1] != '\n') + return TRUE; + i++; + /* fall through - verify the LF as well */ + case '\n': + if (i+1 == len) { + /* trailing LF - we need to drop it */ + return TRUE; + } + i_assert(i+1 < len); + if (input[i+1] != '\t' && input[i+1] != ' ') { + /* LF not followed by whitespace - we need to + add the whitespace */ + return TRUE; + } + break; + case '\t': + /* TAB doesn't need to be encoded */ + break; + case '=': + /* <LWSP>=? - we need to check backwards a bit to see if + there is LWSP (note that we don't want to return TRUE for + the LWSP itself yet, so we need to do this backwards + check) */ + if ((i == 0 || IS_LWSP(input[i-1])) && i+2 <= len && + memcmp(input + i, "=?", 2) == 0) + return TRUE; + break; + default: + /* 8bit chars */ + if ((input[i] & 0x80) != 0) + return TRUE; + /* control chars */ + if (input[i] < 32) + return TRUE; + break; + } + return FALSE; +} + +void message_header_encode_q(const unsigned char *input, size_t len, + string_t *output, size_t first_line_len) +{ + static const unsigned char *rep_char = + (const unsigned char *)UNICODE_REPLACEMENT_CHAR_UTF8; + static const unsigned int rep_char_len = + UNICODE_REPLACEMENT_CHAR_UTF8_LEN; + size_t line_len_left; + bool invalid_char = FALSE; + + if (len == 0) + return; + + line_len_left = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN; + + if (first_line_len >= MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - 3) { + str_append(output, "\n\t"); + line_len_left--; + } else { + line_len_left -= first_line_len; + } + + str_append(output, "=?utf-8?q?"); + for (;;) { + unichar_t ch; + int nch = 1; + size_t n_in, n_out = 0, j; + + /* Determine how many bytes are to be consumed from input and + written to output. */ + switch (input[0]) { + case ' ': + /* Space is translated to a single '_'. */ + n_out = 1; + n_in = 1; + break; + case '=': + case '?': + case '_': + /* Special characters are escaped. */ + n_in = 1; + n_out = 3; + break; + default: + nch = uni_utf8_get_char_n(input, len, &ch); + if (nch <= 0) { + /* Invalid UTF-8 character */ + n_in = 1; + if (!invalid_char) { + /* First octet of bad stuff; will emit + replacement character. */ + n_out = rep_char_len * 3; + } else { + /* Emit only one replacement char for + a burst of bad stuff. */ + n_out = 0; + } + } else if (nch > 1) { + /* Unicode characters are escaped as several + escape sequences for each octet. */ + n_in = nch; + n_out = nch * 3; + } else if (ch < 0x20 || ch > 0x7e) { + /* Control characters are escaped. */ + i_assert(ch < 0x80); + n_in = 1; + n_out = 3; + } else { + /* Other ASCII characters are written to output + directly. */ + n_in = 1; + n_out = 1; + } + } + invalid_char = (nch <= 0); + + /* Start a new line once unsufficient space is available to + write more to the current line. */ + if (line_len_left < n_out) { + str_append(output, "?=\n\t=?utf-8?q?"); + line_len_left = MIME_MAX_LINE_LEN - + MIME_WRAPPER_LEN - 1; + } + + /* Encode the character */ + if (input[0] == ' ') { + /* Write special escape sequence for space character */ + str_append_c(output, '_'); + } else if (invalid_char) { + /* Write replacement character for invalid UTF-8 code + point. */ + for (j = 0; n_out > 0 && j < rep_char_len; j++) + str_printfa(output, "=%02X", rep_char[j]); + } else if (n_out > 1) { + /* Write one or more escape sequences for a special + character, a control character, or a valid UTF-8 + code point. */ + for (j = 0; j < n_in; j++) + str_printfa(output, "=%02X", input[j]); + } else { + /* Write other ASCII characters directly to output. */ + str_append_c(output, input[0]); + } + + /* Update sizes and pointers */ + i_assert(len >= n_in); + line_len_left -= n_out; + input += n_in; + len -= n_in; + + if (len == 0) + break; + } + str_append(output, "?="); +} + +void message_header_encode_b(const unsigned char *input, size_t len, + string_t *output, size_t first_line_len) +{ + static const unsigned char *rep_char = + (const unsigned char *)UNICODE_REPLACEMENT_CHAR_UTF8; + static const unsigned int rep_char_len = + UNICODE_REPLACEMENT_CHAR_UTF8_LEN; + struct base64_encoder b64enc; + size_t line_len_left; + + if (len == 0) + return; + + line_len_left = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN; + + if (first_line_len >= MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - 3) { + str_append(output, "\n\t"); + line_len_left--; + } else { + line_len_left -= first_line_len; + } + + str_append(output, "=?utf-8?b?"); + base64_encode_init(&b64enc, &base64_scheme, 0, 0); + for (;;) { + unichar_t ch; + size_t space, max, old_bufsize, n_in, n_out; + int nch = 1; + + /* Determine how many octets can be encoded on (the remainder + of) this line */ + space = base64_encode_get_full_space(&b64enc, line_len_left); + max = I_MIN(space, len); + + /* Check UTF-8 code points in the input and determine a proper + boundary for the end of this fragment if the encoded size + exceeds the maximum (remaining) line length. */ + for (n_in = 0; n_in < max;) { + nch = uni_utf8_get_char_n(&input[n_in], + len - n_in, &ch); + if (nch <= 0) + break; + if ((n_in + nch) > max) + break; + n_in += nch; + } + + /* Encode this fragment up until the maximum fragment size or + the first invalid UTF-8 code point in the input. */ + if (n_in > 0) { + old_bufsize = output->used; + if (!base64_encode_more(&b64enc, input, n_in, + &n_in, output)) + i_unreached(); + n_out = output->used - old_bufsize; + + /* Update sizes and pointers */ + i_assert(len >= n_in); + i_assert(line_len_left >= n_out); + input += n_in; + len -= n_in; + line_len_left -= n_out; + } + + /* Determine whether a repacement character needs to be written + and how much space there is left for it on the current line. + */ + space = 0; + if (nch <= 0) { + space = base64_encode_get_full_space( + &b64enc, line_len_left); + } + + /* Start a new line once insufficient space is available. */ + if ((nch > 0 && len > 0) || + (nch <= 0 && space < rep_char_len)) { + old_bufsize = output->used; + if (!base64_encode_finish(&b64enc, output)) + i_unreached(); + n_out = output->used - old_bufsize; + i_assert(line_len_left >= n_out); + + str_append(output, "?=\n\t=?utf-8?b?"); + line_len_left = MIME_MAX_LINE_LEN - + MIME_WRAPPER_LEN - 1; + base64_encode_reset(&b64enc); + } + + /* Write replacement character if needed. */ + n_in = 0; + n_out = 0; + if (nch <= 0) { + old_bufsize = output->used; + if (!base64_encode_more(&b64enc, rep_char, rep_char_len, + NULL, output)) + i_unreached(); + + n_in = 1; + n_out = output->used - old_bufsize; + + /* Skip more invalid characters in the input. */ + for (; n_in < len; n_in++) { + nch = uni_utf8_get_char_n(&input[n_in], + len - n_in, &ch); + if (nch > 0) + break; + } + } + + /* Update sizes and pointers */ + i_assert(line_len_left >= n_out); + input += n_in; + len -= n_in; + line_len_left -= n_out; + + if (len == 0) + break; + } + if (!base64_encode_finish(&b64enc, output)) + i_unreached(); + str_append(output, "?="); +} + +void message_header_encode(const char *input, string_t *output) +{ + message_header_encode_data((const void *)input, strlen(input), output); +} + +void message_header_encode_data(const unsigned char *input, size_t len, + string_t *output) +{ + size_t i, j, first_line_len, cur_line_len, last_idx; + size_t enc_chars, enc_len, base64_len, q_len; + const unsigned char *next_line_input; + size_t next_line_len = 0; + bool use_q, cr; + + /* find the first word that needs encoding */ + for (i = 0; i < len; i++) { + if (input_idx_need_encoding(input, i, len)) + break; + } + if (i == len) { + /* no encoding necessary */ + str_append_data(output, input, len); + return; + } + /* go back to the beginning of the word so it is fully encoded */ + if (input[i] != '\r' && input[i] != '\n') { + while (i > 0 && !IS_LWSP(input[i-1])) + i--; + } + + /* write the prefix */ + str_append_data(output, input, i); + first_line_len = j = i; + while (j > 0 && input[j-1] != '\n') j--; + if (j != 0) + first_line_len = j; + + input += i; + len -= i; + + /* we'll encode data only up to the next LF, the rest is handled + recursively. */ + next_line_input = memchr(input, '\n', len); + if (next_line_input != NULL) { + cur_line_len = next_line_input - input; + if (cur_line_len > 0 && input[cur_line_len-1] == '\r') { + cur_line_len--; + next_line_input = input + cur_line_len; + } + next_line_len = len - cur_line_len; + len = cur_line_len; + } + + /* find the last word that needs encoding */ + last_idx = 0; enc_chars = 0; + for (i = 0; i < len; i++) { + if (input_idx_need_encoding(input, i, len)) { + last_idx = i + 1; + enc_chars++; + } + } + while (last_idx < len && !IS_LWSP(input[last_idx])) + last_idx++; + + /* figure out if we should use Q or B encoding. Prefer Q if it's not + too much larger. */ + enc_len = last_idx; + base64_len = MAX_BASE64_ENCODED_SIZE(enc_len); + q_len = enc_len + enc_chars*3; + use_q = q_len*2/3 <= base64_len; + + /* and do it */ + if (enc_len == 0) + ; + else if (use_q) + message_header_encode_q(input, enc_len, output, first_line_len); + else + message_header_encode_b(input, enc_len, output, first_line_len); + str_append_data(output, input + last_idx, len - last_idx); + + if (next_line_input != NULL) { + /* we're at [CR]LF */ + i = 0; + if (next_line_input[0] == '\r') { + cr = TRUE; + i++; + } else { + cr = FALSE; + } + i_assert(next_line_input[i] == '\n'); + if (++i == next_line_len) + return; /* drop trailing [CR]LF */ + + if (cr) + str_append_c(output, '\r'); + str_append_c(output, '\n'); + + if (next_line_input[i] == ' ' || next_line_input[i] == '\t') { + str_append_c(output, next_line_input[i]); + i++; + } else { + /* make it valid folding whitespace by adding a TAB */ + str_append_c(output, '\t'); + } + message_header_encode_data(next_line_input+i, next_line_len-i, + output); + } +} |