summary refs log tree commit diff stats
path: root/src/lib-mail/message-header-encode.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/lib-mail/message-header-encode.c 406
1 files changed, 406 insertions, 0 deletions
diff --git a/src/lib-mail/message-header-encode.c b/src/lib-mail/message-header-encode.c
new file mode 100644
index 0000000..a9410a8
--- /dev/null
+++ b/src/lib-mail/message-header-encode.c
@@ -0,0 +1,406 @@
+/* Copyright (c) 2009-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "unichar.h"
+#include "base64.h"
+#include "message-header-encode.h"
+
+#define MIME_WRAPPER_LEN (strlen("=?utf-8?q?""?=")) /* octets taken by the encoded-word framing: the =?utf-8?q? prefix plus the ?= suffix */
+#define MIME_MAX_LINE_LEN 76 /* line-length budget the encoders start from when folding encoded-words */
+
+#define IS_LWSP(c) \
+ ((c) == ' ' || (c) == '\t' || (c) == '\n') /* true for SP, TAB and LF; CR is not matched */
+
+/* Decides whether the octet at input[i] prevents the data from being copied
+   to the output verbatim.
+
+   input - data being examined
+   i     - index of the octet to check (must be < len)
+   len   - number of octets in input
+
+   Returns TRUE for: a CR that isn't followed by LF, a trailing [CR]LF,
+   a [CR]LF that isn't followed by SP or TAB, an "=?" sequence located at
+   the start of the data or right after LWSP, any 8bit octet and any ASCII
+   control character other than TAB. Returns FALSE otherwise. */
+static bool
+input_idx_need_encoding(const unsigned char *input, size_t i, size_t len)
+{
+	const unsigned char c = input[i];
+	size_t lf_pos = i;
+
+	if (c == '\t') {
+		/* TAB is always passed through untouched */
+		return FALSE;
+	}
+
+	if (c == '\r' || c == '\n') {
+		if (c == '\r') {
+			/* CR is tolerated only as the first half of CRLF */
+			if (i + 1 == len || input[i + 1] != '\n')
+				return TRUE;
+			/* continue by verifying the LF as well */
+			lf_pos = i + 1;
+		}
+		if (lf_pos + 1 == len) {
+			/* trailing LF - it has to be dropped */
+			return TRUE;
+		}
+		i_assert(lf_pos + 1 < len);
+		if (input[lf_pos + 1] == '\t' || input[lf_pos + 1] == ' ')
+			return FALSE;
+		/* LF without following whitespace - the whitespace has to
+		   be added */
+		return TRUE;
+	}
+
+	if (c == '=') {
+		/* <LWSP>=? - the preceding octet is inspected here, because
+		   the LWSP itself must not be reported as needing
+		   encoding. */
+		if (i > 0 && !IS_LWSP(input[i - 1]))
+			return FALSE;
+		if (i + 2 > len)
+			return FALSE;
+		return input[i + 1] == '?' ? TRUE : FALSE;
+	}
+
+	/* 8bit octets and the remaining control characters */
+	if ((c & 0x80) != 0 || c < 32)
+		return TRUE;
+	return FALSE;
+}
+
+/* Appends input to output as one or more "=?utf-8?q?...?=" encoded-words.
+
+   input          - octets to encode
+   len            - number of octets in input; nothing is written when 0
+   output         - string the result is appended to
+   first_line_len - amount subtracted from the line budget of the first
+                    encoded-word; if it is too large, "\n\t" is written
+                    before the first encoded-word instead
+
+   SP is written as '_'; '=', '?', '_', control characters, DEL and every
+   octet of a multi-octet UTF-8 sequence are written as "=XX". A run of
+   octets that doesn't decode as UTF-8 is replaced by a single (escaped)
+   Unicode replacement character. Whenever the current line runs out of
+   room, the encoded-word is closed and a new one is started after "\n\t". */
+void message_header_encode_q(const unsigned char *input, size_t len,
+			     string_t *output, size_t first_line_len)
+{
+	static const unsigned char *rep_char =
+		(const unsigned char *)UNICODE_REPLACEMENT_CHAR_UTF8;
+	static const unsigned int rep_char_len =
+		UNICODE_REPLACEMENT_CHAR_UTF8_LEN;
+	/* room available for payload on a line holding one encoded-word */
+	const size_t payload_max = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN;
+	size_t room;
+	bool in_bad_run = FALSE;
+
+	if (len == 0)
+		return;
+
+	if (first_line_len < payload_max - 3) {
+		room = payload_max - first_line_len;
+	} else {
+		/* too little left on the first line - fold right away; the
+		   TAB takes one position of the new line */
+		str_append(output, "\n\t");
+		room = payload_max - 1;
+	}
+
+	str_append(output, "=?utf-8?q?");
+	do {
+		const unsigned char c = input[0];
+		unichar_t ch;
+		int nch = 1;
+		size_t consumed = 1, emitted, k;
+
+		/* Work out how many input octets this step consumes and how
+		   many output octets it produces. */
+		if (c == ' ') {
+			/* SP becomes a single '_' */
+			emitted = 1;
+		} else if (c == '=' || c == '?' || c == '_') {
+			/* characters with a special meaning are escaped */
+			emitted = 3;
+		} else {
+			nch = uni_utf8_get_char_n(input, len, &ch);
+			if (nch <= 0) {
+				/* Invalid UTF-8: only the first octet of a
+				   burst produces a replacement character,
+				   the following ones produce nothing. */
+				emitted = in_bad_run ? 0 : rep_char_len * 3;
+			} else if (nch > 1) {
+				/* every octet of a multi-octet code point
+				   gets its own escape sequence */
+				consumed = nch;
+				emitted = consumed * 3;
+			} else if (ch < 0x20 || ch > 0x7e) {
+				/* control characters are escaped */
+				i_assert(ch < 0x80);
+				emitted = 3;
+			} else {
+				/* plain ASCII is copied as-is */
+				emitted = 1;
+			}
+		}
+		in_bad_run = (nch <= 0);
+
+		/* Start a new line once insufficient space is available to
+		   write more to the current line. */
+		if (room < emitted) {
+			str_append(output, "?=\n\t=?utf-8?q?");
+			room = payload_max - 1;
+		}
+
+		/* Emit the output for this step. */
+		if (c == ' ') {
+			str_append_c(output, '_');
+		} else if (in_bad_run) {
+			if (emitted > 0) {
+				for (k = 0; k < rep_char_len; k++)
+					str_printfa(output, "=%02X", rep_char[k]);
+			}
+		} else if (emitted > 1) {
+			for (k = 0; k < consumed; k++)
+				str_printfa(output, "=%02X", input[k]);
+		} else {
+			str_append_c(output, c);
+		}
+
+		/* Advance past what was consumed. */
+		i_assert(len >= consumed);
+		room -= emitted;
+		input += consumed;
+		len -= consumed;
+	} while (len > 0);
+	str_append(output, "?=");
+}
+
+/* Appends input to output as one or more "=?utf-8?b?...?=" encoded-words.
+
+   input          - octets to encode
+   len            - number of octets in input; nothing is written when 0
+   output         - string the result is appended to
+   first_line_len - amount subtracted from the line budget of the first
+                    encoded-word; if it is too large, "\n\t" is written
+                    before the first encoded-word instead
+
+   Fragments are cut on UTF-8 code point boundaries. A run of octets that
+   doesn't decode as UTF-8 is replaced by a single Unicode replacement
+   character. Whenever the current line runs out of room, the encoded-word
+   is closed and a new one is started after "\n\t". */
+void message_header_encode_b(const unsigned char *input, size_t len,
+			     string_t *output, size_t first_line_len)
+{
+	static const unsigned char *rep_char =
+		(const unsigned char *)UNICODE_REPLACEMENT_CHAR_UTF8;
+	static const unsigned int rep_char_len =
+		UNICODE_REPLACEMENT_CHAR_UTF8_LEN;
+	/* room available for payload on a line holding one encoded-word */
+	const size_t payload_max = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN;
+	struct base64_encoder b64enc;
+	size_t room;
+
+	if (len == 0)
+		return;
+
+	if (first_line_len < payload_max - 3) {
+		room = payload_max - first_line_len;
+	} else {
+		/* too little left on the first line - fold right away; the
+		   TAB takes one position of the new line */
+		str_append(output, "\n\t");
+		room = payload_max - 1;
+	}
+
+	str_append(output, "=?utf-8?b?");
+	base64_encode_init(&b64enc, &base64_scheme, 0, 0);
+	do {
+		unichar_t ch;
+		size_t limit, mark, taken, written;
+		int nch = 1;
+		bool bad_seq, wrap;
+
+		/* Upper bound for the number of input octets that can still
+		   be encoded on (the remainder of) this line. */
+		limit = base64_encode_get_full_space(&b64enc, room);
+		if (limit > len)
+			limit = len;
+
+		/* Walk over whole UTF-8 code points, stopping at the first
+		   invalid one or at the last code point that still fits
+		   completely below the limit. */
+		taken = 0;
+		while (taken < limit) {
+			nch = uni_utf8_get_char_n(&input[taken],
+						  len - taken, &ch);
+			if (nch <= 0 || taken + nch > limit)
+				break;
+			taken += nch;
+		}
+
+		/* Encode the fragment found above. */
+		if (taken > 0) {
+			mark = output->used;
+			if (!base64_encode_more(&b64enc, input, taken,
+						&taken, output))
+				i_unreached();
+			written = output->used - mark;
+
+			i_assert(len >= taken);
+			i_assert(room >= written);
+			input += taken;
+			len -= taken;
+			room -= written;
+		}
+
+		/* A new line is needed when valid input remains that did not
+		   fit, or when a replacement character has to be written and
+		   the current line has no room for it. */
+		bad_seq = (nch <= 0);
+		if (bad_seq) {
+			wrap = base64_encode_get_full_space(&b64enc, room) <
+				rep_char_len;
+		} else {
+			wrap = (len > 0);
+		}
+		if (wrap) {
+			mark = output->used;
+			if (!base64_encode_finish(&b64enc, output))
+				i_unreached();
+			written = output->used - mark;
+			i_assert(room >= written);
+
+			str_append(output, "?=\n\t=?utf-8?b?");
+			room = payload_max - 1;
+			base64_encode_reset(&b64enc);
+		}
+
+		/* Write the replacement character if needed and skip the
+		   whole burst of invalid octets in the input. */
+		if (bad_seq) {
+			mark = output->used;
+			if (!base64_encode_more(&b64enc, rep_char, rep_char_len,
+						NULL, output))
+				i_unreached();
+			written = output->used - mark;
+
+			taken = 1;
+			while (taken < len &&
+			       uni_utf8_get_char_n(&input[taken],
+						   len - taken, &ch) <= 0)
+				taken++;
+
+			i_assert(room >= written);
+			input += taken;
+			len -= taken;
+			room -= written;
+		}
+	} while (len > 0);
+	if (!base64_encode_finish(&b64enc, output))
+		i_unreached();
+	str_append(output, "?=");
+}
+
+/* Encodes the NUL-terminated string input and appends the result to output.
+   The string's length is taken with strlen() and the work is done by
+   message_header_encode_data(). */
+void message_header_encode(const char *input, string_t *output)
+{
+	const unsigned char *data = (const void *)input;
+	size_t data_len = strlen(input);
+
+	message_header_encode_data(data, data_len, output);
+}
+
+/* Appends input to output, MIME-encoding only the part that needs it.
+
+   input  - header data to encode
+   len    - number of octets in input
+   output - string the result is appended to
+
+   Data that needs no encoding is copied verbatim. Otherwise everything from
+   the first word that needs encoding up to the last such word on the same
+   line is written as "Q" or "B" encoded-words (whichever is estimated to
+   be more compact, with a preference for "Q"); the text before and after
+   is copied as-is. Only the data up to the next LF is handled per call;
+   the following lines are handled recursively. A trailing [CR]LF is
+   dropped, and an LF that is not followed by SP or TAB gets a TAB added
+   after it. */
+void message_header_encode_data(const unsigned char *input, size_t len,
+				string_t *output)
+{
+	size_t i, j, first_line_len, cur_line_len, last_idx;
+	size_t enc_chars, enc_len, base64_len, q_len;
+	const unsigned char *next_line_input;
+	size_t next_line_len = 0;
+	bool use_q, cr;
+
+	/* find the first word that needs encoding */
+	for (i = 0; i < len; i++) {
+		if (input_idx_need_encoding(input, i, len))
+			break;
+	}
+	if (i == len) {
+		/* no encoding necessary */
+		str_append_data(output, input, len);
+		return;
+	}
+	/* go back to the beginning of the word so it is fully encoded */
+	if (input[i] != '\r' && input[i] != '\n') {
+		while (i > 0 && !IS_LWSP(input[i-1]))
+			i--;
+	}
+
+	/* write the prefix */
+	str_append_data(output, input, i);
+	/* The encoders need the number of octets that already occupy the
+	   line on which the first encoded-word starts. The prefix may span
+	   several (folded) lines, so only the part after its last LF counts:
+	   j is moved back to the start of that line and the distance from
+	   there to i is used. Without an LF in the prefix j ends up at 0 and
+	   the whole prefix length is used. */
+	j = i;
+	while (j > 0 && input[j-1] != '\n')
+		j--;
+	first_line_len = i - j;
+
+	input += i;
+	len -= i;
+
+	/* we'll encode data only up to the next LF, the rest is handled
+	   recursively. */
+	next_line_input = memchr(input, '\n', len);
+	if (next_line_input != NULL) {
+		cur_line_len = next_line_input - input;
+		if (cur_line_len > 0 && input[cur_line_len-1] == '\r') {
+			/* leave the CR of a CRLF to the next line's part */
+			cur_line_len--;
+			next_line_input = input + cur_line_len;
+		}
+		next_line_len = len - cur_line_len;
+		len = cur_line_len;
+	}
+
+	/* find the last word that needs encoding */
+	last_idx = 0; enc_chars = 0;
+	for (i = 0; i < len; i++) {
+		if (input_idx_need_encoding(input, i, len)) {
+			last_idx = i + 1;
+			enc_chars++;
+		}
+	}
+	/* extend to the end of that word */
+	while (last_idx < len && !IS_LWSP(input[last_idx]))
+		last_idx++;
+
+	/* figure out if we should use Q or B encoding. Prefer Q if it's not
+	   too much larger. */
+	enc_len = last_idx;
+	base64_len = MAX_BASE64_ENCODED_SIZE(enc_len);
+	q_len = enc_len + enc_chars*3;
+	use_q = q_len*2/3 <= base64_len;
+
+	/* and do it */
+	if (enc_len == 0)
+		;
+	else if (use_q)
+		message_header_encode_q(input, enc_len, output, first_line_len);
+	else
+		message_header_encode_b(input, enc_len, output, first_line_len);
+	/* the rest of the line needs no encoding */
+	str_append_data(output, input + last_idx, len - last_idx);
+
+	if (next_line_input != NULL) {
+		/* we're at [CR]LF */
+		i = 0;
+		if (next_line_input[0] == '\r') {
+			cr = TRUE;
+			i++;
+		} else {
+			cr = FALSE;
+		}
+		i_assert(next_line_input[i] == '\n');
+		if (++i == next_line_len)
+			return; /* drop trailing [CR]LF */
+
+		if (cr)
+			str_append_c(output, '\r');
+		str_append_c(output, '\n');
+
+		if (next_line_input[i] == ' ' || next_line_input[i] == '\t') {
+			str_append_c(output, next_line_input[i]);
+			i++;
+		} else {
+			/* make it valid folding whitespace by adding a TAB */
+			str_append_c(output, '\t');
+		}
+		message_header_encode_data(next_line_input+i, next_line_len-i,
+					   output);
+	}
+}