1 files changed, 406 insertions, 0 deletions
diff --git a/src/lib-mail/message-header-encode.c b/src/lib-mail/message-header-encode.c
new file mode 100644
index 0000000..a9410a8
--- /dev/null
+++ b/src/lib-mail/message-header-encode.c
@@ -0,0 +1,406 @@
+/* Copyright (c) 2009-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "unichar.h"
+#include "base64.h"
+#include "message-header-encode.h"
+
+#define MIME_WRAPPER_LEN (strlen("=?utf-8?q?""?=")) /* chars used by the "=?utf-8?q?" prefix plus the "?=" suffix */
+#define MIME_MAX_LINE_LEN 76 /* line length budget used when laying out encoded-words */
+
+#define IS_LWSP(c) \
+	((c) == ' ' || (c) == '\t' || (c) == '\n') /* matches SP, TAB and LF; CR is not included */
+
+/* Tell whether the octet at input[i] (in a buffer of len octets) makes
+   encoding necessary. Returns TRUE for:
+    - octets with the high bit set and octets below 32 other than TAB/CR/LF,
+    - a CR that is not followed by LF,
+    - a [CR]LF that ends the input or is not followed by SP/TAB,
+    - "=?" at the start of the input or right after LWSP.
+   Everything else yields FALSE. */
+static bool
+input_idx_need_encoding(const unsigned char *input, size_t i, size_t len)
+{
+	unsigned char c = input[i];
+
+	if (c == '\t') {
+		/* a TAB never needs to be encoded */
+		return FALSE;
+	}
+	if (c == '=') {
+		/* Only a "=?" that begins the input or follows LWSP counts.
+		   The LWSP itself must not be reported, which is why this
+		   looks backwards from the '=' instead. */
+		if (i != 0 && !IS_LWSP(input[i-1]))
+			return FALSE;
+		return i+2 <= len && memcmp(input + i, "=?", 2) == 0;
+	}
+	if (c == '\r') {
+		/* a lone CR needs encoding; for CRLF verify the LF as well */
+		if (i+1 == len || input[i+1] != '\n')
+			return TRUE;
+		i++;
+		c = '\n';
+	}
+	if (c == '\n') {
+		if (i+1 == len) {
+			/* trailing LF - we need to drop it */
+			return TRUE;
+		}
+		i_assert(i+1 < len);
+		/* LF not followed by whitespace - we need to add the
+		   whitespace */
+		return input[i+1] != '\t' && input[i+1] != ' ';
+	}
+	/* 8bit chars and the remaining control chars */
+	return (c & 0x80) != 0 || c < 32;
+}
+
+/* Q-encode input[0..len) into output as "=?utf-8?q?...?=" encoded-words.
+   Nothing is appended when len is 0.
+   - SP becomes '_'. '=', '?', '_', single octets outside 0x20..0x7e and every
+     octet of a multi-byte UTF-8 character become "=XX" (uppercase hex).
+     Other ASCII characters are copied unchanged.
+   - Each run of octets that does not decode as UTF-8 is replaced by a single
+     escaped UNICODE_REPLACEMENT_CHAR_UTF8.
+   - first_line_len is treated as the number of columns already used on the
+     current output line. When that leaves too little room, the first
+     encoded-word is moved to a new "\n\t" continuation line.
+   - When a character no longer fits on the line, the encoded-word is closed
+     and a new one is opened on a "\n\t" continuation line, so a character is
+     never split between two encoded-words. */
+void message_header_encode_q(const unsigned char *input, size_t len,
+			     string_t *output, size_t first_line_len)
+{
+	const unsigned char *const repl =
+		(const unsigned char *)UNICODE_REPLACEMENT_CHAR_UTF8;
+	const unsigned int repl_len =
+		UNICODE_REPLACEMENT_CHAR_UTF8_LEN;
+	/* columns left on a line after the "=?utf-8?q?" "?=" wrapper */
+	const size_t full_line = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN;
+	size_t room = full_line;
+	bool in_bad_run = FALSE;
+
+	if (len == 0)
+		return;
+
+	if (first_line_len < full_line - 3) {
+		/* share the current line with what is already on it */
+		room -= first_line_len;
+	} else {
+		/* hardly any room left - begin on a continuation line; the
+		   TAB takes one column */
+		str_append(output, "\n\t");
+		room--;
+	}
+
+	str_append(output, "=?utf-8?q?");
+	do {
+		const unsigned char c = input[0];
+		unichar_t ch;
+		size_t take = 1, emit, k;
+		bool bad = FALSE;
+
+		/* Work out how many input octets the next character takes
+		   (take) and how many output columns it needs (emit). */
+		if (c == ' ') {
+			/* SP is written as a single '_' */
+			emit = 1;
+		} else if (c == '=' || c == '?' || c == '_') {
+			/* characters with a special meaning get escaped */
+			emit = 3;
+		} else {
+			int nch = uni_utf8_get_char_n(input, len, &ch);
+
+			if (nch <= 0) {
+				/* Not decodable as UTF-8: skip one octet. Only
+				   the first octet of such a run produces a
+				   replacement character. */
+				bad = TRUE;
+				if (in_bad_run)
+					emit = 0;
+				else
+					emit = repl_len * 3;
+			} else if (nch > 1) {
+				/* multi-byte character: one escape per octet */
+				take = nch;
+				emit = take * 3;
+			} else if (ch < 0x20 || ch > 0x7e) {
+				/* control character or DEL */
+				i_assert(ch < 0x80);
+				emit = 3;
+			} else {
+				/* any other ASCII character is copied as is */
+				emit = 1;
+			}
+		}
+		/* remember for the next round whether a bad run is ongoing */
+		in_bad_run = bad;
+
+		/* Close this encoded-word and open a new one on the next
+		   line when the character no longer fits. */
+		if (room < emit) {
+			str_append(output, "?=\n\t=?utf-8?q?");
+			room = full_line - 1;
+		}
+
+		/* Write the character */
+		if (c == ' ') {
+			str_append_c(output, '_');
+		} else if (bad) {
+			/* emit == 0 means the replacement character was
+			   already written for this run */
+			if (emit > 0) {
+				for (k = 0; k < repl_len; k++)
+					str_printfa(output, "=%02X", repl[k]);
+			}
+		} else if (emit > 1) {
+			/* special, control or multi-byte character */
+			for (k = 0; k < take; k++)
+				str_printfa(output, "=%02X", input[k]);
+		} else {
+			str_append_c(output, c);
+		}
+
+		/* advance past the consumed octets */
+		i_assert(len >= take);
+		room -= emit;
+		input += take;
+		len -= take;
+	} while (len > 0);
+
+	str_append(output, "?=");
+}
+
+/* Base64-encode input[0..len) into output as "=?utf-8?b?...?=" encoded-words.
+   Nothing is appended when len is 0.
+   - The input is split at UTF-8 character boundaries so that each fragment's
+     encoding fits on its line; every further encoded-word goes on its own
+     "\n\t" continuation line.
+   - Each run of octets that does not decode as UTF-8 is replaced by a single
+     UNICODE_REPLACEMENT_CHAR_UTF8.
+   - first_line_len is treated as the number of columns already used on the
+     current output line; if too few remain, output starts with "\n\t". */
+void message_header_encode_b(const unsigned char *input, size_t len,
+			     string_t *output, size_t first_line_len)
+{
+	const unsigned char *const repl =
+		(const unsigned char *)UNICODE_REPLACEMENT_CHAR_UTF8;
+	const unsigned int repl_len = UNICODE_REPLACEMENT_CHAR_UTF8_LEN;
+	/* columns left on a line after the "=?utf-8?b?" "?=" wrapper */
+	const size_t full_line = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN;
+	struct base64_encoder b64enc;
+	size_t room = full_line;
+
+	if (len == 0)
+		return;
+
+	if (first_line_len < full_line - 3) {
+		/* share the current line with what is already on it */
+		room -= first_line_len;
+	} else {
+		/* begin on a continuation line; the TAB takes one column */
+		str_append(output, "\n\t");
+		room--;
+	}
+	str_append(output, "=?utf-8?b?");
+	base64_encode_init(&b64enc, &base64_scheme, 0, 0);
+	do {
+		unichar_t ch;
+		size_t limit, take = 0, mark, written;
+		bool bad = FALSE, wrap;
+
+		/* Upper bound for the number of input octets that can still
+		   be encoded on (the remainder of) this line. */
+		limit = base64_encode_get_full_space(&b64enc, room);
+		if (limit > len)
+			limit = len;
+
+		/* Walk over whole UTF-8 characters. Stop at the first octet
+		   that does not decode, or before the character that would
+		   cross the limit. */
+		while (take < limit) {
+			int nch = uni_utf8_get_char_n(&input[take],
+						      len - take, &ch);
+
+			if (nch <= 0) {
+				bad = TRUE;
+				break;
+			}
+			if (take + nch > limit)
+				break;
+			take += nch;
+		}
+
+		/* Encode the valid fragment found above, if any. */
+		if (take > 0) {
+			mark = output->used;
+			if (!base64_encode_more(&b64enc, input, take,
+						&take, output))
+				i_unreached();
+			written = output->used - mark;
+
+			i_assert(len >= take);
+			i_assert(room >= written);
+			input += take;
+			len -= take;
+			room -= written;
+		}
+
+		/* Wrap when valid input remains that did not fit, or when the
+		   line cannot hold the replacement character that is due. */
+		if (bad) {
+			limit = base64_encode_get_full_space(&b64enc, room);
+			wrap = limit < repl_len;
+		} else {
+			wrap = len > 0;
+		}
+		if (wrap) {
+			/* flush the encoder before closing this encoded-word */
+			mark = output->used;
+			if (!base64_encode_finish(&b64enc, output))
+				i_unreached();
+			written = output->used - mark;
+			i_assert(room >= written);
+
+			str_append(output, "?=\n\t=?utf-8?b?");
+			room = full_line - 1;
+			base64_encode_reset(&b64enc);
+		}
+
+		if (bad) {
+			size_t skip = 1;
+
+			/* Stand-in for the whole undecodable run. */
+			mark = output->used;
+			if (!base64_encode_more(&b64enc, repl, repl_len,
+						NULL, output))
+				i_unreached();
+			written = output->used - mark;
+
+			/* skip this octet and all undecodable ones after it */
+			while (skip < len &&
+			       uni_utf8_get_char_n(&input[skip],
+						   len - skip, &ch) <= 0)
+				skip++;
+			i_assert(room >= written);
+			input += skip;
+			len -= skip;
+			room -= written;
+		}
+	} while (len > 0);
+
+	if (!base64_encode_finish(&b64enc, output))
+		i_unreached();
+	str_append(output, "?=");
+}
+
+void message_header_encode(const char *input, string_t *output)
+{ /* convenience form for NUL-terminated input: strlen() supplies the length */
+	message_header_encode_data((const void *)input, strlen(input), output);
+}
+
+/* Append the len-byte header value in input to output, converting what cannot
+   be written as it is into encoded-words. Text needing no encoding is copied
+   verbatim. On each input line, everything from the first through the last
+   word that needs encoding goes to message_header_encode_q() or _b(); Q is
+   preferred unless its estimated size is too much larger. A trailing [CR]LF
+   is dropped, and a [CR]LF not followed by SP/TAB gets a TAB added after it;
+   the text after a line break is handled by a recursive call. */
+void message_header_encode_data(const unsigned char *input, size_t len,
+				string_t *output)
+{
+	size_t i, j, first_line_len, cur_line_len, last_idx;
+	size_t enc_chars, enc_len, base64_len, q_len, next_line_len = 0;
+	const unsigned char *next_line_input;
+	bool use_q, cr;
+
+	/* find the first word that needs encoding */
+	for (i = 0; i < len; i++) {
+		if (input_idx_need_encoding(input, i, len))
+			break;
+	}
+	if (i == len) {
+		/* no encoding necessary */
+		str_append_data(output, input, len);
+		return;
+	}
+	/* go back to the beginning of the word so it is fully encoded */
+	if (input[i] != '\r' && input[i] != '\n') {
+		while (i > 0 && !IS_LWSP(input[i-1]))
+			i--;
+	}
+
+	/* write the prefix */
+	str_append_data(output, input, i);
+	/* The prefix may span several folded lines. Only the part after its
+	   last LF shares a line with the first encoded-word. */
+	j = i;
+	while (j > 0 && input[j-1] != '\n') j--;
+	first_line_len = i - j;
+
+	input += i;
+	len -= i;
+
+	/* we'll encode data only up to the next LF, the rest is handled
+	   recursively. */
+	next_line_input = memchr(input, '\n', len);
+	if (next_line_input != NULL) {
+		cur_line_len = next_line_input - input;
+		if (cur_line_len > 0 && input[cur_line_len-1] == '\r') {
+			cur_line_len--;
+			next_line_input = input + cur_line_len;
+		}
+		next_line_len = len - cur_line_len;
+		len = cur_line_len;
+	}
+
+	/* find the last word that needs encoding */
+	last_idx = 0; enc_chars = 0;
+	for (i = 0; i < len; i++) {
+		if (input_idx_need_encoding(input, i, len)) {
+			last_idx = i + 1;
+			enc_chars++;
+		}
+	}
+	while (last_idx < len && !IS_LWSP(input[last_idx]))
+		last_idx++;
+
+	/* figure out if we should use Q or B encoding. Prefer Q if it's not
+	   too much larger. */
+	enc_len = last_idx;
+	base64_len = MAX_BASE64_ENCODED_SIZE(enc_len);
+	q_len = enc_len + enc_chars*3;
+	use_q = q_len*2/3 <= base64_len;
+
+	/* and do it (there is nothing to encode when enc_len is 0) */
+	if (enc_len > 0 && use_q)
+		message_header_encode_q(input, enc_len, output, first_line_len);
+	else if (enc_len > 0)
+		message_header_encode_b(input, enc_len, output, first_line_len);
+	str_append_data(output, input + last_idx, len - last_idx);
+
+	if (next_line_input != NULL) {
+		/* we're at [CR]LF */
+		cr = next_line_input[0] == '\r';
+		i = cr ? 1 : 0;
+		i_assert(next_line_input[i] == '\n');
+		if (++i == next_line_len)
+			return; /* drop trailing [CR]LF */
+
+		if (cr)
+			str_append_c(output, '\r');
+		str_append_c(output, '\n');
+
+		if (next_line_input[i] == ' ' || next_line_input[i] == '\t') {
+			str_append_c(output, next_line_input[i]);
+			i++;
+		} else {
+			/* make it valid folding whitespace by adding a TAB */
+			str_append_c(output, '\t');
+		}
+		message_header_encode_data(next_line_input+i, next_line_len-i,
+					   output);
+	}
+}