Adding upstream version 1:2.3.19.1+dfsg1.upstream/1%2.3.19.1+dfsg1 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-28 09:51:24 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-28 09:51:24 +0000
commit: f7548d6d28c313cf80e6f3ef89aed16a19815df1 (patch)
tree: a3f6f2a3f247293bee59ecd28e8cd8ceb6ca064a /src/lib-mail/mail-html2text.c
parent: Initial commit. (diff)
download: dovecot-f7548d6d28c313cf80e6f3ef89aed16a19815df1.tar.xz
dovecot-f7548d6d28c313cf80e6f3ef89aed16a19815df1.zip
1 files changed, 354 insertions, 0 deletions
diff --git a/src/lib-mail/mail-html2text.c b/src/lib-mail/mail-html2text.c
new file mode 100644
index 0000000..9332b64
--- /dev/null
+++ b/src/lib-mail/mail-html2text.c
@@ -0,0 +1,354 @@
+/* Copyright (c) 2011-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "unichar.h"
+#include "message-parser.h"
+#include "mail-html2text.h"
+
+/* TRUE for ' ', tab, CR and LF; c is evaluated up to four times. Zero-width
+   space (&#x200B;) apparently also belongs here, but it's tricky to handle. */
+#define HTML_WHITESPACE(c) \
+	((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
+
+enum html_state {
+	/* regular text between tags */
+	HTML_STATE_TEXT,
+	/* inside a <tag>, outside any "quoted string" */
+	HTML_STATE_TAG,
+	/* inside a tag, within a "double quoted string" */
+	HTML_STATE_TAG_DQUOTED,
+	/* '\' was seen in a "double quoted string": the next byte is skipped */
+	HTML_STATE_TAG_DQUOTED_ESCAPE,
+	/* inside a tag, within a 'single quoted string' */
+	HTML_STATE_TAG_SQUOTED,
+	/* '\' was seen in a 'single quoted string': the next byte is skipped */
+	HTML_STATE_TAG_SQUOTED_ESCAPE,
+	/* inside a <!-- comment */
+	HTML_STATE_COMMENT,
+	/* comment is ending, we've seen "--" and now just waiting for ">" */
+	HTML_STATE_COMMENT_END,
+	/* inside <script>, waiting for the closing </script> */
+	HTML_STATE_SCRIPT,
+	/* inside <style>, waiting for the closing </style> */
+	HTML_STATE_STYLE,
+	/* inside <![CDATA[...]]>, waiting for the closing ]]> */
+	HTML_STATE_CDATA
+};
+
+struct mail_html2text {
+	enum mail_html2text_flags flags; /* as given to mail_html2text_init() */
+	enum html_state state; /* current parser state */
+	buffer_t *input; /* input that couldn't be parsed yet */
+	unsigned int quote_level; /* number of currently open <blockquote>s */
+	bool add_newline; /* add '\n' to output when the current tag ends */
+};
+
+static const struct {
+	const char *name; /* entity name without the '&' and ';' */
+	unichar_t chr; /* code point the entity resolves to */
+} html_entities[] = {
+#include "html-entities.h"
+};
+
+struct mail_html2text *
+mail_html2text_init(enum mail_html2text_flags flags)
+{
+	struct mail_html2text *parser = i_new(struct mail_html2text, 1);
+
+	/* input collects bytes that can't be parsed before more data arrives */
+	parser->input = buffer_create_dynamic(default_pool, 512);
+	parser->flags = flags;
+	return parser;
+}
+
+/* TRUE if data starts with the len-byte name followed by whitespace or '>' */
+static bool tag_name_is(const unsigned char *data, size_t size,
+			const char *name, size_t len)
+{
+	return size > len && i_memcasecmp(data, name, len) == 0 &&
+		(HTML_WHITESPACE(data[len]) || data[len] == '>');
+}
+
+/* Identifies the tag whose name starts at data (just after the '<') and
+   sets ht->state for it. Returns the number of bytes the caller should
+   consume counting the '<', or 0 if more data is needed to decide. */
+static size_t
+parse_tag_name(struct mail_html2text *ht,
+	       const unsigned char *data, size_t size)
+{
+	size_t pos;
+
+	if (size >= 3 && memcmp(data, "!--", 3) == 0) {
+		ht->state = HTML_STATE_COMMENT;
+		return 4; /* "<!--" */
+	}
+	if (tag_name_is(data, size, "script", 6)) {
+		ht->state = HTML_STATE_SCRIPT;
+		return 8; /* "<script" and the byte after it */
+	}
+	if (tag_name_is(data, size, "style", 5)) {
+		ht->state = HTML_STATE_STYLE;
+		return 7; /* "<style" and the byte after it */
+	}
+	if (size >= 8 && i_memcasecmp(data, "![CDATA[", 8) == 0) {
+		ht->state = HTML_STATE_CDATA;
+		return 9; /* "<![CDATA[" */
+	}
+	if (tag_name_is(data, size, "blockquote", 10)) {
+		ht->quote_level++;
+	} else if (ht->quote_level > 0 &&
+		   size >= 12 && i_memcasecmp(data, "/blockquote>", 12) == 0) {
+		ht->quote_level--;
+		if ((ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) == 0)
+			ht->add_newline = TRUE;
+	} else if (size < 12) {
+		/* short input: wait until the end of the tag name is visible */
+		for (pos = 0; pos < size; pos++) {
+			if (HTML_WHITESPACE(data[pos]) || data[pos] == '>')
+				break;
+		}
+		if (pos == size)
+			return 0;
+	}
+	ht->state = HTML_STATE_TAG;
+	return 1;
+}
+
+/* Resolves an entity name (the text between '&' and ';') to a code point.
+   Names are looked up in html_entities[]; "#nnn", "#xnnn" and "#Xnnn" are
+   numeric references. Returns FALSE without setting *chr_r if unresolved. */
+static bool html_entity_get_unichar(const char *name, unichar_t *chr_r)
+{
+	unichar_t chr;
+
+	for (size_t i = 0; i < N_ELEMENTS(html_entities); i++) {
+		if (strcmp(html_entities[i].name, name) == 0) {
+			*chr_r = html_entities[i].chr;
+			return TRUE;
+		}
+	}
+
+	/* numeric reference: HTML allows both 'x' and 'X' as the hex marker */
+	if (name[0] == '#' &&
+	    (((name[1] == 'x' || name[1] == 'X') &&
+	      str_to_uint32_hex(name+2, &chr) == 0) ||
+	     str_to_uint32(name+1, &chr) == 0) &&
+	     uni_is_valid_ucs4(chr)) {
+		*chr_r = chr;
+		return TRUE;
+	}
+	return FALSE;
+}
+
+/* Parses the entity name that follows a '&'; a resolved entity is added to
+   output with uni_ucs4_to_utf8_c(), an unresolved one adds nothing. Returns
+   the bytes to consume counting the '&', or 0 if ';' wasn't seen yet. */
+static size_t parse_entity(const unsigned char *data, size_t size,
+			   buffer_t *output)
+{
+	char name[10];
+	unichar_t ucs4;
+	size_t len;
+
+	for (len = 0; ; len++) {
+		if (len == size)
+			return 0; /* need more data */
+		if (HTML_WHITESPACE(data[len]) || len >= sizeof(name))
+			return 1; /* broken entity: consume only the '&' */
+		if (data[len] == ';')
+			break;
+	}
+	i_assert(len < sizeof(name));
+	memcpy(name, data, len);
+	name[len] = '\0';
+	if (html_entity_get_unichar(name, &ucs4))
+		uni_ucs4_to_utf8_c(ucs4, output);
+	return len + 2; /* '&', the name and ';' */
+}
+
+/* Appends ' ' unless output is empty or already ends with ' ' or '\n' */
+static void mail_html2text_add_space(buffer_t *output)
+{
+	const unsigned char *bytes = output->data;
+	size_t len = output->used;
+	if (len > 0 && bytes[len-1] != ' ' && bytes[len-1] != '\n')
+		buffer_append_c(output, ' ');
+}
+
+/* Runs the HTML state machine over data, appending the extracted text to
+   output. Returns the number of bytes consumed; if that is less than size,
+   the rest has to be given again together with more input. */
+static size_t
+parse_data(struct mail_html2text *ht,
+	   const unsigned char *data, size_t size, buffer_t *output)
+{
+	size_t i, ret, max_len;
+
+	for (i = 0; i < size; i++) {
+		unsigned char c = data[i];
+
+		switch (ht->state) {
+		case HTML_STATE_TEXT:
+			if (c == '<') {
+				ret = parse_tag_name(ht, data+i+1, size-i-1);
+				if (ret == 0)
+					return i;
+				i += ret - 1;
+			} else if (ht->quote_level > 0 &&
+				   (ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) != 0) {
+				break; /* quoted text is dropped */
+			} else if (c == '&') {
+				ret = parse_entity(data+i+1, size-i-1, output);
+				if (ret == 0)
+					return i;
+				i += ret - 1;
+			} else {
+				buffer_append_c(output, c);
+			}
+			break;
+		case HTML_STATE_TAG:
+			if (c == '"')
+				ht->state = HTML_STATE_TAG_DQUOTED;
+			else if (c == '\'')
+				ht->state = HTML_STATE_TAG_SQUOTED;
+			else if (c == '>') {
+				ht->state = HTML_STATE_TEXT;
+				if (ht->quote_level > 0 &&
+				    (ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) == 0)
+					buffer_append(output, "\n>", 2);
+				else if (ht->add_newline)
+					buffer_append_c(output, '\n');
+				ht->add_newline = FALSE;
+				mail_html2text_add_space(output);
+			}
+			break;
+		case HTML_STATE_TAG_DQUOTED:
+			if (c == '"')
+				ht->state = HTML_STATE_TAG;
+			else if (c == '\\')
+				ht->state = HTML_STATE_TAG_DQUOTED_ESCAPE;
+			break;
+		case HTML_STATE_TAG_DQUOTED_ESCAPE:
+			ht->state = HTML_STATE_TAG_DQUOTED;
+			break;
+		case HTML_STATE_TAG_SQUOTED:
+			if (c == '\'')
+				ht->state = HTML_STATE_TAG;
+			else if (c == '\\')
+				ht->state = HTML_STATE_TAG_SQUOTED_ESCAPE;
+			break;
+		case HTML_STATE_TAG_SQUOTED_ESCAPE:
+			ht->state = HTML_STATE_TAG_SQUOTED;
+			break;
+		case HTML_STATE_COMMENT:
+			if (c == '-') {
+				if (i+1 == size)
+					return i; /* need to see the next byte */
+				if (data[i+1] == '-') {
+					ht->state = HTML_STATE_COMMENT_END;
+					i++;
+				}
+			}
+			break;
+		case HTML_STATE_COMMENT_END:
+			/* a further '-' keeps the end pending, so "--->" ends too */
+			if (c == '>')
+				ht->state = HTML_STATE_TEXT;
+			else if (c != '-' && !HTML_WHITESPACE(c))
+				ht->state = HTML_STATE_COMMENT;
+			break;
+		case HTML_STATE_SCRIPT:
+			if (c == '<') {
+				max_len = I_MIN(size-i, 9);
+				if (i_memcasecmp(data+i, "</script>", max_len) == 0) {
+					if (max_len < 9)
+						return i; /* matches so far */
+					mail_html2text_add_space(output);
+					ht->state = HTML_STATE_TEXT;
+					i += 8;
+				}
+			}
+			break;
+		case HTML_STATE_STYLE:
+			if (c == '<') {
+				max_len = I_MIN(size-i, 8);
+				if (i_memcasecmp(data+i, "</style>", max_len) == 0) {
+					if (max_len < 8)
+						return i; /* matches so far */
+					mail_html2text_add_space(output);
+					ht->state = HTML_STATE_TEXT;
+					i += 7;
+				}
+			}
+			break;
+		case HTML_STATE_CDATA:
+			if (c == ']') {
+				max_len = I_MIN(size-i, 3);
+				if (i_memcasecmp(data+i, "]]>", max_len) == 0) {
+					if (max_len < 3)
+						return i; /* matches so far */
+					ht->state = HTML_STATE_TEXT;
+					i += 2;
+					break;
+				}
+			}
+			if (ht->quote_level == 0 ||
+			    (ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) == 0)
+				buffer_append_c(output, c);
+			break;
+		}
+	}
+	return i;
+}
+
+/* Feeds size (> 0) bytes of HTML to the parser, appending the text found to
+   output. Input that can't be parsed before more data arrives is kept in
+   ht->input and is parsed again together with the data of the next call. */
+void mail_html2text_more(struct mail_html2text *ht,
+			 const unsigned char *data, size_t size,
+			 buffer_t *output)
+{
+	size_t consumed;
+
+	i_assert(size > 0);
+
+	while (ht->input->used > 0) {
+		size_t leftover = ht->input->used;
+		size_t chunk = I_MIN(size, 128);
+
+		buffer_append(ht->input, data, chunk);
+		consumed = parse_data(ht, ht->input->data,
+				      ht->input->used, output);
+		if (consumed >= leftover) {
+			/* parsed past the leftovers - continue within data */
+			data += consumed - leftover;
+			size -= consumed - leftover;
+			buffer_set_used_size(ht->input, 0);
+		} else if (consumed > 0) {
+			/* stopped within leftovers - eat parsed part, retry */
+			buffer_set_used_size(ht->input, leftover);
+			buffer_delete(ht->input, 0, consumed);
+		} else {
+			/* nothing parsed - the chunk stays in the buffer */
+			data += chunk;
+			size -= chunk;
+			if (size == 0)
+				return;
+		}
+	}
+	consumed = parse_data(ht, data, size, output);
+	buffer_append(ht->input, data + consumed, size - consumed);
+}
+
+/* Frees *_ht with its input buffer and NULLs the pointer; NULL is a no-op */
+void mail_html2text_deinit(struct mail_html2text **_ht)
+{
+	struct mail_html2text *parser = *_ht;
+
+	if (parser != NULL) {
+		*_ht = NULL;
+		buffer_free(&parser->input);
+		i_free(parser);
+	}
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-28 09:51:24 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-28 09:51:24 +0000
commit	f7548d6d28c313cf80e6f3ef89aed16a19815df1 (patch)
tree	a3f6f2a3f247293bee59ecd28e8cd8ceb6ca064a /src/lib-mail/mail-html2text.c
parent	Initial commit. (diff)
download	dovecot-f7548d6d28c313cf80e6f3ef89aed16a19815df1.tar.xz dovecot-f7548d6d28c313cf80e6f3ef89aed16a19815df1.zip