summaryrefslogtreecommitdiffstats
path: root/src/lib-mail/rfc822-parser.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/lib-mail/rfc822-parser.c522
1 files changed, 522 insertions, 0 deletions
diff --git a/src/lib-mail/rfc822-parser.c b/src/lib-mail/rfc822-parser.c
new file mode 100644
index 0000000..c8595b4
--- /dev/null
+++ b/src/lib-mail/rfc822-parser.c
@@ -0,0 +1,522 @@
+/* Copyright (c) 2005-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "strescape.h"
+#include "rfc822-parser.h"
+
+/*
+ atext = ALPHA / DIGIT / ; Any character except controls,
+ "!" / "#" / ; SP, and specials.
+ "$" / "%" / ; Used for atoms
+ "&" / "'" /
+ "*" / "+" /
+ "-" / "/" /
+ "=" / "?" /
+ "^" / "_" /
+ "`" / "{" /
+ "|" / "}" /
+ "~"
+
+ MIME:
+
+ token := 1*<any (US-ASCII) CHAR except SPACE, CTLs,
+ or tspecials>
+ tspecials := "(" / ")" / "<" / ">" / "@" /
+ "," / ";" / ":" / "\" / <">
+ "/" / "[" / "]" / "?" / "="
+
+ So a token is the same as a dot-atom, except that it also stops at '/', '?' and '='.
+*/
+
+/* atext chars are marked with 1, alpha and digits with 2,
+ atext-but-mime-tspecials with 4 */
+unsigned char rfc822_atext_chars[256] = { /* one entry per byte value; each row below covers 16 values */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0-15 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
+ 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 4, /* 32-47 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 4, 0, 4, /* 48-63 */
+ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, /* 80-95 */
+ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 96-111 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, /* 112-127 */
+ /* every byte value in 128-255 is marked with 2, i.e. the same mark as alpha and digits */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 128-143 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 144-159 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 160-175 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 176-191 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 192-207 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 208-223 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 224-239 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 /* 240-255 */
+};
+
+/* Prepare ctx for parsing the size bytes starting at data.
+
+   The whole context is cleared with i_zero() first, so only the fields
+   assigned below differ from their cleared state. last_comment may be NULL;
+   when it is non-NULL, rfc822_skip_comment() stores the text of each
+   skipped comment into it. */
+void rfc822_parser_init(struct rfc822_parser_context *ctx,
+			const unsigned char *data, size_t size,
+			string_t *last_comment)
+{
+	const unsigned char *const data_end = data + size;
+
+	i_zero(ctx);
+	ctx->last_comment = last_comment;
+	ctx->data = data;
+	ctx->end = data_end;
+}
+
+/* Skip over a comment, including any comments nested inside it.
+
+   ctx->data must point to the opening '(' and must be inside the input
+   (ctx->data < ctx->end); both are asserted.
+
+   If ctx->last_comment is non-NULL, it is truncated and then filled with the
+   comment's contents as they are parsed:
+    - (CR)LF of folding whitespace is dropped
+    - the '\' of a quoted-pair is dropped, the quoted character is kept
+    - NUL bytes are replaced with ctx->nul_replacement_str, if it is set
+
+   Returns 1 if there is more data after the closing ')', 0 if the closing
+   ')' was the last byte of the input, and -1 if the comment is unterminated
+   (the matching ')' is missing or the input ends right after a '\'). */
+int rfc822_skip_comment(struct rfc822_parser_context *ctx)
+{
+	const unsigned char *start;
+	size_t len;
+	int level = 1;
+
+	/* check the bounds before dereferencing, like the other parsers
+	   in this file that require a specific first character */
+	i_assert(ctx->data < ctx->end);
+	i_assert(*ctx->data == '(');
+
+	if (ctx->last_comment != NULL)
+		str_truncate(ctx->last_comment, 0);
+
+	/* start always points to the first byte that hasn't yet been
+	   copied to last_comment */
+	start = ++ctx->data;
+	for (; ctx->data < ctx->end; ctx->data++) {
+		switch (*ctx->data) {
+		case '\0':
+			if (ctx->last_comment != NULL &&
+			    ctx->nul_replacement_str != NULL) {
+				/* flush the pending text and substitute
+				   the NUL */
+				str_append_data(ctx->last_comment, start,
+						ctx->data - start);
+				str_append(ctx->last_comment,
+					   ctx->nul_replacement_str);
+				start = ctx->data + 1;
+			}
+			break;
+		case '(':
+			level++;
+			break;
+		case ')':
+			if (--level == 0) {
+				/* the outermost comment ends here */
+				if (ctx->last_comment != NULL) {
+					str_append_data(ctx->last_comment, start,
+							ctx->data - start);
+				}
+				ctx->data++;
+				return ctx->data < ctx->end ? 1 : 0;
+			}
+			break;
+		case '\n':
+			/* folding whitespace, remove the (CR)LF */
+			if (ctx->last_comment == NULL)
+				break;
+			len = ctx->data - start;
+			if (len > 0 && start[len-1] == '\r')
+				len--;
+			str_append_data(ctx->last_comment, start, len);
+			start = ctx->data + 1;
+			break;
+		case '\\':
+			ctx->data++;
+			if (ctx->data >= ctx->end)
+				return -1;
+
+			if (*ctx->data == '\r' || *ctx->data == '\n' ||
+			    *ctx->data == '\0') {
+				/* quoted-pair doesn't allow CR/LF/NUL.
+				   They are part of the obs-qp though, so don't
+				   return them as error. Step back so that the
+				   next iteration handles the character. */
+				ctx->data--;
+				break;
+			}
+			if (ctx->last_comment != NULL) {
+				/* copy everything before the '\', skipping
+				   the '\' itself */
+				str_append_data(ctx->last_comment, start,
+						ctx->data - start - 1);
+			}
+			start = ctx->data;
+			break;
+		}
+	}
+
+	/* missing ')' */
+	return -1;
+}
+
+/* Skip over any run of linear whitespace (SP, TAB, CR, LF) and comments.
+
+   Returns 1 if ctx->data stopped at some other character, 0 if the end of
+   the input was reached, and -1 if a comment couldn't be skipped. */
+int rfc822_skip_lwsp(struct rfc822_parser_context *ctx)
+{
+	while (ctx->data < ctx->end) {
+		switch (*ctx->data) {
+		case ' ':
+		case '\t':
+		case '\r':
+		case '\n':
+			ctx->data++;
+			continue;
+		case '(':
+			if (rfc822_skip_comment(ctx) < 0)
+				return -1;
+			continue;
+		default:
+			/* something else than whitespace or a comment,
+			   and we're still inside the input */
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/* Parse a single atom and append it to str.
+
+   Returns -1 if there's no atext character at the current position,
+   0 if the atom extended to the end of the input, and otherwise the result
+   of skipping the whitespace/comments that follow the atom. */
+int rfc822_parse_atom(struct rfc822_parser_context *ctx, string_t *str)
+{
+	const unsigned char *atom_start;
+
+	/*
+	   atom            = [CFWS] 1*atext [CFWS]
+	   atext           =
+	     ; Any character except controls, SP, and specials.
+	*/
+	if (ctx->data >= ctx->end || !IS_ATEXT(*ctx->data))
+		return -1;
+
+	/* the first character is already known to be atext */
+	atom_start = ctx->data;
+	do {
+		ctx->data++;
+	} while (ctx->data < ctx->end && IS_ATEXT(*ctx->data));
+
+	str_append_data(str, atom_start, ctx->data - atom_start);
+	if (ctx->data >= ctx->end)
+		return 0;
+	return rfc822_skip_lwsp(ctx);
+}
+
+/* Parse a dot-atom and append it to str. Whitespace and comments around
+   the dots are skipped and aren't added to str.
+
+   Returns -1 if the input doesn't begin with atext, if a '.' isn't followed
+   by atext, or if the whitespace skipping before the '.' fails. Returns 0 if
+   the end of the input was reached, and 1 if parsing stopped at some other
+   character than '.'. */
+int rfc822_parse_dot_atom(struct rfc822_parser_context *ctx, string_t *str)
+{
+	const unsigned char *segment;
+	int ret;
+
+	/*
+	   dot-atom        = [CFWS] dot-atom-text [CFWS]
+	   dot-atom-text   = 1*atext *("." 1*atext)
+
+	   atext           =
+	     ; Any character except controls, SP, and specials.
+
+	   For RFC-822 compatibility allow LWSP around '.'
+	*/
+	if (ctx->data >= ctx->end || !IS_ATEXT(*ctx->data))
+		return -1;
+
+	/* the first character was verified above */
+	segment = ctx->data;
+	ctx->data++;
+	for (;;) {
+		/* consume the rest of the current atext run */
+		while (ctx->data < ctx->end && IS_ATEXT(*ctx->data))
+			ctx->data++;
+		if (ctx->data >= ctx->end)
+			break;
+
+		/* an empty run is possible only right after a '.' */
+		if (segment == ctx->data)
+			return -1;
+		str_append_data(str, segment, ctx->data - segment);
+
+		ret = rfc822_skip_lwsp(ctx);
+		if (ret <= 0)
+			return ret;
+
+		if (*ctx->data != '.')
+			return 1;
+
+		ctx->data++;
+		str_append_c(str, '.');
+
+		/* the '.' must be followed by more input */
+		if (rfc822_skip_lwsp(ctx) <= 0)
+			return -1;
+		segment = ctx->data;
+	}
+
+	/* the input ended in the middle of an atext run */
+	i_assert(segment != ctx->data);
+	str_append_data(str, segment, ctx->data - segment);
+	return 0;
+}
+
+/* Parse a MIME token and append it to str. The token consists of the
+   characters accepted by IS_ATEXT_NON_TSPECIAL() and '.'. The token may be
+   empty, in which case nothing is appended.
+
+   Returns 0 if the token extended to the end of the input, otherwise the
+   result of skipping the whitespace/comments that follow the token. */
+int rfc822_parse_mime_token(struct rfc822_parser_context *ctx, string_t *str)
+{
+	const unsigned char *const token_start = ctx->data;
+
+	while (ctx->data < ctx->end &&
+	       (IS_ATEXT_NON_TSPECIAL(*ctx->data) || *ctx->data == '.'))
+		ctx->data++;
+
+	str_append_data(str, token_start, ctx->data - token_start);
+	if (ctx->data >= ctx->end)
+		return 0;
+	return rfc822_skip_lwsp(ctx);
+}
+
+/* Parse a quoted-string and append its unquoted contents to str.
+   ctx->data must point to the opening '"' (asserted).
+
+   While copying, the (CR)LF of folding whitespace is dropped, the '\' of a
+   quoted-pair is dropped (the quoted character is kept), and NUL bytes are
+   replaced with ctx->nul_replacement_str if it is set.
+
+   Returns -1 if the closing '"' is missing or the input ends right after
+   a '\'. Otherwise returns the result of skipping the whitespace/comments
+   that follow the closing '"'. */
+int rfc822_parse_quoted_string(struct rfc822_parser_context *ctx, string_t *str)
+{
+	const unsigned char *pending;
+	size_t pending_len;
+
+	i_assert(ctx->data < ctx->end);
+	i_assert(*ctx->data == '"');
+	ctx->data++;
+
+	/* pending points to the first byte not yet copied to str */
+	for (pending = ctx->data; ctx->data < ctx->end; ctx->data++) {
+		const unsigned char c = *ctx->data;
+
+		if (c == '"') {
+			/* end of the string */
+			str_append_data(str, pending, ctx->data - pending);
+			ctx->data++;
+			return rfc822_skip_lwsp(ctx);
+		}
+		if (c == '\0') {
+			if (ctx->nul_replacement_str != NULL) {
+				str_append_data(str, pending,
+						ctx->data - pending);
+				str_append(str, ctx->nul_replacement_str);
+				pending = ctx->data + 1;
+			}
+			continue;
+		}
+		if (c == '\n') {
+			/* folding whitespace, remove the (CR)LF */
+			pending_len = ctx->data - pending;
+			if (pending_len > 0 && pending[pending_len-1] == '\r')
+				pending_len--;
+			str_append_data(str, pending, pending_len);
+			pending = ctx->data + 1;
+			continue;
+		}
+		if (c != '\\')
+			continue;
+
+		/* look at the character following the '\' */
+		ctx->data++;
+		if (ctx->data >= ctx->end)
+			return -1;
+
+		if (*ctx->data == '\r' || *ctx->data == '\n' ||
+		    *ctx->data == '\0') {
+			/* quoted-pair doesn't allow CR/LF/NUL.
+			   They are part of the obs-qp though, so don't
+			   return them as error. Step back so that the next
+			   iteration handles the character normally. */
+			ctx->data--;
+			continue;
+		}
+		/* copy the text before the '\' and drop the '\' itself */
+		str_append_data(str, pending, ctx->data - pending - 1);
+		pending = ctx->data;
+	}
+
+	/* missing '"' */
+	return -1;
+}
+
+/* Parse a run of atext and '.' characters and append it to str. The run
+   may be empty, in which case nothing is appended.
+
+   Returns 0 if the run extended to the end of the input, otherwise the
+   result of skipping the whitespace/comments that follow the run. */
+static int
+rfc822_parse_atom_or_dot(struct rfc822_parser_context *ctx, string_t *str)
+{
+	const unsigned char *const run_start = ctx->data;
+
+	/*
+	   atom            = [CFWS] 1*atext [CFWS]
+	   atext           =
+	     ; Any character except controls, SP, and specials.
+
+	   The difference between this function and rfc822_parse_dot_atom()
+	   is that this doesn't just silently skip over all the whitespace.
+	*/
+	while (ctx->data < ctx->end &&
+	       (IS_ATEXT(*ctx->data) || *ctx->data == '.'))
+		ctx->data++;
+
+	str_append_data(str, run_start, ctx->data - run_start);
+	if (ctx->data >= ctx->end)
+		return 0;
+	return rfc822_skip_lwsp(ctx);
+}
+
+/* Parse a phrase and append it to str. Consecutive words are separated
+   with a single space in str.
+
+   Returns 0 if there is no input left, -1 if the phrase begins with '.'.
+   If parsing a word returns <= 0, that value is returned as is. Otherwise
+   returns the result of the final whitespace/comment skipping. */
+int rfc822_parse_phrase(struct rfc822_parser_context *ctx, string_t *str)
+{
+	int ret;
+
+	/*
+	   phrase     = 1*word / obs-phrase
+	   word       = atom / quoted-string
+	   obs-phrase = word *(word / "." / CFWS)
+	*/
+
+	if (ctx->data >= ctx->end)
+		return 0;
+	if (*ctx->data == '.')
+		return -1;
+
+	for (;;) {
+		ret = *ctx->data == '"' ?
+			rfc822_parse_quoted_string(ctx, str) :
+			rfc822_parse_atom_or_dot(ctx, str);
+		if (ret <= 0)
+			return ret;
+
+		if (IS_ATEXT(*ctx->data) || *ctx->data == '"' ||
+		    *ctx->data == '.') {
+			/* another word follows */
+			str_append_c(str, ' ');
+			continue;
+		}
+		return rfc822_skip_lwsp(ctx);
+	}
+}
+
+/* Parse a domain-literal and append it to str, including the surrounding
+   '[' and ']'. ctx->data must point to the opening '[' (asserted).
+
+   The (CR)LF of folding whitespace is dropped, and NUL bytes are replaced
+   with ctx->nul_replacement_str if it is set. Unlike with quoted-strings,
+   the '\' characters are preserved in the output.
+
+   Returns -1 if a nested '[' is found, if the closing ']' is missing or if
+   the input ends right after a '\'. Otherwise returns the result of
+   skipping the whitespace/comments that follow the closing ']'. */
+static int
+rfc822_parse_domain_literal(struct rfc822_parser_context *ctx, string_t *str)
+{
+	const unsigned char *pending;
+	size_t pending_len;
+
+	/*
+	   domain-literal  = [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
+	   dcontent        = dtext / quoted-pair
+	   dtext           = NO-WS-CTL /     ; Non white space controls
+			     %d33-90 /       ; The rest of the US-ASCII
+			     %d94-126        ;  characters not including "[",
+					     ;  "]", or "\"
+	*/
+	i_assert(ctx->data < ctx->end);
+	i_assert(*ctx->data == '[');
+
+	/* pending points to the first byte not yet copied to str. It begins
+	   at the '[', so the bracket becomes part of the output. */
+	pending = ctx->data;
+	for (ctx->data++; ctx->data < ctx->end; ctx->data++) {
+		const unsigned char c = *ctx->data;
+
+		if (c == '[') {
+			/* not allowed */
+			return -1;
+		}
+		if (c == ']') {
+			/* +1 to include the ']' itself */
+			str_append_data(str, pending,
+					ctx->data - pending + 1);
+			ctx->data++;
+			return rfc822_skip_lwsp(ctx);
+		}
+		if (c == '\0') {
+			if (ctx->nul_replacement_str != NULL) {
+				str_append_data(str, pending,
+						ctx->data - pending);
+				str_append(str, ctx->nul_replacement_str);
+				pending = ctx->data + 1;
+			}
+			continue;
+		}
+		if (c == '\n') {
+			/* folding whitespace, remove the (CR)LF */
+			pending_len = ctx->data - pending;
+			if (pending_len > 0 && pending[pending_len-1] == '\r')
+				pending_len--;
+			str_append_data(str, pending, pending_len);
+			pending = ctx->data + 1;
+			continue;
+		}
+		if (c != '\\')
+			continue;
+
+		/* note: the '\' is preserved in the output */
+		ctx->data++;
+		if (ctx->data >= ctx->end)
+			return -1;
+
+		if (*ctx->data == '\r' || *ctx->data == '\n' ||
+		    *ctx->data == '\0') {
+			/* quoted-pair doesn't allow CR/LF/NUL.
+			   They are part of the obs-qp though, so don't
+			   return them as error. Flush everything up to and
+			   including the '\', then step back so that the
+			   next iteration handles the character normally. */
+			str_append_data(str, pending, ctx->data - pending);
+			pending = ctx->data;
+			ctx->data--;
+		}
+	}
+
+	/* missing ']' */
+	return -1;
+}
+
+/* Parse the domain part of an address and append it to str.
+   ctx->data must point to the '@' preceding the domain (asserted).
+
+   Returns -1 if nothing but whitespace/comments follows the '@' or if
+   skipping them fails. Otherwise returns the result of parsing the
+   domain-literal (if the domain begins with '[') or the dot-atom. */
+int rfc822_parse_domain(struct rfc822_parser_context *ctx, string_t *str)
+{
+	/*
+	   domain          = dot-atom / domain-literal / obs-domain
+	   domain-literal  = [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
+	   obs-domain      = atom *("." atom)
+	*/
+	i_assert(ctx->data < ctx->end);
+	i_assert(*ctx->data == '@');
+
+	/* step over the '@' */
+	ctx->data++;
+	if (rfc822_skip_lwsp(ctx) <= 0)
+		return -1;
+
+	return *ctx->data == '[' ?
+		rfc822_parse_domain_literal(ctx, str) :
+		rfc822_parse_dot_atom(ctx, str);
+}
+
+/* Parse a "type/subtype" Content-Type value and append it to str.
+   Whitespace and comments around the '/' are skipped and aren't added to
+   str.
+
+   Returns -1 on failure. If the failure is detected after the main type
+   was successfully parsed, str is truncated back to its original length;
+   a failure while parsing the main type itself leaves str as it is at that
+   point. On success returns the result of parsing the subtype token: 0 if
+   the end of the input was reached, 1 if more data (beginning with ';')
+   follows. */
+int rfc822_parse_content_type(struct rfc822_parser_context *ctx, string_t *str)
+{
+	const size_t initial_len = str->used;
+	size_t subtype_pos;
+	int ret;
+
+	if (rfc822_skip_lwsp(ctx) <= 0)
+		return -1;
+
+	/* get main type, require at least one byte */
+	if (rfc822_parse_mime_token(ctx, str) <= 0)
+		return -1;
+	if (str->used == initial_len)
+		return -1;
+
+	/* skip over "/" */
+	if (*ctx->data != '/')
+		goto invalid;
+	ctx->data++;
+	if (rfc822_skip_lwsp(ctx) <= 0)
+		goto invalid;
+	str_append_c(str, '/');
+
+	/* get subtype, require at least one byte,
+	   and check the next separator to avoid accepting
+	   invalid values. */
+	subtype_pos = str->used;
+	ret = rfc822_parse_mime_token(ctx, str);
+	if (ret < 0)
+		goto invalid;
+	if (str->used == subtype_pos)
+		goto invalid;
+	if (ctx->data != ctx->end && *ctx->data != ';')
+		goto invalid;
+	return ret;
+
+invalid:
+	/* don't leave a partial value in str */
+	str_truncate(str, initial_len);
+	return -1;
+}
+
+/* Parse the next ";key=value" parameter.
+
+   value is always truncated first and *key_r is set to NULL. Returns 0 if
+   there is no input left, and -1 if the input doesn't begin with ';', if
+   the key or the '=' after it can't be parsed, or if parsing the value
+   fails. Otherwise returns 1 with *key_r pointing to the key and the
+   (possibly empty) value stored in value.
+
+   The key is allocated with t_str_new(), and *key_r points into it. */
+int rfc822_parse_content_param(struct rfc822_parser_context *ctx,
+			       const char **key_r, string_t *value)
+{
+	string_t *key;
+	int ret;
+
+	/* .. := *(";" parameter)
+	   parameter := attribute "=" value
+	   attribute := token
+	   value := token / quoted-string
+	*/
+	*key_r = NULL;
+	str_truncate(value, 0);
+
+	if (ctx->data >= ctx->end)
+		return 0;
+	if (*ctx->data != ';')
+		return -1;
+	ctx->data++;
+
+	if (rfc822_skip_lwsp(ctx) <= 0)
+		return -1;
+
+	key = t_str_new(64);
+	if (rfc822_parse_mime_token(ctx, key) <= 0)
+		return -1;
+
+	if (*ctx->data != '=')
+		return -1;
+	ctx->data++;
+
+	ret = rfc822_skip_lwsp(ctx);
+	if (ret > 0) {
+		if (*ctx->data == '"') {
+			ret = rfc822_parse_quoted_string(ctx, value);
+		} else if (ctx->data >= ctx->end || *ctx->data != '=') {
+			ret = rfc822_parse_mime_token(ctx, value);
+		} else {
+			/* workaround for broken input:
+			   name==?utf-8?b?...?=
+			   Copy everything up to the next ';' or
+			   whitespace as the value. */
+			for (; ctx->data < ctx->end; ctx->data++) {
+				const unsigned char c = *ctx->data;
+
+				if (c == ';' || c == ' ' || c == '\t' ||
+				    c == '\r' || c == '\n')
+					break;
+				str_append_c(value, c);
+			}
+		}
+	}
+	/* ret <= 0 from the whitespace skipping: broken / no value */
+
+	*key_r = str_c(key);
+	if (ret < 0)
+		return -1;
+	return 1;
+}