diff options
Diffstat (limited to 'src/lib-mail/rfc822-parser.c')
-rw-r--r-- | src/lib-mail/rfc822-parser.c | 522 |
1 files changed, 522 insertions, 0 deletions
diff --git a/src/lib-mail/rfc822-parser.c b/src/lib-mail/rfc822-parser.c new file mode 100644 index 0000000..c8595b4 --- /dev/null +++ b/src/lib-mail/rfc822-parser.c @@ -0,0 +1,522 @@ +/* Copyright (c) 2005-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "str.h" +#include "strescape.h" +#include "rfc822-parser.h" + +/* + atext = ALPHA / DIGIT / ; Any character except controls, + "!" / "#" / ; SP, and specials. + "$" / "%" / ; Used for atoms + "&" / "'" / + "*" / "+" / + "-" / "/" / + "=" / "?" / + "^" / "_" / + "`" / "{" / + "|" / "}" / + "~" + + MIME: + + token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, + or tspecials> + tspecials := "(" / ")" / "<" / ">" / "@" / + "," / ";" / ":" / "\" / <"> + "/" / "[" / "]" / "?" / "=" + + So token is same as dot-atom, except stops also at '/', '?' and '='. +*/ + +/* atext chars are marked with 1, alpha and digits with 2, + atext-but-mime-tspecials with 4 */ +unsigned char rfc822_atext_chars[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0-15 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */ + 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 4, /* 32-47 */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 4, 0, 4, /* 48-63 */ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, /* 80-95 */ + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 96-111 */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, /* 112-127 */ + + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +}; + +void rfc822_parser_init(struct rfc822_parser_context *ctx, + const unsigned char *data, size_t size, + string_t *last_comment) +{ + i_zero(ctx); + ctx->data = data; + ctx->end = data + size; + ctx->last_comment = last_comment; +} + +int rfc822_skip_comment(struct rfc822_parser_context *ctx) +{ + const unsigned char *start; + size_t len; + int level = 1; + + i_assert(*ctx->data == '('); + + if (ctx->last_comment != NULL) + str_truncate(ctx->last_comment, 0); + + start = ++ctx->data; + for (; ctx->data < ctx->end; ctx->data++) { + switch (*ctx->data) { + case '\0': + if (ctx->last_comment != NULL && + ctx->nul_replacement_str != NULL) { + str_append_data(ctx->last_comment, start, + ctx->data - start); + str_append(ctx->last_comment, + ctx->nul_replacement_str); + start = ctx->data + 1; + } + break; + case '(': + level++; + break; + case ')': + if (--level == 0) { + if (ctx->last_comment != NULL) { + str_append_data(ctx->last_comment, start, + ctx->data - start); + } + ctx->data++; + return ctx->data < ctx->end ? 1 : 0; + } + break; + case '\n': + /* folding whitespace, remove the (CR)LF */ + if (ctx->last_comment == NULL) + break; + len = ctx->data - start; + if (len > 0 && start[len-1] == '\r') + len--; + str_append_data(ctx->last_comment, start, len); + start = ctx->data + 1; + break; + case '\\': + ctx->data++; + if (ctx->data >= ctx->end) + return -1; + + if (*ctx->data == '\r' || *ctx->data == '\n' || + *ctx->data == '\0') { + /* quoted-pair doesn't allow CR/LF/NUL. + They are part of the obs-qp though, so don't + return them as error. */ + ctx->data--; + break; + } + if (ctx->last_comment != NULL) { + str_append_data(ctx->last_comment, start, + ctx->data - start - 1); + } + start = ctx->data; + break; + } + } + + /* missing ')' */ + return -1; +} + +int rfc822_skip_lwsp(struct rfc822_parser_context *ctx) +{ + for (; ctx->data < ctx->end;) { + if (*ctx->data == ' ' || *ctx->data == '\t' || + *ctx->data == '\r' || *ctx->data == '\n') { + ctx->data++; + continue; + } + + if (*ctx->data != '(') + break; + + if (rfc822_skip_comment(ctx) < 0) + return -1; + } + return ctx->data < ctx->end ? 1 : 0; +} + +int rfc822_parse_atom(struct rfc822_parser_context *ctx, string_t *str) +{ + const unsigned char *start; + + /* + atom = [CFWS] 1*atext [CFWS] + atext = + ; Any character except controls, SP, and specials. + */ + if (ctx->data >= ctx->end || !IS_ATEXT(*ctx->data)) + return -1; + + for (start = ctx->data++; ctx->data < ctx->end; ctx->data++) { + if (IS_ATEXT(*ctx->data)) + continue; + + str_append_data(str, start, ctx->data - start); + return rfc822_skip_lwsp(ctx); + } + + str_append_data(str, start, ctx->data - start); + return 0; +} + +int rfc822_parse_dot_atom(struct rfc822_parser_context *ctx, string_t *str) +{ + const unsigned char *start; + int ret; + + /* + dot-atom = [CFWS] dot-atom-text [CFWS] + dot-atom-text = 1*atext *("." 1*atext) + + atext = + ; Any character except controls, SP, and specials. + + For RFC-822 compatibility allow LWSP around '.' + */ + if (ctx->data >= ctx->end || !IS_ATEXT(*ctx->data)) + return -1; + + for (start = ctx->data++; ctx->data < ctx->end; ) { + if (IS_ATEXT(*ctx->data)) { + ctx->data++; + continue; + } + + if (start == ctx->data) + return -1; + str_append_data(str, start, ctx->data - start); + + if ((ret = rfc822_skip_lwsp(ctx)) <= 0) + return ret; + + if (*ctx->data != '.') + return 1; + + ctx->data++; + str_append_c(str, '.'); + + if (rfc822_skip_lwsp(ctx) <= 0) + return -1; + start = ctx->data; + } + + i_assert(start != ctx->data); + str_append_data(str, start, ctx->data - start); + return 0; +} + +int rfc822_parse_mime_token(struct rfc822_parser_context *ctx, string_t *str) +{ + const unsigned char *start; + + for (start = ctx->data; ctx->data < ctx->end; ctx->data++) { + if (IS_ATEXT_NON_TSPECIAL(*ctx->data) || *ctx->data == '.') + continue; + + str_append_data(str, start, ctx->data - start); + return rfc822_skip_lwsp(ctx); + } + + str_append_data(str, start, ctx->data - start); + return 0; +} + +int rfc822_parse_quoted_string(struct rfc822_parser_context *ctx, string_t *str) +{ + const unsigned char *start; + size_t len; + + i_assert(ctx->data < ctx->end); + i_assert(*ctx->data == '"'); + ctx->data++; + + for (start = ctx->data; ctx->data < ctx->end; ctx->data++) { + switch (*ctx->data) { + case '\0': + if (ctx->nul_replacement_str != NULL) { + str_append_data(str, start, ctx->data - start); + str_append(str, ctx->nul_replacement_str); + start = ctx->data + 1; + } + break; + case '"': + str_append_data(str, start, ctx->data - start); + ctx->data++; + return rfc822_skip_lwsp(ctx); + case '\n': + /* folding whitespace, remove the (CR)LF */ + len = ctx->data - start; + if (len > 0 && start[len-1] == '\r') + len--; + str_append_data(str, start, len); + start = ctx->data + 1; + break; + case '\\': + ctx->data++; + if (ctx->data >= ctx->end) + return -1; + + if (*ctx->data == '\r' || *ctx->data == '\n' || + *ctx->data == '\0') { + /* quoted-pair doesn't allow CR/LF/NUL. + They are part of the obs-qp though, so don't + return them as error. */ + ctx->data--; + break; + } + str_append_data(str, start, ctx->data - start - 1); + start = ctx->data; + break; + } + } + + /* missing '"' */ + return -1; +} + +static int +rfc822_parse_atom_or_dot(struct rfc822_parser_context *ctx, string_t *str) +{ + const unsigned char *start; + + /* + atom = [CFWS] 1*atext [CFWS] + atext = + ; Any character except controls, SP, and specials. + + The difference between this function and rfc822_parse_dot_atom() + is that this doesn't just silently skip over all the whitespace. + */ + for (start = ctx->data; ctx->data < ctx->end; ctx->data++) { + if (IS_ATEXT(*ctx->data) || *ctx->data == '.') + continue; + + str_append_data(str, start, ctx->data - start); + return rfc822_skip_lwsp(ctx); + } + + str_append_data(str, start, ctx->data - start); + return 0; +} + +int rfc822_parse_phrase(struct rfc822_parser_context *ctx, string_t *str) +{ + int ret; + + /* + phrase = 1*word / obs-phrase + word = atom / quoted-string + obs-phrase = word *(word / "." / CFWS) + */ + + if (ctx->data >= ctx->end) + return 0; + if (*ctx->data == '.') + return -1; + + for (;;) { + if (*ctx->data == '"') + ret = rfc822_parse_quoted_string(ctx, str); + else + ret = rfc822_parse_atom_or_dot(ctx, str); + + if (ret <= 0) + return ret; + + if (!IS_ATEXT(*ctx->data) && *ctx->data != '"' + && *ctx->data != '.') + break; + str_append_c(str, ' '); + } + return rfc822_skip_lwsp(ctx); +} + +static int +rfc822_parse_domain_literal(struct rfc822_parser_context *ctx, string_t *str) +{ + const unsigned char *start; + size_t len; + + /* + domain-literal = [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS] + dcontent = dtext / quoted-pair + dtext = NO-WS-CTL / ; Non white space controls + %d33-90 / ; The rest of the US-ASCII + %d94-126 ; characters not including "[", + ; "]", or "\" + */ + i_assert(ctx->data < ctx->end); + i_assert(*ctx->data == '['); + + for (start = ctx->data++; ctx->data < ctx->end; ctx->data++) { + switch (*ctx->data) { + case '\0': + if (ctx->nul_replacement_str != NULL) { + str_append_data(str, start, ctx->data - start); + str_append(str, ctx->nul_replacement_str); + start = ctx->data + 1; + } + break; + case '[': + /* not allowed */ + return -1; + case ']': + str_append_data(str, start, ctx->data - start + 1); + ctx->data++; + return rfc822_skip_lwsp(ctx); + case '\n': + /* folding whitespace, remove the (CR)LF */ + len = ctx->data - start; + if (len > 0 && start[len-1] == '\r') + len--; + str_append_data(str, start, len); + start = ctx->data + 1; + break; + case '\\': + /* note: the '\' is preserved in the output */ + ctx->data++; + if (ctx->data >= ctx->end) + return -1; + + if (*ctx->data == '\r' || *ctx->data == '\n' || + *ctx->data == '\0') { + /* quoted-pair doesn't allow CR/LF/NUL. + They are part of the obs-qp though, so don't + return them as error. */ + str_append_data(str, start, ctx->data - start); + start = ctx->data; + ctx->data--; + break; + } + } + } + + /* missing ']' */ + return -1; +} + +int rfc822_parse_domain(struct rfc822_parser_context *ctx, string_t *str) +{ + /* + domain = dot-atom / domain-literal / obs-domain + domain-literal = [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS] + obs-domain = atom *("." atom) + */ + i_assert(ctx->data < ctx->end); + i_assert(*ctx->data == '@'); + ctx->data++; + + if (rfc822_skip_lwsp(ctx) <= 0) + return -1; + + if (*ctx->data == '[') + return rfc822_parse_domain_literal(ctx, str); + else + return rfc822_parse_dot_atom(ctx, str); +} + +int rfc822_parse_content_type(struct rfc822_parser_context *ctx, string_t *str) +{ + size_t str_pos_0 = str->used; + if (rfc822_skip_lwsp(ctx) <= 0) + return -1; + + /* get main type, require at least one byte */ + if (rfc822_parse_mime_token(ctx, str) <= 0 || + str->used == str_pos_0) + return -1; + + /* skip over "/" */ + if (*ctx->data != '/') { + str_truncate(str, str_pos_0); + return -1; + } + ctx->data++; + if (rfc822_skip_lwsp(ctx) <= 0) { + str_truncate(str, str_pos_0); + return -1; + } + str_append_c(str, '/'); + + size_t str_pos = str->used; + /* get subtype, require at least one byte, + and check the next separator to avoid accepting + invalid values. */ + int ret; + if ((ret = rfc822_parse_mime_token(ctx, str)) < 0 || + str->used == str_pos || + (ctx->data != ctx->end && *ctx->data != ';')) { + str_truncate(str, str_pos_0); + return -1; + } + return ret; +} + +int rfc822_parse_content_param(struct rfc822_parser_context *ctx, + const char **key_r, string_t *value) +{ + string_t *key; + int ret; + + /* .. := *(";" parameter) + parameter := attribute "=" value + attribute := token + value := token / quoted-string + */ + *key_r = NULL; + str_truncate(value, 0); + + if (ctx->data >= ctx->end) + return 0; + if (*ctx->data != ';') + return -1; + ctx->data++; + + if (rfc822_skip_lwsp(ctx) <= 0) + return -1; + + key = t_str_new(64); + if (rfc822_parse_mime_token(ctx, key) <= 0) + return -1; + + if (*ctx->data != '=') + return -1; + ctx->data++; + + if ((ret = rfc822_skip_lwsp(ctx)) <= 0) { + /* broken / no value */ + } else if (*ctx->data == '"') { + ret = rfc822_parse_quoted_string(ctx, value); + } else if (ctx->data < ctx->end && *ctx->data == '=') { + /* workaround for broken input: + name==?utf-8?b?...?= */ + while (ctx->data < ctx->end && *ctx->data != ';' && + *ctx->data != ' ' && *ctx->data != '\t' && + *ctx->data != '\r' && *ctx->data != '\n') { + str_append_c(value, *ctx->data); + ctx->data++; + } + } else { + ret = rfc822_parse_mime_token(ctx, value); + } + + *key_r = str_c(key); + return ret < 0 ? -1 : 1; +} |