diff options
Diffstat (limited to 'src/lib-mail/message-header-parser.c')
-rw-r--r-- | src/lib-mail/message-header-parser.c | 474 |
1 files changed, 474 insertions, 0 deletions
diff --git a/src/lib-mail/message-header-parser.c b/src/lib-mail/message-header-parser.c new file mode 100644 index 0000000..c5026f1 --- /dev/null +++ b/src/lib-mail/message-header-parser.c @@ -0,0 +1,474 @@ +/* Copyright (c) 2002-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "buffer.h" +#include "istream.h" +#include "str.h" +#include "strfuncs.h" +#include "unichar.h" +#include "message-size.h" +#include "message-header-parser.h" + +/* RFC 5322 2.1.1 and 2.2 */ +#define MESSAGE_HEADER_NAME_MAX_LEN 1000 + +struct message_header_parser_ctx { + struct message_header_line line; + + struct istream *input; + struct message_size *hdr_size; + + string_t *name; + buffer_t *value_buf; + + enum message_header_parser_flags flags; + bool skip_line:1; + bool has_nuls:1; +}; + +struct message_header_parser_ctx * +message_parse_header_init(struct istream *input, struct message_size *hdr_size, + enum message_header_parser_flags flags) +{ + struct message_header_parser_ctx *ctx; + + ctx = i_new(struct message_header_parser_ctx, 1); + ctx->input = input; + ctx->hdr_size = hdr_size; + ctx->name = str_new(default_pool, 128); + ctx->flags = flags; + ctx->value_buf = buffer_create_dynamic(default_pool, 4096); + i_stream_ref(input); + + if (hdr_size != NULL) + i_zero(hdr_size); + return ctx; +} + +void message_parse_header_deinit(struct message_header_parser_ctx **_ctx) +{ + struct message_header_parser_ctx *ctx = *_ctx; + + i_stream_unref(&ctx->input); + buffer_free(&ctx->value_buf); + str_free(&ctx->name); + i_free(ctx); + + *_ctx = NULL; +} + +int message_parse_header_next(struct message_header_parser_ctx *ctx, + struct message_header_line **hdr_r) +{ + struct message_header_line *line = &ctx->line; + const unsigned char *msg; + size_t i, size, startpos, colon_pos, parse_size, skip = 0; + int ret; + bool continued, continues, last_no_newline, last_crlf; + bool no_newline, crlf_newline; + + *hdr_r = NULL; + if (line->eoh) + return -1; + + if (line->continues) + colon_pos = 0; + else { + /* new header line */ + line->name_offset = ctx->input->v_offset; + colon_pos = UINT_MAX; + buffer_set_used_size(ctx->value_buf, 0); + } + + no_newline = FALSE; + crlf_newline = FALSE; + continued = line->continues; + continues = FALSE; + + for (startpos = 0;;) { + ret = i_stream_read_bytes(ctx->input, &msg, &size, startpos+2); + if (ret >= 0) { + /* we want to know one byte in advance to find out + if it's multiline header */ + parse_size = size == 0 ? 0 : size-1; + } else { + parse_size = size; + } + + if (ret <= 0 && startpos == parse_size) { + if (ret == -1) { + if (startpos > 0) { + /* header ended unexpectedly. */ + no_newline = TRUE; + skip = startpos; + break; + } + /* error / EOF with no bytes */ + i_assert(skip == 0); + return -1; + } + + if (size > 0 && !ctx->skip_line && !continued && + (msg[0] == '\n' || + (msg[0] == '\r' && size > 1 && msg[1] == '\n'))) { + /* end of headers - this mostly happens just + with mbox where headers are read separately + from body */ + size = 0; + if (ctx->hdr_size != NULL) + ctx->hdr_size->lines++; + if (msg[0] == '\r') { + skip = 2; + crlf_newline = TRUE; + } else { + skip = 1; + if (ctx->hdr_size != NULL) + ctx->hdr_size->virtual_size++; + } + break; + } + if (ret == 0 && !ctx->input->eof) { + /* stream is nonblocking - need more data */ + i_assert(skip == 0); + return 0; + } + i_assert(size > 0); + + /* a) line is larger than input buffer + b) header ended unexpectedly */ + if (ret == -2) { + /* go back to last LWSP if found. */ + size_t min_pos = !continued ? colon_pos : 0; + for (i = size-1; i > min_pos; i--) { + if (IS_LWSP(msg[i])) { + size = i; + break; + } + } + if (i == min_pos && (msg[size-1] == '\r' || + msg[size-1] == '\n')) { + /* we may or may not have a full header, + but we don't know until we get the + next character. leave out the + linefeed and finish the header on + the next run. */ + size--; + if (size > 0 && msg[size-1] == '\r') + size--; + } + /* the buffer really has to be more than 2 to + avoid CRLF looping forever */ + i_assert(size > 0); + + continues = TRUE; + } + no_newline = TRUE; + skip = size; + break; + } + + /* find ':' */ + if (colon_pos == UINT_MAX) { + for (i = startpos; i < parse_size; i++) { + if (msg[i] > ':') + continue; + + if (msg[i] == ':' && !ctx->skip_line) { + colon_pos = i; + line->full_value_offset = + ctx->input->v_offset + i + 1; + break; + } + if (msg[i] == '\n') { + /* end of headers, or error */ + break; + } + + if (msg[i] == '\0') + ctx->has_nuls = TRUE; + } + } else { + i = startpos; + } + + /* find '\n' */ + for (; i < parse_size; i++) { + if (msg[i] <= '\n') { + if (msg[i] == '\n') + break; + if (msg[i] == '\0') + ctx->has_nuls = TRUE; + } + } + + if (i < parse_size && i+1 == size && ret == -2) { + /* we don't know if the line continues. */ + i++; + } else if (i < parse_size) { + /* got a line */ + if (ctx->skip_line) { + /* skipping a line with a huge header name */ + if (ctx->hdr_size != NULL) { + ctx->hdr_size->lines++; + ctx->hdr_size->physical_size += i + 1; + ctx->hdr_size->virtual_size += i + 1; + } + if (i == 0 || msg[i-1] != '\r') { + /* missing CR */ + if (ctx->hdr_size != NULL) + ctx->hdr_size->virtual_size++; + } + + i_stream_skip(ctx->input, i + 1); + startpos = 0; + ctx->skip_line = FALSE; + continue; + } + continues = i+1 < size && IS_LWSP(msg[i+1]); + + if (ctx->hdr_size != NULL) + ctx->hdr_size->lines++; + if (i == 0 || msg[i-1] != '\r') { + /* missing CR */ + if (ctx->hdr_size != NULL) + ctx->hdr_size->virtual_size++; + size = i; + } else { + size = i-1; + crlf_newline = TRUE; + } + + skip = i+1; + break; + } + + startpos = i; + } + + last_crlf = line->crlf_newline && + (ctx->flags & MESSAGE_HEADER_PARSER_FLAG_DROP_CR) == 0; + last_no_newline = line->no_newline || + (ctx->flags & MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE) != 0; + + line->continues = continues; + line->continued = continued; + line->crlf_newline = crlf_newline; + line->no_newline = no_newline; + if (size == 0 && !continued) { + /* end of headers */ + line->eoh = TRUE; + line->name_len = line->value_len = line->full_value_len = 0; + line->name = ""; line->value = line->full_value = NULL; + line->middle = NULL; line->middle_len = 0; + line->full_value_offset = line->name_offset; + line->continues = FALSE; + } else if (line->continued) { + line->value = msg; + line->value_len = size; + } else if (colon_pos == UINT_MAX) { + /* missing ':', assume the whole line is value */ + line->value = msg; + line->value_len = size; + line->full_value_offset = line->name_offset; + + line->name = ""; + line->name_len = 0; + + line->middle = uchar_empty_ptr; + line->middle_len = 0; + } else { + size_t pos; + + line->value = msg + colon_pos+1; + line->value_len = size - colon_pos - 1; + if ((ctx->flags & MESSAGE_HEADER_PARSER_FLAG_SKIP_INITIAL_LWSP) != 0) { + /* get value. skip all LWSP after ':'. Note that + RFC2822 doesn't say we should, but history behind + it.. + + Exception to this is if the value consists only of + LWSP, then skip only the one LWSP after ':'. */ + for (pos = 0; pos < line->value_len; pos++) { + if (!IS_LWSP(line->value[pos])) + break; + } + + if (pos == line->value_len) { + /* everything was LWSP */ + if (line->value_len > 0 && + IS_LWSP(line->value[0])) + pos = 1; + } + } else { + pos = line->value_len > 0 && + IS_LWSP(line->value[0]) ? 1 : 0; + } + + line->value += pos; + line->value_len -= pos; + line->full_value_offset += pos; + + /* get name, skip LWSP before ':' */ + while (colon_pos > 0 && IS_LWSP(msg[colon_pos-1])) + colon_pos--; + + /* Treat overlong header names as if the full header line was + a value. Callers can usually handle large values better than + large names. */ + if (colon_pos > MESSAGE_HEADER_NAME_MAX_LEN) { + line->name = ""; + line->name_len = 0; + line->middle = uchar_empty_ptr; + line->middle_len = 0; + line->value = msg; + line->value_len = size; + line->full_value_offset = line->name_offset; + } else { + str_truncate(ctx->name, 0); + /* use buffer_append() so the name won't be truncated if there + are NULs. */ + buffer_append(ctx->name, msg, colon_pos); + str_append_c(ctx->name, '\0'); + + /* keep middle stored also in ctx->name so it's available + with use_full_value */ + line->middle = msg + colon_pos; + line->middle_len = (size_t)(line->value - line->middle); + str_append_data(ctx->name, line->middle, line->middle_len); + + line->name = str_c(ctx->name); + line->name_len = colon_pos; + line->middle = str_data(ctx->name) + line->name_len + 1; + } + } + + if (!line->continued) { + /* first header line. make a copy of the line since we can't + really trust input stream not to lose it. */ + buffer_append(ctx->value_buf, line->value, line->value_len); + line->value = line->full_value = ctx->value_buf->data; + line->full_value_len = line->value_len; + } else if (line->use_full_value) { + /* continue saving the full value. */ + if (last_no_newline) { + /* line is longer than fit into our buffer, so we + were forced to break it into multiple + message_header_lines */ + } else { + if (last_crlf) + buffer_append_c(ctx->value_buf, '\r'); + buffer_append_c(ctx->value_buf, '\n'); + } + if ((ctx->flags & MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE) != 0 && + line->value_len > 0 && line->value[0] != ' ' && + IS_LWSP(line->value[0])) { + buffer_append_c(ctx->value_buf, ' '); + buffer_append(ctx->value_buf, + line->value + 1, line->value_len - 1); + } else { + buffer_append(ctx->value_buf, + line->value, line->value_len); + } + line->full_value = ctx->value_buf->data; + line->full_value_len = ctx->value_buf->used; + } else { + /* we didn't want full_value, and this is a continued line. */ + line->full_value = NULL; + line->full_value_len = 0; + } + + /* always reset it */ + line->use_full_value = FALSE; + + if (ctx->hdr_size != NULL) { + ctx->hdr_size->physical_size += skip; + ctx->hdr_size->virtual_size += skip; + } + i_stream_skip(ctx->input, skip); + + *hdr_r = line; + return 1; +} + +bool message_parse_header_has_nuls(const struct message_header_parser_ctx *ctx) +{ + return ctx->has_nuls; +} + +#undef message_parse_header +void message_parse_header(struct istream *input, struct message_size *hdr_size, + enum message_header_parser_flags flags, + message_header_callback_t *callback, void *context) +{ + struct message_header_parser_ctx *hdr_ctx; + struct message_header_line *hdr; + int ret; + + hdr_ctx = message_parse_header_init(input, hdr_size, flags); + while ((ret = message_parse_header_next(hdr_ctx, &hdr)) > 0) + callback(hdr, context); + i_assert(ret != 0); + message_parse_header_deinit(&hdr_ctx); + + /* call after the final skipping */ + callback(NULL, context); +} + +void message_header_line_write(buffer_t *output, + const struct message_header_line *hdr) +{ + if (!hdr->continued) { + buffer_append(output, hdr->name, strlen(hdr->name)); + buffer_append(output, hdr->middle, hdr->middle_len); + } + buffer_append(output, hdr->value, hdr->value_len); + if (!hdr->no_newline) { + if (hdr->crlf_newline) + buffer_append_c(output, '\r'); + buffer_append_c(output, '\n'); + } +} + +const char * +message_header_strdup(pool_t pool, const unsigned char *data, size_t size) +{ + if (memchr(data, '\0', size) == NULL) { + /* fast path */ + char *dest = p_malloc(pool, size+1); + memcpy(dest, data, size); + return dest; + } + + /* slow path - this could be made faster, but it should be + rare so keep it simple */ + string_t *str = str_new(pool, size+2); + for (size_t i = 0; i < size; i++) { + if (data[i] != '\0') + str_append_c(str, data[i]); + else + str_append(str, UNICODE_REPLACEMENT_CHAR_UTF8); + } + return str_c(str); +} + +bool message_header_name_is_valid(const char *name) +{ + /* + field-name = 1*ftext + + ftext = %d33-57 / ; Printable US-ASCII + %d59-126 ; characters not including + ; ":". + */ + for (unsigned int i = 0; name[i] != '\0'; i++) { + unsigned char c = name[i]; + if (c >= 33 && c <= 57) { + /* before ":" */ + } else if (c >= 59 && c <= 126) { + /* after ":" */ + } else { + return FALSE; + } + } + return TRUE; +} |