/* Copyright (c) 2013-2018 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "net.h"
#include "str.h"
#include "strescape.h"

#include "smtp-parser.h"

#include <ctype.h>

/* Character definitions from RFC 5321/5322:

   textstring  = 1*(%d09 / %d32-126) ; HT, SP, Printable US-ASCII
               = 1*(%x09 / %x20-7e)
   ehlo-param  = 1*(%d33-126)
               = 1*(%x21-7e)
   ehlo-greet  = 1*(%d0-9 / %d11-12 / %d14-127)
               = 1*(%x00-09 / %x0b-0c / %x0e-7f)
   qtext       = %d32-33 / %d35-91 / %d93-126
               = %x20-21 / %x23-5B / %x5d-7e
   quoted-pair = %d92 %d32-126
               = %x5c %x20-7e
   atext       = ALPHA / DIGIT /    ; Printable US-ASCII
                 "!" / "#" /        ;  characters not including
                 "$" / "%" /        ;  specials.  Used for atoms.
                 "&" / "'" /
                 "*" / "+" /
                 "-" / "/" /
                 "=" / "?" /
                 "^" / "_" /
                 "`" / "{" /
                 "|" / "}" /
                 "~"
               = %x21 / %x23-27 / %x2a-2b / %x2d / %x2f-39 / %x3d /
                 %x3f / %x41-5a / %x5e-7e
   esmtp-value = 1*(%d33-60 / %d62-126)
               = 1*(%x21-3c / %x3e-7e)
   dcontent    = %d33-90 / ; Printable US-ASCII
                 %d94-126  ; excl. "[", "\", "]"
               = %x21-5a / %x5e-7e
   xchar       = any ASCII CHAR between "!" (33) and "~" (126) inclusive,
                 except for "+" and "=". [RFC 3461]
               = %x21-2a / %x2c-3c / %x3e-7e

   Bit mappings (FIXME: rearrange):

   (1<<0) => %x21-2a / %x2c-3c / %x3e-7e  (xchar)
   (1<<1) => %x21 / %x23-27 / %x2a-2b / %x2d / %x2f-39 / %x3d /
               %x3f / %x41-5a / %x5e-7e  (atext)
   (1<<2) => %x28-29 / %x2c / %x2e / %x3a-3c / %x3e / %x40
   (1<<8) => %x00-09 / %x0b-0c / %x0e-20 / %x7f
   (1<<5) => %x09 / %x5b-5d
   (1<<4) => %x5b / %x5d
   (1<<3) => %x20
   (1<<9) => %x22
   (1<<6) => %x2b
   (1<<7) => %x3d
 */

/* Character class bits as stored in smtp_char_lookup[]. Every SMTP grammar
   character set defined below is the union of one or more of these classes.
 */
enum {
	/* xchar: %x21-2a / %x2c-3c / %x3e-7e */
	SMTP_CHAR_CLASS_XCHAR    = (1<<0),
	/* atext */
	SMTP_CHAR_CLASS_ATEXT    = (1<<1),
	/* %x28-29 / %x2c / %x2e / %x3a-3c / %x3e / %x40 */
	SMTP_CHAR_CLASS_SPECIAL  = (1<<2),
	/* %x20 */
	SMTP_CHAR_CLASS_SPACE    = (1<<3),
	/* %x5b / %x5d */
	SMTP_CHAR_CLASS_BRACKET  = (1<<4),
	/* %x09 / %x5b-5d */
	SMTP_CHAR_CLASS_TEXTMISC = (1<<5),
	/* %x2b */
	SMTP_CHAR_CLASS_PLUS     = (1<<6),
	/* %x3d */
	SMTP_CHAR_CLASS_EQUALS   = (1<<7),
	/* %x00-09 / %x0b-0c / %x0e-20 / %x7f */
	SMTP_CHAR_CLASS_CONTROL  = (1<<8),
	/* %x22 */
	SMTP_CHAR_CLASS_DQUOTE   = (1<<9)
};

/* xtext (the xchar part; "+" introduces a hexchar and is handled by the
   xtext parser itself) */
const uint16_t smtp_xtext_char_mask =
	SMTP_CHAR_CLASS_XCHAR;
/* atext */
const uint16_t smtp_atext_char_mask =
	SMTP_CHAR_CLASS_ATEXT;
/* dcontent = %x21-5a / %x5e-7e */
const uint16_t smtp_dcontent_char_mask =
	SMTP_CHAR_CLASS_ATEXT | SMTP_CHAR_CLASS_SPECIAL |
	SMTP_CHAR_CLASS_DQUOTE;
/* qtext = %x20-21 / %x23-5b / %x5d-7e */
const uint16_t smtp_qtext_char_mask =
	SMTP_CHAR_CLASS_ATEXT | SMTP_CHAR_CLASS_SPECIAL |
	SMTP_CHAR_CLASS_SPACE | SMTP_CHAR_CLASS_BRACKET;
/* textstring = %x09 / %x20-7e */
const uint16_t smtp_textstr_char_mask =
	SMTP_CHAR_CLASS_ATEXT | SMTP_CHAR_CLASS_SPECIAL |
	SMTP_CHAR_CLASS_DQUOTE | SMTP_CHAR_CLASS_SPACE |
	SMTP_CHAR_CLASS_TEXTMISC;
/* esmtp-value = %x21-3c / %x3e-7e */
const uint16_t smtp_esmtp_value_char_mask =
	SMTP_CHAR_CLASS_XCHAR | SMTP_CHAR_CLASS_PLUS;
/* ehlo-param = %x21-7e */
const uint16_t smtp_ehlo_param_char_mask =
	SMTP_CHAR_CLASS_XCHAR | SMTP_CHAR_CLASS_PLUS |
	SMTP_CHAR_CLASS_EQUALS;
/* ehlo-greet = %x00-09 / %x0b-0c / %x0e-7f */
const uint16_t smtp_ehlo_greet_char_mask =
	SMTP_CHAR_CLASS_XCHAR | SMTP_CHAR_CLASS_PLUS |
	SMTP_CHAR_CLASS_EQUALS | SMTP_CHAR_CLASS_CONTROL;
/* quoted-pair (the character after the backslash) = %x20-7e */
const uint16_t smtp_qpair_char_mask =
	SMTP_CHAR_CLASS_XCHAR | SMTP_CHAR_CLASS_SPACE |
	SMTP_CHAR_CLASS_PLUS | SMTP_CHAR_CLASS_EQUALS;

/* Character classification table, indexed by octet value. Each entry is the
   set of class bits (see "Bit mappings" at the top of this file) that the
   octet belongs to; the smtp_*_char_mask constants are tested against it.

   LF (0x0a), CR (0x0d) and all octets in the range 0x80-0xff have no bits
   set, so they match none of the character sets. The trailing comment on
   each row is the index of the row's first entry.
 */
const uint16_t smtp_char_lookup[256] = {
	0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, // 00
	0x100, 0x120, 0x000, 0x100, 0x100, 0x000, 0x100, 0x100, // 08
	0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, // 10
	0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, // 18
	0x108, 0x003, 0x201, 0x003, 0x003, 0x003, 0x003, 0x003, // 20
	0x005, 0x005, 0x003, 0x042, 0x005, 0x003, 0x005, 0x003, // 28
	0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 30
	0x003, 0x003, 0x005, 0x005, 0x005, 0x082, 0x005, 0x003, // 38
	0x005, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 40
	0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 48
	0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 50
	0x003, 0x003, 0x003, 0x031, 0x021, 0x031, 0x003, 0x003, // 58
	0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 60
	0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 68
	0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, // 70
	0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x003, 0x100, // 78

	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 80
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 88
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 90
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 98
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // a0
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // a8
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // b0
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // b8
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // c0
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // c8
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // d0
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // d8
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // e0
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // e8
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // f0
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // f8
};

/*
 * Parser
 */

/* Prepares `parser' for parsing the NUL-terminated string `data'. The
   parse position is set to the start of the string, the end marker to its
   terminating NUL, and any previous error is cleared. `pool' is stored in
   the parser as-is. */
void smtp_parser_init(struct smtp_parser *parser,
	pool_t pool, const char *data)
{
	const unsigned char *start = (const unsigned char *)data;

	parser->pool = pool;
	parser->error = NULL;
	parser->begin = start;
	parser->cur = start;
	parser->end = start + strlen(data);
}

/*
 * Common syntax
 */

/* Parses an Ldh-str at the current position:

     Ldh-str = *( ALPHA / DIGIT / "-" ) Let-dig
     Let-dig = ALPHA / DIGIT

   The longest run of letters, digits and '-' is scanned, after which the
   match is cut back so that it ends in a letter or digit. The matched text
   is appended to `out' when that is not NULL.

   Returns 1 when something was matched and 0 otherwise, in which case the
   parse position is left unchanged.
 */
static int
smtp_parser_parse_ldh_str(struct smtp_parser *parser,
	string_t *out)
{
	const unsigned char *start = parser->cur;
	const unsigned char *last_letdig = NULL;
	const unsigned char *p;

	for (p = start; p < parser->end; p++) {
		if (i_isalnum(*p)) {
			/* remember where the match may legally end */
			last_letdig = p;
		} else if (*p != '-') {
			break;
		}
	}

	if (last_letdig == NULL) {
		/* nothing, or nothing but '-' */
		parser->cur = start;
		return 0;
	}

	/* drop any trailing '-' characters from the match */
	parser->cur = last_letdig + 1;
	if (out != NULL)
		str_append_data(out, start, parser->cur - start);
	return 1;
}

/* Returns TRUE-ish for characters accepted inside a sub-domain by the
   lenient domain syntax: letters, digits, '-' and '_'. */
static inline bool smtp_parser_is_subdomain_char(unsigned char ch)
{
	return i_isalnum(ch) || ch == '-' || ch == '_';
}

/* Parses a domain name at the current position.

     Domain     = sub-domain *("." sub-domain)
     sub-domain = Let-dig [Ldh-str]
     Let-dig    = ALPHA / DIGIT
     Ldh-str    = *( ALPHA / DIGIT / "-" ) Let-dig

   NOTE: The standard syntax is deliberately not enforced. To be lenient
         towards systems that don't adhere to the standards, '-' and '_'
         are accepted anywhere in a sub-domain, including as its first or
         last character.

   Returns 0 (nothing consumed) if the current position does not start with
   a sub-domain character, -1 with parser->error set if a sub-domain after
   a '.' is empty or starts with an invalid character, and 1 on success.
   On success the parsed domain is returned in *value_r when value_r is not
   NULL.
 */
int smtp_parser_parse_domain(struct smtp_parser *parser,
	const char **value_r)
{
	string_t *domain = NULL;
	const unsigned char *label;

	if (parser->cur >= parser->end ||
	    !smtp_parser_is_subdomain_char(*parser->cur))
		return 0;

	if (value_r != NULL)
		domain = t_str_new(256);

	for (;;) {
		/* each sub-domain needs at least one character */
		if (parser->cur >= parser->end || *parser->cur == '.') {
			parser->error = "Empty sub-domain";
			return -1;
		}
		if (!smtp_parser_is_subdomain_char(*parser->cur)) {
			parser->error = "Invalid character in domain";
			return -1;
		}

		/* consume the whole sub-domain in one go */
		label = parser->cur;
		do {
			parser->cur++;
		} while (parser->cur < parser->end &&
			 smtp_parser_is_subdomain_char(*parser->cur));
		if (domain != NULL)
			str_append_data(domain, label, parser->cur - label);

		/* *("." sub-domain) */
		if (parser->cur >= parser->end || *parser->cur != '.')
			break;
		if (domain != NULL)
			str_append_c(domain, '.');
		parser->cur++;
	}

	if (value_r != NULL)
		*value_r = str_c(domain);
	return 1;
}

/* Parses an Snum (one decimal octet of an IPv4 address literal) at the
   current position.

     Snum = 1*3DIGIT
          ; representing a decimal integer
          ; value in the range 0 through 255

   The matched digits are appended to `literal' when that is not NULL and
   the numeric value is stored in *octet_r.

   Returns 1 on success, 0 when there is no digit at the current position
   (nothing is consumed) and -1 when the value would exceed 255; in that
   case the parse position is left inside the number.

   NOTE: Only the value is range-checked; the number of digits is not
         limited, so leading zeros are accepted.
 */
static int
smtp_parser_parse_snum(struct smtp_parser *parser, string_t *literal,
		       uint8_t *octet_r)
{
	const unsigned char *pbegin = parser->cur;
	uint8_t octet = 0;

	/* never look at *parser->cur without checking against the end of the
	   input first; the data is not guaranteed to extend beyond it */
	if (parser->cur >= parser->end ||
	    *parser->cur < '0' || *parser->cur > '9')
		return 0;
	do {
		/* reject the digit if appending it would overflow the octet:
		   (uint8_t)-1 / 10 == 25 and (uint8_t)-1 % 10 == 5 */
		if (octet >= ((uint8_t)-1 / 10)) {
			if (octet > (uint8_t)-1 / 10)
				return -1;
			if ((uint8_t)(*parser->cur - '0') > ((uint8_t)-1 % 10))
				return -1;
		}
		octet = octet * 10 + (*parser->cur - '0');
		parser->cur++;
	} while (parser->cur < parser->end &&
		 *parser->cur >= '0' && *parser->cur <= '9');

	if (literal != NULL)
		str_append_data(literal, pbegin, parser->cur - pbegin);
	*octet_r = octet;
	return 1;
}

/* Parses a dotted-quad IPv4 address at the current position.

     IPv4-address-literal = Snum 3("."  Snum)

   The textual form is appended to `literal' when that is not NULL, and the
   address is stored in network byte order in *ip4_r when that is not NULL.

   Returns 0 when the current position does not start with a digit (nothing
   is consumed), -1 when what follows the first number is not a complete,
   valid dotted quad (including when the input ends before all four octets
   were read), and 1 on success.
 */
static int
smtp_parser_parse_ipv4_address(struct smtp_parser *parser,
			       string_t *literal, struct in_addr *ip4_r)
{
	uint8_t octet;
	uint32_t ip = 0;
	int ret;
	int i;

	/* Snum */
	if ((ret = smtp_parser_parse_snum(parser, literal, &octet)) <= 0)
		return ret;
	ip = octet;

	/* 3("."  Snum) - all three are mandatory; running out of input is
	   an error just like encountering any other character */
	for (i = 0; i < 3; i++) {
		if (parser->cur >= parser->end || *parser->cur != '.')
			return -1;

		if (literal != NULL)
			str_append_c(literal, '.');
		parser->cur++;

		if (smtp_parser_parse_snum(parser, literal, &octet) <= 0)
			return -1;
		ip = (ip << 8) + octet;
	}

	if (ip4_r != NULL)
		ip4_r->s_addr = htonl(ip);
	return 1;
}

int smtp_parser_parse_address_literal(struct smtp_parser *parser,
	const char **value_r, struct ip_addr *ip_r)
{
	const unsigned char *pblock;
	struct in_addr ip4;
	struct in6_addr ip6;
	bool ipv6 = FALSE;
	string_t *value = NULL, *tagbuf;
	int ret;

	/* address-literal         = "[" ( IPv4-address-literal /
	                            IPv6-address-literal /
	                            General-address-literal ) "]"
	                           ; See Section 4.1.3

	   IPv6-address-literal    = "IPv6:" IPv6-addr
	   General-address-literal = Standardized-tag ":" 1*dcontent
	   Standardized-tag        = Ldh-str
	                           ; Standardized-tag MUST be specified in a
	                           ; Standards-Track RFC and registered with
	                           ; IANA
	   dcontent                = %d33-90 / ; Printable US-ASCII
	                             %d94-126 ; excl. "[", "\", "]"
	 */

	/* "[" */
	if (parser->cur >= parser->end || *parser->cur != '[')
		return 0;
	parser->cur++;

	if (value_r != NULL) {
		value = t_str_new(128);
		str_append_c(value, '[');
	}
	if (ip_r != NULL)
		i_zero(ip_r);

	/* IPv4-address-literal / ... */
	i_zero(&ip4);
	if ((ret=smtp_parser_parse_ipv4_address(parser, value, &ip4)) != 0) {
		if (ret < 0) {
			parser->error = "Invalid IPv4 address literal";
			return -1;
		}
		if (ip_r != NULL) {
			ip_r->family = AF_INET;
			ip_r->u.ip4 = ip4;
		}

	/* ... / IPv6-address-literal / General-address-literal */
	} else {
		/* IPv6-address-literal    = "IPv6:" IPv6-addr
		   General-address-literal = Standardized-tag ":" 1*dcontent
		   Standardized-tag        = Ldh-str
		 */
		if (value_r != NULL) {
			tagbuf = value;
		} else {
			tagbuf = t_str_new(16);
			str_append_c(tagbuf, '[');
		}
		if (smtp_parser_parse_ldh_str(parser, tagbuf) <= 0 ||
			parser->cur >= parser->end || *parser->cur != ':') {
			parser->error = "Invalid address literal";
			return -1;
		}
		if (strcasecmp(str_c(tagbuf)+1, "IPv6") == 0)
			ipv6 = TRUE;
		else if (value_r == NULL) {
			parser->error = t_strdup_printf(
				"Unsupported %s address literal",
				str_c(tagbuf)+1);
			return -1;
		}
		parser->cur++;
		if (value_r != NULL)
			str_append_c(value, ':');

		/* 1*dcontent */
		pblock = parser->cur;
		while (parser->cur < parser->end &&
			smtp_char_is_dcontent(*parser->cur))
			parser->cur++;

		if (parser->cur == pblock) {
			parser->error = "Empty address literal";
			return -1;
		}
		if (value_r != NULL)
			str_append_data(value, pblock, parser->cur - pblock);

		if (ipv6) {
			i_zero(&ip6);
			if (inet_pton(AF_INET6, t_strndup(pblock,
				parser->cur - pblock), &ip6) <= 0) {
				parser->error = "Invalid IPv6 address literal";
				return -1;
			}
			if (ip_r != NULL) {
				ip_r->family = AF_INET6;
				ip_r->u.ip6 = ip6;
			}
		}
	}

	/* ']' */
	if (parser->cur >= parser->end) {
		parser->error = "Missing ']' at end of address literal";
		return -1;
	} else if (*parser->cur != ']') {
		parser->error = "Invalid character in address literal";
		return -1;
	}

	parser->cur++;
	if (value_r != NULL) {
		str_append_c(value, ']');
		*value_r = str_c(value);
	}
	return 1;
}

/* Parses a quoted string at the current position.

     Quoted-string    = DQUOTE *QcontentSMTP DQUOTE
     QcontentSMTP     = qtextSMTP / quoted-pairSMTP
     quoted-pairSMTP  = %d92 %d32-126
                      ; i.e., backslash followed by any ASCII
                      ; graphic (including itself) or SPace
     qtextSMTP        = %d32-33 / %d35-91 / %d93-126
                      ; i.e., within a quoted string, any
                      ; ASCII graphic or space is permitted
                      ; without backslash-quoting except
                      ; double-quote and the backslash itself.

   Returns 0 (nothing consumed) when the current position does not start
   with a double quote, -1 with parser->error set on a syntax error, and 1
   on success. On success the content of the string, without the enclosing
   quotes and with the quoted-pair backslashes removed, is returned in
   *value_r when value_r is not NULL.
 */
int smtp_parser_parse_quoted_string(struct smtp_parser *parser,
	const char **value_r)
{
	string_t *unquoted = NULL;

	/* opening DQUOTE */
	if (parser->cur >= parser->end || *parser->cur != '"')
		return 0;
	parser->cur++;

	if (value_r != NULL)
		unquoted = t_str_new(256);

	/* *QcontentSMTP */
	while (parser->cur < parser->end) {
		const unsigned char *run = parser->cur;

		/* qtextSMTP: take the longest run and copy it verbatim */
		while (parser->cur < parser->end &&
		       smtp_char_is_qtext(*parser->cur))
			parser->cur++;
		if (unquoted != NULL)
			str_append_data(unquoted, run, parser->cur - run);

		/* only a backslash can continue the content */
		if (parser->cur >= parser->end || *parser->cur != '\\')
			break;
		parser->cur++;

		/* quoted-pairSMTP: keep the escaped character only */
		if (parser->cur >= parser->end ||
		    !smtp_char_is_qpair(*parser->cur)) {
			parser->error =
				"Invalid character after '\\' in quoted string";
			return -1;
		}
		if (unquoted != NULL)
			str_append_c(unquoted, *parser->cur);
		parser->cur++;
	}

	/* closing DQUOTE */
	if (parser->cur >= parser->end)  {
		parser->error = "Premature end of quoted string";
		return -1;
	}
	if (*parser->cur != '"') {
		parser->error = "Invalid character in quoted string";
		return -1;
	}
	parser->cur++;

	if (value_r != NULL)
		*value_r = str_c(unquoted);
	return 1;
}

/* Skips over an atom at the current position.

     Atom = 1*atext

   Returns 1 when at least one atext character was skipped and 0 otherwise,
   in which case the parse position is unchanged.
 */
static int
smtp_parser_skip_atom(struct smtp_parser *parser)
{
	const unsigned char *start = parser->cur;

	while (parser->cur < parser->end && smtp_char_is_atext(*parser->cur))
		parser->cur++;

	return (parser->cur > start ? 1 : 0);
}

/* Parses an atom (1*atext) at the current position.

   Returns 1 when an atom was found and otherwise passes on the result of
   smtp_parser_skip_atom(). When an atom was found and value_r is not NULL,
   a copy of the atom is returned in *value_r.
 */
int smtp_parser_parse_atom(struct smtp_parser *parser,
	const char **value_r)
{
	const unsigned char *start = parser->cur;
	int ret;

	ret = smtp_parser_skip_atom(parser);
	if (ret > 0) {
		if (value_r != NULL)
			*value_r = t_strndup(start, parser->cur - start);
		ret = 1;
	}
	return ret;
}

/* Parses a string at the current position.

     String = Atom / Quoted-string

   A quoted string is tried first; its result is returned whenever it is
   non-zero (a match or a syntax error). Only when the input does not start
   a quoted string is it parsed as an atom instead.
 */
int smtp_parser_parse_string(struct smtp_parser *parser,
	const char **value_r)
{
	int ret = smtp_parser_parse_quoted_string(parser, value_r);

	if (ret == 0)
		ret = smtp_parser_parse_atom(parser, value_r);
	return ret;
}

/* Shifts one hexadecimal digit into *hexvalue: the current value is moved
   up by four bits and the value of `digit' is added in the low nibble.

   Only '0'-'9' and upper case 'A'-'F' are accepted. Returns TRUE when the
   digit was valid; returns FALSE and leaves *hexvalue untouched otherwise.
 */
static bool
smtp_parse_xtext_hexdigit(const unsigned char digit,
	unsigned char *hexvalue)
{
	unsigned char nibble;

	if (digit >= '0' && digit <= '9')
		nibble = digit - '0';
	else if (digit >= 'A' && digit <= 'F')
		nibble = digit - 'A' + 10;
	else
		return FALSE;

	*hexvalue = ((*hexvalue) << 4) + nibble;
	return TRUE;
}

/* Parses and decodes xtext at the current position.

     xtext   = *( xchar / hexchar )
     xchar   = any ASCII CHAR between "!" (33) and "~" (126) inclusive,
               except for "+" and "=".
     hexchar = ASCII "+" immediately followed by two upper case
               hexadecimal digits

   The decoded text is appended to `out' when that is not NULL: xchars are
   copied as they are and each hexchar is replaced by the octet it encodes.

   Returns 0 (nothing consumed) when the current position holds neither an
   xchar nor '+', -1 with parser->error set when a '+' is not followed by
   two upper case hexadecimal digits, and 1 otherwise. Parsing stops
   successfully at the first character that cannot be part of xtext.
 */
int smtp_parser_parse_xtext(struct smtp_parser *parser,
	string_t *out)
{
	unsigned char hexchar;

	if (parser->cur >= parser->end ||
		(!smtp_char_is_xtext(*parser->cur) && *parser->cur != '+'))
		return 0;

	while (parser->cur < parser->end) {
		const unsigned char *pbegin = parser->cur;

		/* xchar: copy the longest run verbatim */
		while (parser->cur < parser->end &&
			smtp_char_is_xtext(*parser->cur))
			parser->cur++;

		if (out != NULL)
			str_append_data(out, pbegin, parser->cur - pbegin);

		if (parser->cur >= parser->end || *parser->cur != '+')
			break;
		parser->cur++;

		/* hexchar: both digits must lie within the input; don't rely
		   on a terminator being readable beyond parser->end */
		hexchar = 0;
		if (parser->cur < parser->end &&
		    smtp_parse_xtext_hexdigit(*parser->cur, &hexchar)) {
			parser->cur++;
			if (parser->cur < parser->end &&
			    smtp_parse_xtext_hexdigit(*parser->cur,
						      &hexchar)) {
				parser->cur++;
				if (out != NULL)
					str_append_c(out, hexchar);
				continue;
			}
		}

		parser->error = "Invalid hexchar after '+' in xtext";
		return -1;
	}

	return 1;
}