From e90fcc54809db2591dc083f43ef54c6ec8c60847 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 18:16:13 +0200 Subject: Adding upstream version 4.96. Signed-off-by: Daniel Baumann --- src/parse.c | 2243 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2243 insertions(+) create mode 100644 src/parse.c (limited to 'src/parse.c') diff --git a/src/parse.c b/src/parse.c new file mode 100644 index 0000000..bdba3ec --- /dev/null +++ b/src/parse.c @@ -0,0 +1,2243 @@ +/************************************************* +* Exim - an Internet mail transport agent * +*************************************************/ + +/* Copyright (c) The Exim Maintainers 2020 - 2022 */ +/* Copyright (c) University of Cambridge 1995 - 2018 */ +/* See the file NOTICE for conditions of use and distribution. */ + +/* Functions for parsing addresses */ + + +#include "exim.h" + + +/* Position at which the most recent call of skip_comment() started scanning; +the address extraction code uses it as the end offset of an address. */ + +static const uschar *last_comment_position; + + + +/* In stand-alone mode, provide a replacement for deliver_make_addr() +and rewrite_address[_qualify]() so as to avoid having to drag in too much +redundant apparatus. */ + +#ifdef STAND_ALONE + +/* Stand-alone stub: get an address_item from store_get() and set only its +next, parent and address fields; the "copy" argument is not used and the +address string is stored by pointer. */ + +address_item * +deliver_make_addr(uschar *address, BOOL copy) +{ +address_item *addr = store_get(sizeof(address_item), GET_UNTAINTED); +addr->next = NULL; +addr->parent = NULL; +addr->address = address; +return addr; +} + +/* Stand-alone stub: no rewriting is done; the recipient is returned unchanged +and the four dummy arguments are ignored. */ + +uschar * +rewrite_address(uschar *recipient, BOOL dummy1, BOOL dummy2, rewrite_rule + *dummy3, int dummy4) +{ +return recipient; +} + +/* Stand-alone stub: no qualification is done; the recipient is returned +unchanged and the dummy argument is ignored. */ + +uschar * +rewrite_address_qualify(uschar *recipient, BOOL dummy1) +{ +return recipient; +} + +#endif + + + + +/************************************************* +* Find the end of an address * +*************************************************/ + +/* Scan over a string looking for the termination of an address at a comma, +or end of the string. It's the source-routed addresses which cause much pain +here. 
Although Exim ignores source routes, it must recognize such addresses, so +we cannot get rid of this logic. + +Argument: + s pointer to the start of an address + nl_ends if TRUE, '\n' terminates an address + +Returns: pointer past the end of the address + (i.e. points to null or comma) +*/ + +uschar * +parse_find_address_end(const uschar *s, BOOL nl_ends) +{ +BOOL source_routing = *s == '@'; +int no_term = source_routing? 1 : 0; + +while (*s != 0 && (*s != ',' || no_term > 0) && (*s != '\n' || !nl_ends)) + { + /* Skip single quoted characters. Strictly these should not occur outside + quoted strings in RFC 822 addresses, but they can in RFC 821 addresses. Pity + about the lack of consistency, isn't it? */ + + if (*s == '\\' && s[1] != 0) s += 2; + + /* Skip quoted items that are not inside brackets. Note that + quoted pairs are allowed inside quoted strings. */ + + else if (*s == '\"') + { + while (*(++s) != 0 && (*s != '\n' || !nl_ends)) + { + if (*s == '\\' && s[1] != 0) s++; + else if (*s == '\"') { s++; break; } + } + } + + /* Skip comments, which may include nested brackets, but quotes + are not recognized inside comments, though quoted pairs are. */ + + else if (*s == '(') + { + int level = 1; + while (*(++s) != 0 && (*s != '\n' || !nl_ends)) + { + if (*s == '\\' && s[1] != 0) s++; + else if (*s == '(') level++; + else if (*s == ')' && --level <= 0) { s++; break; } + } + } + + /* Non-special character; just advance. Passing the colon in a source + routed address means that any subsequent comma or colon may terminate unless + inside angle brackets. */ + + else + { + if (*s == '<') + { + source_routing = s[1] == '@'; + no_term = source_routing? 
2 : 1; + } + else if (*s == '>') no_term--; + else if (source_routing && *s == ':') no_term--; + s++; + } + } + +return US s; +} + + + +/************************************************* +* Find last @ in an address * +*************************************************/ + +/* This function is used when we have something that may not qualified. If we +know it's qualified, searching for the rightmost '@' is sufficient. Here we +have to be a bit more clever than just a plain search, in order to handle +unqualified local parts like "thing@thong" correctly. Since quotes may not +legally be part of a domain name, we can give up on hitting the first quote +when searching from the right. Now that the parsing also permits the RFC 821 +form of address, where quoted-pairs are allowed in unquoted local parts, we +must take care to handle that too. + +Argument: pointer to an address, possibly unqualified +Returns: pointer to the last @ in an address, or NULL if none +*/ + +const uschar * +parse_find_at(const uschar *s) +{ +const uschar * t = s + Ustrlen(s); +while (--t >= s) + if (*t == '@') + { + int backslash_count = 0; + const uschar *tt = t - 1; + while (tt > s && *tt-- == '\\') backslash_count++; + if ((backslash_count & 1) == 0) return t; + } + else if (*t == '\"') + return NULL; + +return NULL; +} + + + + +/*************************************************************************** +* In all the functions below that read a particular object type from * +* the input, return the new value of the pointer s (the first argument), * +* and put the object into the store pointed to by t (the second argument), * +* adding a terminating zero. If no object is found, t will point to zero * +* on return. * +***************************************************************************/ + + +/************************************************* +* Skip white space and comment * +*************************************************/ + +/* Algorithm: + (1) Skip spaces. 
+ (2) If uschar not '(', return. + (3) Skip till matching ')', not counting any characters + escaped with '\'. + (4) Move past ')' and goto (1). + +The start of the last potential comment position is remembered to +make it possible to ignore comments at the end of compound items. + +Argument: current character pointer +Returns: new character pointer +*/ + +static const uschar * +skip_comment(const uschar *s) +{ +last_comment_position = s; +while (*s) + { + int c, level; + + if (Uskip_whitespace(&s) != '(') break; + level = 1; + while((c = *(++s))) + { + if (c == '(') level++; + else if (c == ')') { if (--level <= 0) { s++; break; } } + else if (c == '\\' && s[1] != 0) s++; + } + } +return s; +} + + + +/************************************************* +* Read a domain * +*************************************************/ + +/* A domain is a sequence of subdomains, separated by dots. See comments below +for detailed syntax of the subdomains. + +If allow_domain_literals is TRUE, a "domain" may also be an IP address enclosed +in []. Make sure the output is set to the null string if there is a syntax +error as well as if there is no domain at all. + +Optionally, msg_id domain literals ( printable-ascii enclosed in [] ) +are permitted. + +Arguments: + s current character pointer + t where to put the domain + msg_id_literals flag for relaxed domain-literal processing + errorptr put error message here on failure (*t will be 0 on exit) + +Returns: new character pointer +*/ + +static const uschar * +read_domain(const uschar *s, uschar *t, BOOL msg_id_literals, uschar **errorptr) +{ +uschar *tt = t; +s = skip_comment(s); + +/* Handle domain literals if permitted. An RFC 822 domain literal may contain +any character except [ ] \, including linear white space, and may contain +quoted characters. However, RFC 821 restricts literals to being dot-separated +3-digit numbers, and we make the obvious extension for IPv6. 
Go for a sequence +of digits, dots, hex digits, and colons here; later this will be checked for +being a syntactically valid IP address if it ever gets to a router. + +Allow both the formal IPv6 form, with IPV6: at the start, and the informal form +without it, and accept IPV4: as well, 'cause someone will use it sooner or +later. */ + +if (*s == '[') + { + *t++ = *s++; + + if (strncmpic(s, US"IPv6:", 5) == 0 || strncmpic(s, US"IPv4:", 5) == 0) + { + memcpy(t, s, 5); + t += 5; + s += 5; + } + + if (msg_id_literals) + while (*s >= 33 && *s <= 90 || *s >= 94 && *s <= 126) *t++ = *s++; + else + while (*s == '.' || *s == ':' || isxdigit(*s)) *t++ = *s++; + + if (*s == ']') *t++ = *s++; else + { + *errorptr = US"malformed domain literal"; + *tt = 0; + } + + if (!allow_domain_literals && !msg_id_literals) + { + *errorptr = US"domain literals not allowed"; + *tt = 0; + } + *t = 0; + return skip_comment(s); + } + +/* Handle a proper domain, which is a sequence of dot-separated atoms. Remove +trailing dots if strip_trailing_dot is set. A subdomain is an atom. + +An atom is a sequence of any characters except specials, space, and controls. +The specials are ( ) < > @ , ; : \ " . [ and ]. This is the rule for RFC 822 +and its successor (RFC 2822). However, RFC 821 and its successor (RFC 2821) is +tighter, allowing only letters, digits, and hyphens, not starting with a +hyphen. + +There used to be a global flag that got set when checking addresses that came +in over SMTP and which should therefore should be checked according to the +stricter rule. However, it seems silly to make the distinction, because I don't +suppose anybody ever uses local domains that are 822-compliant and not +821-compliant. Furthermore, Exim now has additional data on the spool file line +after an address (after "one_time" processing), and it makes use of a # +character to delimit it. When I wrote that code, I forgot about this 822-domain +stuff, and assumed # could never appear in a domain. 
+ +So the old code is now cut out for Release 4.11 onwards, on 09-Aug-02. In a few +years, when we are sure this isn't actually causing trouble, throw it away. + +March 2003: the story continues: There is a camp that is arguing for the use of +UTF-8 in domain names as the way to internationalization, and other MTAs +support this. Therefore, we now have a flag that permits the use of characters +with values greater than 127, encoded in UTF-8, in subdomains, so that Exim can +be used experimentally in this way. */ + +for (;;) + { + uschar *tsave = t; + +/********************* + if (rfc821_domains) + { + if (*s != '-') while (isalnum(*s) || *s == '-') *t++ = *s++; + } + else + while (!mac_iscntrl_or_special(*s)) *t++ = *s++; +*********************/ + + if (*s != '-') + { + /* Only letters, digits, and hyphens */ + + if (!allow_utf8_domains) + { + while (isalnum(*s) || *s == '-') *t++ = *s++; + } + + /* Permit legal UTF-8 characters to be included */ + + else for(;;) + { + int i, d; + if (isalnum(*s) || *s == '-') /* legal ascii characters */ + { + *t++ = *s++; + continue; + } + if ((*s & 0xc0) != 0xc0) break; /* not start of UTF-8 character */ + d = *s << 2; + for (i = 1; i < 6; i++) /* i is the number of additional bytes */ + { + if ((d & 0x80) == 0) break; + d <<= 1; + } + if (i == 6) goto BAD_UTF8; /* invalid UTF-8 */ + *t++ = *s++; /* leading UTF-8 byte */ + while (i-- > 0) /* copy and check remainder */ + { + if ((*s & 0xc0) != 0x80) + { + BAD_UTF8: + *errorptr = US"invalid UTF-8 byte sequence"; + *tt = 0; + return s; + } + *t++ = *s++; + } + } /* End of loop for UTF-8 character */ + } /* End of subdomain */ + + s = skip_comment(s); + *t = 0; + + if (t == tsave) /* empty component */ + { + if (strip_trailing_dot && t > tt && *s != '.') t[-1] = 0; else + { + *errorptr = US"domain missing or malformed"; + *tt = 0; + } + return s; + } + + if (*s != '.') break; + *t++ = *s++; + s = skip_comment(s); + } + +return s; +} + + + 
+/************************************************* +* Read a local-part * +*************************************************/ + +/* A local-part is a sequence of words, separated by periods. A null word +between dots is not strictly allowed but apparently many mailers permit it, +so, sigh, better be compatible. Even accept a trailing dot... + +A is either a quoted string, or an , which is a sequence +of any characters except specials, space, and controls. The specials are +( ) < > @ , ; : \ " . [ and ]. In RFC 822, a single quoted character, (a +quoted-pair) is not allowed in a word. However, in RFC 821, it is permitted in +the local part of an address. Rather than have separate parsing functions for +the different cases, take the liberal attitude always. At least one MUA is +happy to recognize this case; I don't know how many other programs do. + +Arguments: + s current character pointer + t where to put the local part + error where to point error text + allow_null TRUE if an empty local part is not an error + +Returns: new character pointer +*/ + +static const uschar * +read_local_part(const uschar *s, uschar *t, uschar **error, BOOL allow_null) +{ +uschar *tt = t; +*error = NULL; +for (;;) + { + int c; + uschar *tsave = t; + s = skip_comment(s); + + /* Handle a quoted string */ + + if (*s == '\"') + { + *t++ = '\"'; + while ((c = *++s) && c != '\"') + { + *t++ = c; + if (c == '\\' && s[1]) *t++ = *++s; + } + if (c == '\"') + { + s++; + *t++ = '\"'; + } + else + { + *error = US"unmatched doublequote in local part"; + return s; + } + } + + /* Handle an atom, but allow quoted pairs within it. 
*/ + + else while (!mac_iscntrl_or_special(*s) || *s == '\\') + { + c = *t++ = *s++; + if (c == '\\' && *s) *t++ = *s++; + } + + /* Terminate the word and skip subsequent comment */ + + *t = 0; + s = skip_comment(s); + + /* If we have read a null component at this point, give an error unless it is + terminated by a dot - an extension to RFC 822 - or if it is the first + component of the local part and an empty local part is permitted, in which + case just return normally. */ + + if (t == tsave && *s != '.') + { + if (t == tt && !allow_null) + *error = US"missing or malformed local part"; + return s; + } + + /* Anything other than a dot terminates the local part. Treat multiple dots + as a single dot, as this seems to be a common extension. */ + + if (*s != '.') break; + do { *t++ = *s++; } while (*s == '.'); + } + +return s; +} + + +/************************************************* +* Read route part of route-addr * +*************************************************/ + +/* The pointer is at the initial "@" on entry. Return it following the +terminating colon. Exim no longer supports the use of source routes, but it is +required to accept the syntax. + +Arguments: + s current character pointer + t where to put the route + errorptr where to put an error message + +Returns: new character pointer +*/ + +static const uschar * +read_route(const uschar *s, uschar *t, uschar **errorptr) +{ +BOOL commas = FALSE; +*errorptr = NULL; + +while (*s == '@') + { + *t++ = '@'; + s = read_domain(s+1, t, FALSE, errorptr); + if (*t == 0) return s; + t += Ustrlen((const uschar *)t); + if (*s != ',') break; + *t++ = *s++; + commas = TRUE; + s = skip_comment(s); + } + +if (*s == ':') *t++ = *s++; + +/* If there is no colon, and there were no commas, the most likely error +is in fact a missing local part in the address rather than a missing colon +after the route. */ + +else *errorptr = commas? 
+ US"colon expected after route list" : + US"no local part"; + +/* Terminate the route and return */ + +*t = 0; +return skip_comment(s); +} + + + +/************************************************* +* Read addr-spec * +*************************************************/ + +/* Addr-spec is local-part@domain. We make the domain optional - +the expected terminator for the whole thing is passed to check this. +This function is called only when we know we have a route-addr. + +Arguments: + s current character pointer + t where to put the addr-spec + term expected terminator (0 or >) + errorptr where to put an error message + domainptr set to point to the start of the domain + +Returns: new character pointer +*/ + +static const uschar * +read_addr_spec(const uschar *s, uschar *t, int term, uschar **errorptr, + uschar **domainptr) +{ +s = read_local_part(s, t, errorptr, FALSE); +if (*errorptr == NULL) + if (*s != term) + if (*s != '@') + *errorptr = string_sprintf("\"@\" or \".\" expected after \"%s\"", t); + else + { + t += Ustrlen((const uschar *)t); + *t++ = *s++; + *domainptr = t; + s = read_domain(s, t, FALSE, errorptr); + } +return s; +} + + + +/************************************************* +* Extract operative address * +*************************************************/ + +/* This function extracts an operative address from a full RFC822 mailbox and +returns it in a piece of dynamic store. We take the easy way and get a piece +of store the same size as the input, and then copy into it whatever is +necessary. If we cannot find a valid address (syntax error), return NULL, and +point the error pointer to the reason. The arguments "start" and "end" are used +to return the offsets of the first and one past the last characters in the +original mailbox of the address that has been extracted, to aid in re-writing. +The argument "domain" is set to point to the first character after "@" in the +final part of the returned address, or zero if there is no @. 
+ +Exim no longer supports the use of source routed addresses (those of the form +@domain,...:route_addr). It recognizes the syntax, but collapses such addresses +down to their final components. Formerly, collapse_source_routes had to be set +to achieve this effect. RFC 1123 allows collapsing with MAY, while the revision +of RFC 821 had increased this to SHOULD, so I've gone for it, because it makes +a lot of code elsewhere in Exim much simpler. + +There are some special fudges here for handling RFC 822 group address notation +which may appear in certain headers. If the flag parse_allow_group is set +TRUE and parse_found_group is FALSE when this function is called, an address +which is the start of a group (i.e. preceded by a phrase and a colon) is +recognized; the phrase is ignored and the flag parse_found_group is set. If +this flag is TRUE at the end of an address, and if an extraneous semicolon is +found, it is ignored and the flag is cleared. + +This logic is used only when scanning through addresses in headers, either to +fulfil the -t option, or for rewriting, or for checking header syntax. Because +the group "state" has to be remembered between multiple calls of this function, +the variables parse_{allow,found}_group are global. It is important to ensure +that they are reset to FALSE at the end of scanning a header's list of +addresses. 
+ +Arguments: + mailbox points to the RFC822 mailbox + errorptr where to point an error message + start set to start offset in mailbox + end set to end offset in mailbox + domain set to domain offset in result, or 0 if no domain present + allow_null allow <> if TRUE + +Returns: points to the extracted address, or NULL on error +*/ + +#define FAILED(s) { *errorptr = s; goto PARSE_FAILED; } + +uschar * +parse_extract_address(const uschar *mailbox, uschar **errorptr, int *start, int *end, + int *domain, BOOL allow_null) +{ +uschar * yield = store_get(Ustrlen(mailbox) + 1, mailbox); +const uschar *startptr, *endptr; +const uschar *s = US mailbox; +uschar *t = US yield; + +*domain = 0; + +/* At the start of the string we expect either an addr-spec or a phrase +preceding a . If groups are allowed, we might also find a phrase +preceding a colon and an address. If we find an initial word followed by +a dot, strict interpretation of the RFC would cause it to be taken +as the start of an addr-spec. However, many mailers break the rules +and use addresses of the form "a.n.other " and so we +allow this case. */ + +RESTART: /* Come back here after passing a group name */ + +s = skip_comment(s); +startptr = s; /* In case addr-spec */ +s = read_local_part(s, t, errorptr, TRUE); /* Dot separated words */ +if (*errorptr) goto PARSE_FAILED; + +/* If the terminator is neither < nor @ then the format of the address +must either be a bare local-part (we are now at the end), or a phrase +followed by a route-addr (more words must follow). */ + +if (*s != '@' && *s != '<') + { + if (!*s || *s == ';') + { + if (!*t) FAILED(US"empty address"); + endptr = last_comment_position; + goto PARSE_SUCCEEDED; /* Bare local part */ + } + + /* Expect phrase route-addr, or phrase : if groups permitted, but allow + dots in the phrase; complete the loop only when '<' or ':' is encountered - + end of string will produce a null local_part and therefore fail. 
We don't + need to keep updating t, as the phrase isn't to be kept. */ + + while (*s != '<' && (!f.parse_allow_group || *s != ':')) + { + s = read_local_part(s, t, errorptr, FALSE); + if (*errorptr) + { + *errorptr = string_sprintf("%s (expected word or \"<\")", *errorptr); + goto PARSE_FAILED; + } + } + + if (*s == ':') + { + f.parse_found_group = TRUE; + f.parse_allow_group = FALSE; + s++; + goto RESTART; + } + + /* Assert *s == '<' */ + } + +/* At this point the next character is either '@' or '<'. If it is '@', only a +single local-part has previously been read. An angle bracket signifies the +start of an . Throw away anything we have saved so far before +processing it. Note that this is "if" rather than "else if" because it's also +used after reading a preceding phrase. + +There are a lot of broken sendmails out there that put additional pairs of <> +round s. If strip_excess_angle_brackets is set, allow a limited +number of them, as long as they match. */ + +if (*s == '<') + { + uschar *domainptr = yield; + BOOL source_routed = FALSE; + int bracket_count = 1; + + s++; + if (strip_excess_angle_brackets) while (*s == '<') + { + if(bracket_count++ > 5) FAILED(US"angle-brackets nested too deep"); + s++; + } + + t = yield; + startptr = s; + s = skip_comment(s); + + /* Read an optional series of routes, each of which is a domain. They + are separated by commas and terminated by a colon. However, we totally ignore + such routes (RFC 1123 says we MAY, and the revision of RFC 821 says we + SHOULD). */ + + if (*s == '@') + { + s = read_route(s, t, errorptr); + if (*errorptr) goto PARSE_FAILED; + *t = 0; /* Ensure route is ignored - probably overkill */ + source_routed = TRUE; + } + + /* Now an addr-spec, terminated by '>'. If there is no preceding route, + we must allow an empty addr-spec if allow_null is TRUE, to permit the + address "<>" in some circumstances. A source-routed address MUST have + a domain in the final part. 
*/ + + if (allow_null && !source_routed && *s == '>') + { + *t = 0; + *errorptr = NULL; + } + else + { + s = read_addr_spec(s, t, '>', errorptr, &domainptr); + if (*errorptr) goto PARSE_FAILED; + *domain = domainptr - yield; + if (source_routed && *domain == 0) + FAILED(US"domain missing in source-routed address"); + } + + endptr = s; + if (*errorptr) goto PARSE_FAILED; + while (bracket_count-- > 0) if (*s++ != '>') + { + *errorptr = s[-1] == 0 + ? US"'>' missing at end of address" + : string_sprintf("malformed address: %.32s may not follow %.*s", + s-1, (int)(s - US mailbox - 1), mailbox); + goto PARSE_FAILED; + } + + s = skip_comment(s); + } + +/* Hitting '@' after the first local-part means we have definitely got an +addr-spec, on a strict reading of the RFC, and the rest of the string +should be the domain. However, for flexibility we allow for a route-address +not enclosed in <> as well, which is indicated by an empty first local +part preceding '@'. The source routing is, however, ignored. */ + +else if (!*t) + { + uschar *domainptr = yield; + s = read_route(s, t, errorptr); + if (*errorptr) goto PARSE_FAILED; + *t = 0; /* Ensure route is ignored - probably overkill */ + s = read_addr_spec(s, t, 0, errorptr, &domainptr); + if (*errorptr) goto PARSE_FAILED; + *domain = domainptr - yield; + endptr = last_comment_position; + if (*domain == 0) FAILED(US"domain missing in source-routed address"); + } + +/* This is the strict case of local-part@domain. */ + +else + { + t += Ustrlen((const uschar *)t); + *t++ = *s++; + *domain = t - yield; + s = read_domain(s, t, TRUE, errorptr); + if (!*t) goto PARSE_FAILED; + endptr = last_comment_position; + } + +/* Use goto to get here from the bare local part case. Arrive by falling +through for other cases. Endptr may have been moved over whitespace, so +move it back past white space if necessary. 
*/ + +PARSE_SUCCEEDED: +if (*s) + { + if (f.parse_found_group && *s == ';') + { + f.parse_found_group = FALSE; + f.parse_allow_group = TRUE; + } + else + { + *errorptr = string_sprintf("malformed address: %.32s may not follow %.*s", + s, (int)(s - US mailbox), mailbox); + goto PARSE_FAILED; + } + } +*start = startptr - US mailbox; /* Return offsets */ +while (isspace(endptr[-1])) endptr--; +*end = endptr - US mailbox; + +/* Although this code has no limitation on the length of address extracted, +other parts of Exim may have limits, and in any case, RFC 5321 limits email +addresses to 256, so we do a check here, giving an error if the address is +ridiculously long. */ + +if (*end - *start > EXIM_EMAILADDR_MAX) + { + *errorptr = string_sprintf("address is ridiculously long: %.64s...", yield); + return NULL; + } + +return yield; + +/* Use goto (via the macro FAILED) to get to here from a variety of places. +We might have an empty address in a group - the caller can choose to ignore +this. We must, however, keep the flags correct. */ + +PARSE_FAILED: +if (f.parse_found_group && *s == ';') + { + f.parse_found_group = FALSE; + f.parse_allow_group = TRUE; + } +return NULL; +} + +#undef FAILED + + + +/************************************************* +* Quote according to RFC 2047 * +*************************************************/ + +/* This function is used for quoting text in headers according to RFC 2047. +If the only characters that strictly need quoting are spaces, we return the +original string, unmodified. + +Hmmph. As always, things get perverted for other uses. This function was +originally for the "phrase" part of addresses. Now it is being used for much +longer texts in ACLs and via the ${rfc2047: expansion item. This means we have +to check for overlong "encoded-word"s and split them. November 2004. 
+ +Arguments: + string the string to quote - already checked to contain non-printing + chars + len the length of the string + charset the name of the character set; NULL => iso-8859-1 + fold if TRUE, a newline is inserted before the separating space when + more than one encoded-word is generated + +Returns: pointer to the original string, if no quoting needed, or + pointer to allocated memory containing the quoted string +*/ + +const uschar * +parse_quote_2047(const uschar *string, int len, const uschar *charset, + BOOL fold) +{ +const uschar * s = string; +int hlen, l; +BOOL coded = FALSE; +BOOL first_byte = FALSE; +gstring * g = + string_fmt_append(NULL, "=?%s?Q?", charset ? charset : US"iso-8859-1"); + +hlen = l = g->ptr; + +for (s = string; len > 0; s++, len--) + { + int ch = *s; + + if (g->ptr - l > 67 && !first_byte) + { + g = fold ? string_catn(g, US"?=\n ", 4) : string_catn(g, US"?= ", 3); + l = g->ptr; + g = string_catn(g, g->s, hlen); + } + + if ( ch < 33 || ch > 126 + || Ustrchr("?=()<>@,;:\\\".[]_", ch) != NULL) + { + if (ch == ' ') + { + g = string_catn(g, US"_", 1); + first_byte = FALSE; + } + else + { + g = string_fmt_append(g, "=%02X", ch); + coded = TRUE; + first_byte = !first_byte; + } + } + else + { g = string_catn(g, s, 1); first_byte = FALSE; } + } + +if (coded) + string = string_from_gstring(g = string_catn(g, US"?=", 2)); +else + g->ptr = -1; + +gstring_release_unused(g); +return string; +} + + + + +/************************************************* +* Fix up an RFC 822 "phrase" * +*************************************************/ + +/* This function is called to repair any syntactic defects in the "phrase" part +of an RFC822 address. In particular, it is applied to the user's name as read +from the passwd file when accepting a local message, and to the data from the +-F option. 
+ +If the string contains existing quoted strings or comments containing +freestanding quotes, then we just quote those bits that need quoting - +otherwise it would get awfully messy and probably not look good. If not, we +quote the whole thing if necessary. Thus + + John Q. Smith => "John Q. Smith" + John "Jack" Smith => John "Jack" Smith + John "Jack" Q. Smith => John "Jack" "Q." Smith + John (Jack) Q. Smith => "John (Jack) Q. Smith" + John ("Jack") Q. Smith => John ("Jack") "Q." Smith +but + John (\"Jack\") Q. Smith => "John (\"Jack\") Q. Smith" + +Sheesh! This is tedious code. It is a great pity that the syntax of RFC822 is +the way it is... + +August 2000: Additional code added: + + Previously, non-printing characters were turned into question marks, which do + not need to be quoted. + + Now, a different tactic is used if there are any non-printing ASCII + characters. The encoding method from RFC 2047 is used, assuming iso-8859-1 as + the character set. + + We *could* use this for all cases, getting rid of the messy original code, + but leave it for now. It would complicate simple cases like "John Q. Smith". + +The result is passed back in allocated memory. + +Arguments: + phrase an RFC822 phrase + len the length of the phrase + +Returns: the fixed RFC822 phrase +*/ + +const uschar * +parse_fix_phrase(const uschar *phrase, int len) +{ +int ch, i; +BOOL quoted = FALSE; +const uschar *s, *end; +uschar * buffer; +uschar *t, *yield; + +while (len > 0 && isspace(*phrase)) { phrase++; len--; } + +/* See if there are any non-printing characters, and if so, use the RFC 2047 +encoding for the whole thing. 
*/ + +for (i = 0, s = phrase; i < len; i++, s++) + if ((*s < 32 && *s != '\t') || *s > 126) break; + +if (i < len) + return parse_quote_2047(phrase, len, headers_charset, FALSE); + +/* No non-printers; use the RFC 822 quoting rules */ + +if (len <= 0 || len >= INT_MAX/4) + return string_copy_taint(CUS"", phrase); + +buffer = store_get((len+1)*4, phrase); + +s = phrase; +end = s + len; +yield = t = buffer + 1; + +while (s < end) + { + ch = *s++; + + /* Copy over quoted strings, remembering we encountered one */ + + if (ch == '\"') + { + *t++ = '\"'; + while (s < end && (ch = *s++) != '\"') + { + *t++ = ch; + if (ch == '\\' && s < end) *t++ = *s++; + } + *t++ = '\"'; + if (s >= end) break; + quoted = TRUE; + } + + /* Copy over comments, noting if they contain freestanding quote + characters */ + + else if (ch == '(') + { + int level = 1; + *t++ = '('; + while (s < end) + { + ch = *s++; + *t++ = ch; + if (ch == '(') level++; + else if (ch == ')') { if (--level <= 0) break; } + else if (ch == '\\' && s < end) *t++ = *s++ & 127; + else if (ch == '\"') quoted = TRUE; + } + if (ch == 0) + { + while (level--) *t++ = ')'; + break; + } + } + + /* Handle special characters that need to be quoted */ + + else if (Ustrchr(")<>@,;:\\.[]", ch) != NULL) + { + /* If hit previous quotes just make one quoted "word" */ + + if (quoted) + { + uschar *tt = t++; + while (*(--tt) != ' ' && *tt != '\"' && *tt != ')') tt[1] = *tt; + tt[1] = '\"'; + *t++ = ch; + while (s < end) + { + ch = *s++; + if (ch == ' ' || ch == '\"') { s--; break; } else *t++ = ch; + } + *t++ = '\"'; + } + + /* Else quote the whole string so far, and the rest up to any following + quotes. We must treat anything following a backslash as a literal. 
*/ + + else + { + BOOL escaped = (ch == '\\'); + *(--yield) = '\"'; + *t++ = ch; + + /* Now look for the end or a quote */ + + while (s < end) + { + ch = *s++; + + /* Handle escaped pairs */ + + if (escaped) + { + *t++ = ch; + escaped = FALSE; + } + + else if (ch == '\\') + { + *t++ = ch; + escaped = TRUE; + } + + /* If hit subsequent quotes, insert our quote before any trailing + spaces and back up to re-handle the quote in the outer loop. */ + + else if (ch == '\"') + { + int count = 0; + while (t[-1] == ' ') { t--; count++; } + *t++ = '\"'; + while (count-- > 0) *t++ = ' '; + s--; + break; + } + + /* If hit a subsequent comment, check it for unescaped quotes, + and if so, end our quote before it. */ + + else if (ch == '(') + { + const uschar *ss = s; /* uschar after '(' */ + int level = 1; + while(ss < end) + { + ch = *ss++; + if (ch == '(') level++; + else if (ch == ')') { if (--level <= 0) break; } + else if (ch == '\\' && ss+1 < end) ss++; + else if (ch == '\"') { quoted = TRUE; break; } + } + + /* Comment contains unescaped quotes; end our quote before + the start of the comment. */ + + if (quoted) + { + int count = 0; + while (t[-1] == ' ') { t--; count++; } + *t++ = '\"'; + while (count-- > 0) *t++ = ' '; + break; + } + + /* Comment does not contain unescaped quotes; include it in + our quote. */ + + else + { + if (ss >= end) ss--; + *t++ = '('; + if (ss > s) + { + Ustrncpy(t, s, ss-s); + t += ss-s; + s = ss; + } + } + } + + /* Not a comment or quote; include this character in our quotes. */ + + else *t++ = ch; + } + } + + /* Add a final quote if we hit the end of the string. 
*/ + + if (s >= end) *t++ = '\"'; + } + + /* Non-special character; just copy it over */ + + else *t++ = ch; + } + +*t = 0; +store_release_above(t+1); +return yield; +} + + +/************************************************* +* Extract addresses from a list * +*************************************************/ + +/* This function is called by the redirect router to scan a string containing a +list of addresses separated by commas (with optional white space) or by +newlines, and to generate a chain of address items from them. In other words, +to unpick data from an alias or .forward file. + +The SunOS5 documentation for alias files is not very clear on the syntax; it +does not say that either a comma or a newline can be used for separation. +However, that is the way Smail does it, so we follow suit. + +If a # character is encountered in a white space position, then characters from +there to the next newline are skipped. + +If an unqualified address begins with '\', just skip that character. This gives +compatibility with Sendmail's use of \ to prevent looping. Exim has its own +loop prevention scheme which handles other cases too - see the code in +route_address(). + +An "address" can be a specification of a file or a pipe; the latter may often +need to be quoted because it may contain spaces, but we don't want to retain +the quotes. Quotes may appear in normal addresses too, and should be retained. +We can distinguish between these cases, because in addresses, quotes are used +only for parts of the address, not the whole thing. Therefore, we remove quotes +from items when they entirely enclose them, but not otherwise. + +An "address" can also be of the form :include:pathname to include a list of +addresses contained in the specified file. + +Any unqualified addresses are qualified with and rewritten if necessary, via +the rewrite_address() function. 
+ +Arguments: + s the list of addresses (typically a complete + .forward file or a list of entries in an alias file) + options option bits for permitting or denying various special cases; + not all bits are relevant here - some are for filter + files; those we use here are: + RDO_DEFER + RDO_FREEZE + RDO_FAIL + RDO_BLACKHOLE + RDO_REWRITE + RDO_INCLUDE + anchor where to hang the chain of newly-created addresses. This + should be initialized to NULL. + error where to return an error text + incoming domain domain of the incoming address; used to qualify unqualified + local parts preceded by \ + directory if NULL, no checks are done on :include: files + otherwise, included file names must start with the given + directory + syntax_errors if not NULL, it carries on after syntax errors in addresses, + building up a list of errors as error blocks chained on + here. + +Returns: FF_DELIVERED addresses extracted + FF_NOTDELIVERED no addresses extracted, but no errors + FF_BLACKHOLE :blackhole: + FF_DEFER :defer: + FF_FAIL :fail: + FF_INCLUDEFAIL some problem with :include:; *error set + FF_ERROR other problems; *error is set +*/ + +int +parse_forward_list(const uschar *s, int options, address_item **anchor, + uschar **error, const uschar *incoming_domain, const uschar *directory, + error_block **syntax_errors) +{ +int count = 0; + +DEBUG(D_route) debug_printf("parse_forward_list: %s\n", s); + +for (;;) + { + int len, special = 0, specopt = 0, specbit = 0; + const uschar * ss, * nexts; + address_item * addr; + BOOL inquote = FALSE; + + for (;;) + { + while (isspace(*s) || *s == ',') s++; + if (*s == '#') { while (*s && *s != '\n') s++; } else break; + } + + /* When we reach the end of the list, we return FF_DELIVERED if any child + addresses have been generated. If nothing has been generated, there are two + possibilities: either the list is really empty, or there were syntax errors + that are being skipped. 
(If syntax errors are not being skipped, an FF_ERROR + return is generated on hitting a syntax error and we don't get here.) For a + truly empty list we return FF_NOTDELIVERED so that the router can decline. + However, if the list is empty only because syntax errors were skipped, we + return FF_DELIVERED. */ + + if (!*s) + { + return (count > 0 || (syntax_errors && *syntax_errors)) + ? FF_DELIVERED : FF_NOTDELIVERED; + + /* This previous code returns FF_ERROR if nothing is generated but a + syntax error has been skipped. I now think it is the wrong approach, but + have left this here just in case, and for the record. */ + +#ifdef NEVER + if (count > 0) return FF_DELIVERED; /* Something was generated */ + + if (!syntax_errors || /* Not skipping syntax errors, or */ + !*syntax_errors) /* we didn't actually skip any */ + return FF_NOTDELIVERED; + + *error = string_sprintf("no addresses generated: syntax error in %s: %s", + (*syntax_errors)->text2, (*syntax_errors)->text1); + return FF_ERROR; +#endif + } + + /* Find the end of the next address. Quoted strings in addresses may contain + escaped characters; I haven't found a proper specification of .forward or + alias files that mentions the quoting properties, but it seems right to do + the escaping thing in all cases, so use the function that finds the end of an + address. However, don't let a quoted string extend over the end of a line. */ + + ss = parse_find_address_end(s, TRUE); + + /* Remember where we finished, for starting the next one. */ + + nexts = ss; + + /* Remove any trailing spaces; we know there's at least one non-space. */ + + while (isspace(ss[-1])) ss--; + + /* We now have s->start and ss->end of the next address. Remove quotes + if they completely enclose, remembering the address started with a quote + for handling pipes and files. Another round of removal of leading and + trailing spaces is then required. 
*/ + + if (*s == '\"' && ss[-1] == '\"') + { + s++; + ss--; + inquote = TRUE; + while (s < ss && isspace(*s)) s++; + while (ss > s && isspace(ss[-1])) ss--; + } + + /* Set up the length of the address. */ + + len = ss - s; + + DEBUG(D_route) debug_printf("extract item: %.*s\n", len, s); + + /* Handle special addresses if permitted. If the address is :unknown: + ignore it - this is for backward compatibility with old alias files. You + don't need to use it nowadays - just generate an empty string. For :defer:, + :blackhole:, or :fail: we have to set up the error message and give up right + away. */ + + if (Ustrncmp(s, ":unknown:", len) == 0) + { + s = nexts; + continue; + } + + if (Ustrncmp(s, ":defer:", 7) == 0) + { special = FF_DEFER; specopt = RDO_DEFER; } /* specbit is 0 */ + else if (Ustrncmp(s, ":blackhole:", 11) == 0) + { special = FF_BLACKHOLE; specopt = specbit = RDO_BLACKHOLE; } + else if (Ustrncmp(s, ":fail:", 6) == 0) + { special = FF_FAIL; specopt = RDO_FAIL; } /* specbit is 0 */ + + if (special) + { + uschar * ss = Ustrchr(s+1, ':') + 1; /* line after the special... */ + if ((options & specopt) == specbit) + { + *error = string_sprintf("\"%.*s\" is not permitted", len, s); + return FF_ERROR; + } + while (*ss && isspace(*ss)) ss++; /* skip leading whitespace */ + if ((len = Ustrlen(ss)) > 0) /* ignore trailing newlines */ + for (const uschar * t = ss + len - 1; t >= ss && *t == '\n'; t--) len--; + *error = string_copyn(ss, len); /* becomes the error */ + return special; + } + + /* If the address is of the form :include:pathname, read the file, and call + this function recursively to extract the addresses from it. If directory is + NULL, do no checks. Otherwise, insist that the file name starts with the + given directory and is a regular file. 
*/ + + if (Ustrncmp(s, ":include:", 9) == 0) + { + uschar * filebuf; + uschar filename[256]; + const uschar * t = s+9; + int flen = len - 9; + int frc; + struct stat statbuf; + address_item * last; + FILE * f; + + while (flen > 0 && isspace(*t)) { t++; flen--; } + + if (flen <= 0) + { + *error = US"file name missing after :include:"; + return FF_ERROR; + } + + if (flen > sizeof(filename)-1) + { + *error = string_sprintf("included file name \"%s\" is too long", t); + return FF_ERROR; + } + + Ustrncpy(filename, t, flen); + filename[flen] = 0; + + /* Insist on absolute path */ + + if (filename[0] != '/') + { + *error = string_sprintf("included file \"%s\" is not an absolute path", + filename); + return FF_ERROR; + } + + /* Check if include is permitted */ + + if (options & RDO_INCLUDE) + { + *error = US"included files not permitted"; + return FF_ERROR; + } + + if (is_tainted(filename)) + { + *error = string_sprintf("Tainted name '%s' for included file not permitted\n", + filename); + return FF_ERROR; + } + + /* Check file name if required */ + + if (directory) + { + int len = Ustrlen(directory); + uschar * p; + + while (len > 0 && directory[len-1] == '/') len--; /* ignore trailing '/' */ + p = filename + len; + if (Ustrncmp(filename, directory, len) != 0 || *p != '/') + { + *error = string_sprintf("included file %s is not in directory %s", + filename, directory); + return FF_ERROR; + } + +#ifdef EXIM_HAVE_OPENAT + /* It is necessary to check that every component inside the directory + is NOT a symbolic link, in order to keep the file inside the directory. + This is mighty tedious. We open the directory and openat every component, + with a flag that fails symlinks. 
*/ + + { + int fd = exim_open2(CCS directory, O_RDONLY); + if (fd < 0) + { + *error = string_sprintf("failed to open directory %s", directory); + return FF_ERROR; + } + while (*p) + { + uschar temp; + int fd2; + uschar * q = p + 1; /* skip dividing '/' */ + + while (*q == '/') q++; /* skip extra '/' */ + while (*++p && *p != '/') ; /* end of component */ + temp = *p; + *p = '\0'; + + fd2 = exim_openat(fd, CS q, O_RDONLY|O_NOFOLLOW); + close(fd); + *p = temp; + if (fd2 < 0) + { + *error = string_sprintf("failed to open %s (component of included " + "file); could be symbolic link", filename); + return FF_ERROR; + } + fd = fd2; + } + f = fdopen(fd, "rb"); + } +#else + /* It is necessary to check that every component inside the directory + is NOT a symbolic link, in order to keep the file inside the directory. + This is mighty tedious. It is also not totally foolproof in that it + leaves the possibility of a race attack, but I don't know how to do + any better. */ + + while (*p) + { + int temp; + while (*++p && *p != '/'); + temp = *p; + *p = 0; + if (Ulstat(filename, &statbuf) != 0) + { + *error = string_sprintf("failed to stat %s (component of included " + "file)", filename); + *p = temp; + return FF_ERROR; + } + + *p = temp; + + if ((statbuf.st_mode & S_IFMT) == S_IFLNK) + { + *error = string_sprintf("included file %s in the %s directory " + "involves a symbolic link", filename, directory); + return FF_ERROR; + } + } +#endif + } + +#ifdef EXIM_HAVE_OPENAT + else +#endif + /* Open and stat the file */ + f = Ufopen(filename, "rb"); + + if (!f) + { + *error = string_open_failed("included file %s", filename); + return FF_INCLUDEFAIL; + } + + if (fstat(fileno(f), &statbuf) != 0) + { + *error = string_sprintf("failed to stat included file %s: %s", + filename, strerror(errno)); + (void)fclose(f); + return FF_INCLUDEFAIL; + } + + /* If directory was checked, double check that we opened a regular file */ + + if (directory && (statbuf.st_mode & S_IFMT) != S_IFREG) + { + 
*error = string_sprintf("included file %s is not a regular file in " + "the %s directory", filename, directory); + return FF_ERROR; + } + + /* Get a buffer and read the contents */ + + if (statbuf.st_size > MAX_INCLUDE_SIZE) + { + *error = string_sprintf("included file %s is too big (max %d)", + filename, MAX_INCLUDE_SIZE); + return FF_ERROR; + } + + filebuf = store_get(statbuf.st_size + 1, filename); + if (fread(filebuf, 1, statbuf.st_size, f) != statbuf.st_size) + { + *error = string_sprintf("error while reading included file %s: %s", + filename, strerror(errno)); + (void)fclose(f); + return FF_ERROR; + } + filebuf[statbuf.st_size] = 0; + (void)fclose(f); + + addr = NULL; + frc = parse_forward_list(filebuf, options, &addr, + error, incoming_domain, directory, syntax_errors); + if (frc != FF_DELIVERED && frc != FF_NOTDELIVERED) return frc; + + if (addr) + { + for (last = addr; last->next; last = last->next) count++; + last->next = *anchor; + *anchor = addr; + count++; + } + } + + /* Else (not :include:) ensure address is syntactically correct and fully + qualified if not a pipe or a file, removing a leading \ if present on an + unqualified address. For pipes and files we must handle quoting. It's + not quite clear exactly what to do for partially quoted things, but the + common case of having the whole thing in quotes is straightforward. If this + was the case, inquote will have been set TRUE above and the quotes removed. + + There is a possible ambiguity over addresses whose local parts start with + a vertical bar or a slash, and the latter do in fact occur, thanks to X.400. + Consider a .forward file that contains the line + + /X=xxx/Y=xxx/OU=xxx/@some.gate.way + + Is this a file or an X.400 address? Does it make any difference if it is in + quotes? On the grounds that file names of this type are rare, Exim treats + something that parses as an RFC 822 address and has a domain as an address + rather than a file or a pipe. 
This is also how an address such as the above + would be treated if it came in from outside. */ + + else + { + int start, end, domain; + const uschar *recipient = NULL; + uschar * s_ltd = string_copyn(s, len); + + /* If it starts with \ and the rest of it parses as a valid mail address + without a domain, carry on with that address, but qualify it with the + incoming domain. Otherwise arrange for the address to fall through, + causing an error message on the re-parse. */ + + if (*s_ltd == '\\') + { + recipient = + parse_extract_address(s_ltd+1, error, &start, &end, &domain, FALSE); + if (recipient) + recipient = domain != 0 ? NULL : + string_sprintf("%s@%s", recipient, incoming_domain); + } + + /* Try parsing the item as an address. */ + + if (!recipient) recipient = + parse_extract_address(s_ltd, error, &start, &end, &domain, FALSE); + + /* If item starts with / or | and is not a valid address, or there + is no domain, treat it as a file or pipe. If it was a quoted item, + remove the quoting occurrences of \ within it. */ + + if ((*s_ltd == '|' || *s_ltd == '/') && (!recipient || domain == 0)) + { + uschar * t = store_get(Ustrlen(s_ltd) + 1, s_ltd); + uschar * p = t, * q = s_ltd; + + while (*q) + { + if (inquote) + { + *p++ = *q == '\\' ? *++q : *q; + q++; + } + else *p++ = *q++; + } + *p = 0; + addr = deliver_make_addr(t, TRUE); + setflag(addr, af_pfr); /* indicates pipe/file/reply */ + if (*s_ltd != '|') setflag(addr, af_file); /* indicates file */ + } + + /* Item must be an address. Complain if not, else qualify, rewrite and set + up the control block. It appears that people are in the habit of using + empty addresses but with comments as a way of putting comments into + alias and forward files. Therefore, ignore the error "empty address". + Mailing lists might want to tolerate syntax errors; there is therefore + an option to do so. 
*/ + + else + { + if (!recipient) + { + if (Ustrcmp(*error, "empty address") == 0) + { + *error = NULL; + s = nexts; + continue; + } + + if (syntax_errors) + { + error_block * e = store_get(sizeof(error_block), GET_UNTAINTED); + error_block * last = *syntax_errors; + if (last) + { + while (last->next) last = last->next; + last->next = e; + } + else + *syntax_errors = e; + e->next = NULL; + e->text1 = *error; + e->text2 = s_ltd; + s = nexts; + continue; + } + else + { + *error = string_sprintf("%s in \"%s\"", *error, s_ltd); + return FF_ERROR; + } + } + + /* Address was successfully parsed. Rewrite, and then make an address + block. */ + + recipient = options & RDO_REWRITE + ? rewrite_address(recipient, TRUE, FALSE, global_rewrite_rules, + rewrite_existflags) + : rewrite_address_qualify(recipient, TRUE); /*XXX loses track of const */ + addr = deliver_make_addr(US recipient, TRUE); /* TRUE => copy recipient, so deconst ok */ + } + + /* Add the original data to the output chain. */ + + addr->next = *anchor; + *anchor = addr; + count++; + } + + /* Advance pointer for the next address */ + + s = nexts; + } +} + + +/************************************************* +* Extract a Message-ID * +*************************************************/ + +/* This function is used to extract message ids from In-Reply-To: and +References: header lines. 
+ +Arguments: + str pointer to the start of the message-id + yield put pointer to the message id (in dynamic memory) here + error put error message here on failure + +Returns: points after the processed message-id or NULL on error +*/ + +const uschar * +parse_message_id(const uschar *str, uschar **yield, uschar **error) +{ +uschar *domain = NULL; +uschar *id; +rmark reset_point; + +str = skip_comment(str); +if (*str != '<') + { + *error = US"Missing '<' before message-id"; + return NULL; + } + +/* Getting a block the size of the input string will definitely be sufficient +for the answer, but it may also be very long if we are processing a header +line. Therefore, take care to release unwanted store afterwards. */ + +reset_point = store_mark(); +id = *yield = store_get(Ustrlen(str) + 1, str); +*id++ = *str++; + +str = read_addr_spec(str, id, '>', error, &domain); + +if (!*error) + { + if (*str != '>') *error = US"Missing '>' after message-id"; + else if (domain == NULL) *error = US"domain missing in message-id"; + } + +if (*error) + { + store_reset(reset_point); + return NULL; + } + +while (*id) id++; +*id++ = *str++; +*id++ = 0; +store_release_above(id); + +return skip_comment(str); +} + + +/************************************************* +* Parse a fixed digit number * +*************************************************/ + +/* Parse a string containing an ASCII encoded fixed digits number + +Arguments: + str pointer to the start of the ASCII encoded number + n pointer to the resulting value + digits number of required digits + +Returns: points after the processed date or NULL on error +*/ + +static const uschar * +parse_number(const uschar *str, int *n, int digits) +{ +*n=0; +while (digits--) + { + if (*str<'0' || *str>'9') return NULL; + *n=10*(*n)+(*str++-'0'); + } +return str; +} + + +/************************************************* +* Parse a RFC 2822 day of week * +*************************************************/ + +/* Parse the day of the week from a 
RFC 2822 date, but do not + decode it, because it is only for humans. + +Arguments: + str pointer to the start of the day of the week + +Returns: points after the parsed day or NULL on error +*/ + +static const uschar * +parse_day_of_week(const uschar * str) +{ +/* +day-of-week = ([FWS] day-name) / obs-day-of-week + +day-name = "Mon" / "Tue" / "Wed" / "Thu" / + "Fri" / "Sat" / "Sun" + +obs-day-of-week = [CFWS] day-name [CFWS] +*/ + +static const uschar *day_name[7]={ US"mon", US"tue", US"wed", US"thu", US"fri", US"sat", US"sun" }; +int i; +uschar day[4]; + +str = skip_comment(str); +for (i = 0; i < 3; ++i) + { + if ((day[i] = tolower(*str)) == '\0') return NULL; + ++str; + } +day[3] = '\0'; +for (i = 0; i<7; ++i) if (Ustrcmp(day,day_name[i]) == 0) break; +if (i == 7) return NULL; +return skip_comment(str); +} + + +/************************************************* +* Parse a RFC 2822 date * +*************************************************/ + +/* Parse the date part of a RFC 2822 date-time, extracting the + day, month and year. 
+ +Arguments: + str pointer to the start of the date + d pointer to the resulting day + m pointer to the resulting month + y pointer to the resulting year + +Returns: points after the processed date or NULL on error +*/ + +static const uschar * +parse_date(const uschar *str, int *d, int *m, int *y) +{ +/* +date = day month year + +year = 4*DIGIT / obs-year + +obs-year = [CFWS] 2*DIGIT [CFWS] + +month = (FWS month-name FWS) / obs-month + +month-name = "Jan" / "Feb" / "Mar" / "Apr" / + "May" / "Jun" / "Jul" / "Aug" / + "Sep" / "Oct" / "Nov" / "Dec" + +obs-month = CFWS month-name CFWS + +day = ([FWS] 1*2DIGIT) / obs-day + +obs-day = [CFWS] 1*2DIGIT [CFWS] +*/ + +const uschar * s, * n; +static const uschar *month_name[]={ US"jan", US"feb", US"mar", US"apr", US"may", US"jun", US"jul", US"aug", US"sep", US"oct", US"nov", US"dec" }; +int i; +uschar month[4]; + +str = skip_comment(str); +if ((str = parse_number(str,d,1)) == NULL) return NULL; + +if (*str>='0' && *str<='9') *d = 10*(*d)+(*str++-'0'); +s = skip_comment(str); +if (s == str) return NULL; +str = s; + +for (i = 0; i<3; ++i) if ((month[i]=tolower(*(str+i))) == '\0') return NULL; +month[3] = '\0'; +for (i = 0; i<12; ++i) if (Ustrcmp(month,month_name[i]) == 0) break; +if (i == 12) return NULL; +str+=3; +*m = i; +s = skip_comment(str); +if (s == str) return NULL; +str=s; + +if ((n = parse_number(str,y,4))) + { + str = n; + if (*y<1900) return NULL; + *y = *y-1900; + } +else if ((n = parse_number(str,y,2))) + { + str = skip_comment(n); + while (*(str-1) == ' ' || *(str-1) == '\t') --str; /* match last FWS later */ + if (*y<50) *y+=100; + } +else return NULL; +return str; +} + + +/************************************************* +* Parse a RFC 2822 Time * +*************************************************/ + +/* Parse the time part of a RFC 2822 date-time, extracting the + hour, minute, second and timezone. 
+ +Arguments: + str pointer to the start of the time + h pointer to the resulting hour + m pointer to the resulting minute + s pointer to the resulting second + z pointer to the resulting timezone (offset in seconds) + +Returns: points after the processed time or NULL on error +*/ + +static const uschar * +parse_time(const uschar *str, int *h, int *m, int *s, int *z) +{ +/* +time = time-of-day FWS zone + +time-of-day = hour ":" minute [ ":" second ] + +hour = 2DIGIT / obs-hour + +obs-hour = [CFWS] 2DIGIT [CFWS] + +minute = 2DIGIT / obs-minute + +obs-minute = [CFWS] 2DIGIT [CFWS] + +second = 2DIGIT / obs-second + +obs-second = [CFWS] 2DIGIT [CFWS] + +zone = (( "+" / "-" ) 4DIGIT) / obs-zone + +obs-zone = "UT" / "GMT" / ; Universal Time + ; North American UT + ; offsets + "EST" / "EDT" / ; Eastern: - 5/ - 4 + "CST" / "CDT" / ; Central: - 6/ - 5 + "MST" / "MDT" / ; Mountain: - 7/ - 6 + "PST" / "PDT" / ; Pacific: - 8/ - 7 + + %d65-73 / ; Military zones - "A" + %d75-90 / ; through "I" and "K" + %d97-105 / ; through "Z", both + %d107-122 ; upper and lower case +*/ + +const uschar * c; + +str = skip_comment(str); +if ((str = parse_number(str,h,2)) == NULL) return NULL; +str = skip_comment(str); +if (*str!=':') return NULL; +++str; +str = skip_comment(str); +if ((str = parse_number(str,m,2)) == NULL) return NULL; +c = skip_comment(str); +if (*str == ':') + { + ++str; + str = skip_comment(str); + if ((str = parse_number(str,s,2)) == NULL) return NULL; + c = skip_comment(str); + } +if (c == str) return NULL; +else str=c; +if (*str == '+' || *str == '-') + { + int neg; + + neg = (*str == '-'); + ++str; + if ((str = parse_number(str,z,4)) == NULL) return NULL; + *z = (*z/100)*3600+(*z%100)*60; + if (neg) *z = -*z; + } +else + { + char zone[5]; + struct { const char *name; int off; } zone_name[10] = + { {"gmt",0}, {"ut",0}, {"est",-5}, {"edt",-4}, {"cst",-6}, {"cdt",-5}, {"mst",-7}, {"mdt",-6}, {"pst",-8}, {"pdt",-7}}; + int i,j; + + for (i = 0; i<4; ++i) + { + zone[i] = 
tolower(*(str+i)); + if (zone[i]<'a' || zone[i]>'z') break; + } + zone[i] = '\0'; + for (j = 0; j<10 && strcmp(zone,zone_name[j].name); ++j); + /* Besides zones named in the grammar, RFC 2822 says other alphabetic */ + /* time zones should be treated as unknown offsets. */ + if (j<10) + { + *z = zone_name[j].off*3600; + str+=i; + } + else if (zone[0]<'a' || zone[1]>'z') return 0; + else + { + while ((*str>='a' && *str<='z') || (*str>='A' && *str<='Z')) ++str; + *z = 0; + } + } +return str; +} + + +/************************************************* +* Parse a RFC 2822 date-time * +*************************************************/ + +/* Parse a RFC 2822 date-time and return it in seconds since the epoch. + +Arguments: + str pointer to the start of the date-time + t pointer to the parsed time + +Returns: points after the processed date-time or NULL on error +*/ + +const uschar * +parse_date_time(const uschar *str, time_t *t) +{ +/* +date-time = [ day-of-week "," ] date FWS time [CFWS] +*/ + +struct tm tm; +int zone; +extern char **environ; +char **old_environ; +static char gmt0[]="TZ=GMT0"; +static char *gmt_env[]={ gmt0, (char*)0 }; +const uschar * try; + +if ((try = parse_day_of_week(str))) + { + str = try; + if (*str!=',') return 0; + ++str; + } +if ((str = parse_date(str,&tm.tm_mday,&tm.tm_mon,&tm.tm_year)) == NULL) return NULL; +if (*str!=' ' && *str!='\t') return NULL; +while (*str == ' ' || *str == '\t') ++str; +if ((str = parse_time(str,&tm.tm_hour,&tm.tm_min,&tm.tm_sec,&zone)) == NULL) return NULL; +tm.tm_isdst = 0; +old_environ = environ; +environ = gmt_env; +*t = mktime(&tm); +environ = old_environ; +if (*t == -1) return NULL; +*t-=zone; +return skip_comment(str); +} + + + + +/************************************************* +************************************************** +* Stand-alone test program * +************************************************** +*************************************************/ + +#if defined STAND_ALONE +int main(void) +{ 
+int start, end, domain; +uschar buffer[1024]; + +store_init(); +big_buffer = store_malloc(big_buffer_size); + +/* strip_trailing_dot = TRUE; */ +allow_domain_literals = TRUE; + +printf("Testing parse_fix_phrase\n"); + +while (Ufgets(buffer, sizeof(buffer), stdin) != NULL) + { + buffer[Ustrlen(buffer)-1] = 0; + if (buffer[0] == 0) break; + printf("%s\n", CS parse_fix_phrase(buffer, Ustrlen(buffer))); + } + +printf("Testing parse_extract_address without group syntax and without UTF-8\n"); + +while (Ufgets(buffer, sizeof(buffer), stdin) != NULL) + { + uschar *out; + uschar *errmess; + buffer[Ustrlen(buffer) - 1] = 0; + if (buffer[0] == 0) break; + out = parse_extract_address(buffer, &errmess, &start, &end, &domain, FALSE); + if (!out) + printf("*** bad address: %s\n", errmess); + else + { + uschar extract[1024]; + Ustrncpy(extract, buffer+start, end-start); + extract[end-start] = 0; + printf("%s %d %d %d \"%s\"\n", out, start, end, domain, extract); + } + } + +printf("Testing parse_extract_address without group syntax but with UTF-8\n"); + +allow_utf8_domains = TRUE; +while (Ufgets(buffer, sizeof(buffer), stdin) != NULL) + { + uschar *out; + uschar *errmess; + buffer[Ustrlen(buffer) - 1] = 0; + if (buffer[0] == 0) break; + out = parse_extract_address(buffer, &errmess, &start, &end, &domain, FALSE); + if (!out) + printf("*** bad address: %s\n", errmess); + else + { + uschar extract[1024]; + Ustrncpy(extract, buffer+start, end-start); + extract[end-start] = 0; + printf("%s %d %d %d \"%s\"\n", out, start, end, domain, extract); + } + } +allow_utf8_domains = FALSE; + +printf("Testing parse_extract_address with group syntax\n"); + +f.parse_allow_group = TRUE; +while (Ufgets(buffer, sizeof(buffer), stdin) != NULL) + { + uschar *out; + uschar *errmess; + uschar *s; + buffer[Ustrlen(buffer) - 1] = 0; + if (buffer[0] == 0) break; + s = buffer; + while (*s) + { + uschar *ss = parse_find_address_end(s, FALSE); + int terminator = *ss; + *ss = 0; + out = 
parse_extract_address(buffer, &errmess, &start, &end, &domain, FALSE); + *ss = terminator; + + if (!out) + printf("*** bad address: %s\n", errmess); + else + { + uschar extract[1024]; + Ustrncpy(extract, buffer+start, end-start); + extract[end-start] = 0; + printf("%s %d %d %d \"%s\"\n", out, start, end, domain, extract); + } + + s = ss + (terminator? 1:0); + Uskip_whitespace(&s); + } + } + +printf("Testing parse_find_at\n"); + +while (Ufgets(buffer, sizeof(buffer), stdin) != NULL) + { + uschar *s; + buffer[Ustrlen(buffer)-1] = 0; + if (buffer[0] == 0) break; + s = parse_find_at(buffer); + if (s == NULL) printf("no @ found\n"); + else printf("offset = %d\n", s - buffer); + } + +printf("Testing parse_extract_addresses\n"); + +while (Ufgets(buffer, sizeof(buffer), stdin) != NULL) + { + uschar *errmess; + int extracted; + address_item *anchor = NULL; + buffer[Ustrlen(buffer) - 1] = 0; + if (buffer[0] == 0) break; + if ((extracted = parse_forward_list(buffer, -1, &anchor, + &errmess, US"incoming.domain", NULL, NULL)) == FF_DELIVERED) + { + while (anchor != NULL) + { + address_item *addr = anchor; + anchor = anchor->next; + printf("%d %s\n", testflag(addr, af_pfr), addr->address); + } + } + else printf("Failed: %d %s\n", extracted, errmess); + } + +printf("Testing parse_message_id\n"); + +while (Ufgets(buffer, sizeof(buffer), stdin) != NULL) + { + uschar *s, *t, *errmess; + buffer[Ustrlen(buffer) - 1] = 0; + if (buffer[0] == 0) break; + s = buffer; + while (*s != 0) + { + s = parse_message_id(s, &t, &errmess); + if (errmess != NULL) + { + printf("Failed: %s\n", errmess); + break; + } + printf("%s\n", t); + } + } + +return 0; +} + +#endif + +/* End of parse.c */ -- cgit v1.2.3