diff options
Diffstat (limited to 'src/global/header_token.c')
-rw-r--r-- | src/global/header_token.c | 266 |
1 files changed, 266 insertions, 0 deletions
diff --git a/src/global/header_token.c b/src/global/header_token.c new file mode 100644 index 0000000..3375125 --- /dev/null +++ b/src/global/header_token.c @@ -0,0 +1,266 @@ +/*++ +/* NAME +/* header_token 3 +/* SUMMARY +/* mail header parser +/* SYNOPSIS +/* #include <header_token.h> +/* +/* typedef struct { +/* .in +4 +/* int type; +/* const char *u.value; +/* /* ... */ +/* .in +/* } HEADER_TOKEN; +/* +/* ssize_t header_token(token, token_len, token_buffer, ptr, +/* specials, terminator) +/* HEADER_TOKEN *token; +/* ssize_t token_len; +/* VSTRING *token_buffer; +/* const char **ptr; +/* const char *specials; +/* int terminator; +/* DESCRIPTION +/* This module parses a mail header value (text after field-name:) +/* into tokens. The parser understands RFC 822 linear white space, +/* quoted-string, comment, control characters, and a set of +/* user-specified special characters. +/* +/* A result token type is one of the following: +/* .IP HEADER_TOK_QSTRING +/* Quoted string as per RFC 822. +/* .IP HEADER_TOK_TOKEN +/* Token as per RFC 822, and the special characters supplied by the +/* caller. +/* .IP other +/* The value of a control character or special character. +/* .PP +/* header_token() tokenizes the input and stops after a user-specified +/* terminator (ignoring all tokens that exceed the capacity of +/* the result storage), or when it runs out of space for the result. +/* The terminator is not stored. The result value is the number of +/* tokens stored, or -1 when the input was exhausted before any tokens +/* were found. +/* +/* Arguments: +/* .IP token +/* Result array of HEADER_TOKEN structures. Token string values +/* are pointers to null-terminated substrings in the token_buffer. +/* .IP token_len +/* Length of the array of HEADER_TOKEN structures. +/* .IP token_buffer +/* Storage for result token string values. +/* .IP ptr +/* Input/output read position. The input is a null-terminated string. +/* .IP specials +/* Special characters according to the relevant RFC, or a +/* null pointer (default to the RFC 822 special characters). +/* This must include the optional terminator if one is specified. +/* .IP terminator +/* The special character to stop after, or zero. +/* BUGS +/* Eight-bit characters are not given special treatment. +/* SEE ALSO +/* RFC 822 (ARPA Internet Text Messages) +/* DIAGNOSTICS +/* Fatal errors: memory allocation problem. +/* LICENSE +/* .ad +/* .fi +/* The Secure Mailer license must be distributed with this software. +/* AUTHOR(S) +/* Wietse Venema +/* IBM T.J. Watson Research +/* P.O. Box 704 +/* Yorktown Heights, NY 10598, USA +/*--*/ + +/* System library. */ + +#include <sys_defs.h> +#include <string.h> +#include <ctype.h> + +/* Utility library. */ + +#include <msg.h> +#include <vstring.h> + +/* Global library. */ + +#include <lex_822.h> +#include <header_token.h> + +/* Application-specific. */ + + /* + * Silly little macros. + */ +#define STR(x) vstring_str(x) +#define LEN(x) VSTRING_LEN(x) +#define CU_CHAR_PTR(x) ((const unsigned char *) (x)) + +/* header_token - parse out the next item in a message header */ + +ssize_t header_token(HEADER_TOKEN *token, ssize_t token_len, + VSTRING *token_buffer, const char **ptr, + const char *user_specials, int user_terminator) +{ + ssize_t comment_level; + const unsigned char *cp; + ssize_t len; + int ch; + ssize_t tok_count; + ssize_t n; + + /* + * Initialize. + */ + VSTRING_RESET(token_buffer); + cp = CU_CHAR_PTR(*ptr); + tok_count = 0; + if (user_specials == 0) + user_specials = LEX_822_SPECIALS; + + /* + * Main parsing loop. + * + * XXX What was the reason to continue parsing when user_terminator is + * specified? Perhaps this was needed at some intermediate stage of + * development? + */ + while ((ch = *cp) != 0 && (user_terminator != 0 || tok_count < token_len)) { + cp++; + + /* + * Skip RFC 822 linear white space. + */ + if (IS_SPACE_TAB_CR_LF(ch)) + continue; + + /* + * Terminator. + */ + if (ch == user_terminator) + break; + + /* + * Skip RFC 822 comment. + */ + if (ch == '(') { + comment_level = 1; + while ((ch = *cp) != 0) { + cp++; + if (ch == '(') { /* comments can nest! */ + comment_level++; + } else if (ch == ')') { + if (--comment_level == 0) + break; + } else if (ch == '\\') { + if ((ch = *cp) == 0) + break; + cp++; + } + } + continue; + } + + /* + * Copy quoted text according to RFC 822. + */ + if (ch == '"') { + if (tok_count < token_len) { + token[tok_count].u.offset = LEN(token_buffer); + token[tok_count].type = HEADER_TOK_QSTRING; + } + while ((ch = *cp) != 0) { + cp++; + if (ch == '"') + break; + if (ch == '\n') { /* unfold */ + if (tok_count < token_len) { + len = LEN(token_buffer); + while (len > 0 + && IS_SPACE_TAB_CR_LF(STR(token_buffer)[len - 1])) + len--; + if (len < LEN(token_buffer)) + vstring_truncate(token_buffer, len); + } + continue; + } + if (ch == '\\') { + if ((ch = *cp) == 0) + break; + cp++; + } + if (tok_count < token_len) + VSTRING_ADDCH(token_buffer, ch); + } + if (tok_count < token_len) { + VSTRING_ADDCH(token_buffer, 0); + tok_count++; + } + continue; + } + + /* + * Control, or special. + */ + if (strchr(user_specials, ch) || ISCNTRL(ch)) { + if (tok_count < token_len) { + token[tok_count].u.offset = LEN(token_buffer); + token[tok_count].type = ch; + VSTRING_ADDCH(token_buffer, ch); + VSTRING_ADDCH(token_buffer, 0); + tok_count++; + } + continue; + } + + /* + * Token. + */ + else { + if (tok_count < token_len) { + token[tok_count].u.offset = LEN(token_buffer); + token[tok_count].type = HEADER_TOK_TOKEN; + VSTRING_ADDCH(token_buffer, ch); + } + while ((ch = *cp) != 0 && !IS_SPACE_TAB_CR_LF(ch) + && !ISCNTRL(ch) && !strchr(user_specials, ch)) { + cp++; + if (tok_count < token_len) + VSTRING_ADDCH(token_buffer, ch); + } + if (tok_count < token_len) { + VSTRING_ADDCH(token_buffer, 0); + tok_count++; + } + continue; + } + } + + /* + * Ignore a zero-length item after the last terminator. + */ + if (tok_count == 0 && ch == 0) + return (-1); + + /* + * Finalize. Fill in the string pointer array, now that the token buffer + * is no longer dynamically reallocated as it grows. + */ + *ptr = (const char *) cp; + for (n = 0; n < tok_count; n++) + token[n].u.value = STR(token_buffer) + token[n].u.offset; + + if (msg_verbose) + msg_info("header_token: %s %s %s", + tok_count > 0 ? token[0].u.value : "", + tok_count > 1 ? token[1].u.value : "", + tok_count > 2 ? token[2].u.value : ""); + + return (tok_count); +} |