diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 17:36:47 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 17:36:47 +0000 |
commit | 0441d265f2bb9da249c7abf333f0f771fadb4ab5 (patch) | |
tree | 3f3789daa2f6db22da6e55e92bee0062a7d613fe /src/lib-fts/fts-tokenizer-address.c | |
parent | Initial commit. (diff) | |
download | dovecot-0441d265f2bb9da249c7abf333f0f771fadb4ab5.tar.xz dovecot-0441d265f2bb9da249c7abf333f0f771fadb4ab5.zip |
Adding upstream version 1:2.3.21+dfsg1.upstream/1%2.3.21+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/lib-fts/fts-tokenizer-address.c')
-rw-r--r-- | src/lib-fts/fts-tokenizer-address.c | 412 |
1 files changed, 412 insertions, 0 deletions
diff --git a/src/lib-fts/fts-tokenizer-address.c b/src/lib-fts/fts-tokenizer-address.c new file mode 100644 index 0000000..1a2fb3d --- /dev/null +++ b/src/lib-fts/fts-tokenizer-address.c @@ -0,0 +1,412 @@ +/* Copyright (c) 2015-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "str.h" +#include "buffer.h" +#include "rfc822-parser.h" +#include "fts-tokenizer-private.h" +#include "fts-tokenizer-common.h" + +#define IS_DTEXT(c) \ + (rfc822_atext_chars[(int)(unsigned char)(c)] == 2) + +#define FTS_DEFAULT_ADDRESS_MAX_LENGTH 254 + +enum email_address_parser_state { + EMAIL_ADDRESS_PARSER_STATE_NONE = 0, + EMAIL_ADDRESS_PARSER_STATE_LOCALPART, + EMAIL_ADDRESS_PARSER_STATE_DOMAIN, + EMAIL_ADDRESS_PARSER_STATE_COMPLETE, + EMAIL_ADDRESS_PARSER_STATE_SKIP, +}; + +struct email_address_fts_tokenizer { + struct fts_tokenizer tokenizer; + enum email_address_parser_state state; + string_t *last_word; + string_t *parent_data; /* Copy of input data between tokens. */ + unsigned int max_length; + bool search; +}; + +static int +fts_tokenizer_email_address_create(const char *const *settings, + struct fts_tokenizer **tokenizer_r, + const char **error_r) +{ + struct email_address_fts_tokenizer *tok; + bool search = FALSE; + unsigned int max_length = FTS_DEFAULT_ADDRESS_MAX_LENGTH; + unsigned int i; + + for (i = 0; settings[i] != NULL; i += 2) { + const char *key = settings[i], *value = settings[i+1]; + + if (strcmp(key, "search") == 0) { + search = TRUE; + } else if (strcmp(key, "maxlen") == 0) { + if (str_to_uint(value, &max_length) < 0 || + max_length == 0) { + *error_r = t_strdup_printf("Invalid maxlen setting: %s", value); + return -1; + } + } else { + *error_r = t_strdup_printf("Unknown setting: %s", key); + return -1; + } + } + + tok = i_new(struct email_address_fts_tokenizer, 1); + tok->tokenizer = *fts_tokenizer_email_address; + tok->last_word = str_new(default_pool, 128); + tok->parent_data = str_new(default_pool, 128); + tok->max_length = max_length; + tok->search = search; + *tokenizer_r = &tok->tokenizer; + return 0; +} + +static void fts_tokenizer_email_address_destroy(struct fts_tokenizer *_tok) +{ + struct email_address_fts_tokenizer *tok = + (struct email_address_fts_tokenizer *)_tok; + + str_free(&tok->last_word); + str_free(&tok->parent_data); + i_free(tok); +} + +static bool +fts_tokenizer_address_current_token(struct email_address_fts_tokenizer *tok, + const char **token_r) +{ + const unsigned char *data = tok->last_word->data; + size_t len = tok->last_word->used; + + tok->tokenizer.skip_parents = TRUE; + tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE; + if (str_len(tok->last_word) > tok->max_length) { + str_truncate(tok->last_word, tok->max_length); + /* As future proofing, delete partial utf8. + IS_DTEXT() does not actually allow utf8 addresses + yet though. */ + len = tok->last_word->used; + fts_tokenizer_delete_trailing_partial_char(data, &len); + i_assert(len <= tok->max_length); + } + + if (len > 0) + fts_tokenizer_delete_trailing_invalid_char(data, &len); + *token_r = len == 0 ? "" : + t_strndup(data, len); + return len > 0; +} + +static bool +fts_tokenizer_address_parent_data(struct email_address_fts_tokenizer *tok, + const char **token_r) +{ + if (tok->tokenizer.parent == NULL || str_len(tok->parent_data) == 0) + return FALSE; + + if (tok->search && tok->state >= EMAIL_ADDRESS_PARSER_STATE_DOMAIN) { + /* we're searching and we want to find only the full + user@domain (not "user" and "domain"). we'll do this by + not feeding the last user@domain to parent tokenizer. */ + size_t parent_prefix_len = + str_len(tok->parent_data) - str_len(tok->last_word); + i_assert(str_len(tok->parent_data) >= str_len(tok->last_word) && + strcmp(str_c(tok->parent_data) + parent_prefix_len, + str_c(tok->last_word)) == 0); + str_truncate(tok->parent_data, parent_prefix_len); + if (str_len(tok->parent_data) == 0) + return FALSE; + } + + *token_r = t_strdup(str_c(tok->parent_data)); + str_truncate(tok->parent_data, 0); + return TRUE; +} + +/* Used to rewind past characters that can not be the start of a new localpart. + Returns size that can be skipped. */ +static size_t skip_nonlocal_part(const unsigned char *data, size_t size) +{ + size_t skip = 0; + + /* Yes, a dot can start an address. De facto before de jure. */ + while (skip < size && (!IS_ATEXT(data[skip]) && data[skip] != '.')) + skip++; + return skip; +} + +static bool +fts_tokenizer_email_address_too_large(struct email_address_fts_tokenizer *tok, + size_t pos) +{ + if (str_len(tok->last_word) + pos <= tok->max_length) + return FALSE; + + /* The token is too large - skip over it. + + Truncate the input that was added so far to the token, so all of it + gets sent to the parent tokenizer in + fts_tokenizer_address_parent_data(). */ + str_truncate(tok->last_word, 0); + return TRUE; +} + +static enum email_address_parser_state +fts_tokenizer_email_address_parse_local(struct email_address_fts_tokenizer *tok, + const unsigned char *data, size_t size, + size_t *skip_r) +{ + size_t pos = 0; + bool seen_at = FALSE; + + i_assert(size == 0 || data != NULL); + + while (pos < size && (IS_ATEXT(data[pos]) || + data[pos] == '@' || data[pos] == '.')) { + if (data[pos] == '@') + seen_at = TRUE; + pos++; + if (seen_at) + break; + } + + if (fts_tokenizer_email_address_too_large(tok, pos)) { + *skip_r = 0; + return EMAIL_ADDRESS_PARSER_STATE_SKIP; + } + + /* localpart and @ */ + if (seen_at && (pos > 1 || str_len(tok->last_word) > 0)) { + str_append_data(tok->last_word, data, pos); + *skip_r = pos; + return EMAIL_ADDRESS_PARSER_STATE_DOMAIN; + } + + /* localpart, @ not included yet */ + if (pos > 0 && (IS_ATEXT(data[pos-1]) || data[pos-1] == '.')) { + str_append_data(tok->last_word, data, pos); + *skip_r = pos; + return EMAIL_ADDRESS_PARSER_STATE_LOCALPART; + } + /* not a localpart. skip past rest of no-good chars. */ + pos += skip_nonlocal_part(data+pos, size - pos); + *skip_r = pos; + return EMAIL_ADDRESS_PARSER_STATE_NONE; +} + +static bool domain_is_empty(struct email_address_fts_tokenizer *tok) +{ + const char *p, *str = str_c(tok->last_word); + + if ((p = strchr(str, '@')) == NULL) + return TRUE; + return p[1] == '\0'; +} + +static enum email_address_parser_state +fts_tokenizer_email_address_parse_domain(struct email_address_fts_tokenizer *tok, + const unsigned char *data, size_t size, + size_t *skip_r) +{ + size_t pos = 0; + + while (pos < size && (IS_DTEXT(data[pos]) || data[pos] == '.' || data[pos] == '-')) + pos++; + + if (fts_tokenizer_email_address_too_large(tok, pos)) { + *skip_r = 0; + return EMAIL_ADDRESS_PARSER_STATE_SKIP; + } + + /* A complete domain name */ + if ((pos > 0 && pos < size) || /* non-atext after atext in this data*/ + (pos < size && !domain_is_empty(tok))) { /* non-atext after previous atext */ + str_append_data(tok->last_word, data, pos); + *skip_r = pos; + return EMAIL_ADDRESS_PARSER_STATE_COMPLETE; + } + if (pos == size) { /* All good, but possibly not complete. */ + str_append_data(tok->last_word, data, pos); + *skip_r = pos; + return EMAIL_ADDRESS_PARSER_STATE_DOMAIN; + } + /* not a domain. skip past no-good chars. */ + pos += skip_nonlocal_part(data + pos, size - pos); + *skip_r = pos; + return EMAIL_ADDRESS_PARSER_STATE_NONE; +} + +static bool +fts_tokenizer_address_skip(const unsigned char *data, size_t size, + size_t *skip_r) +{ + for (size_t pos = 0; pos < size; pos++) { + if (!(IS_ATEXT(data[pos]) || data[pos] == '.' || + data[pos] == '-') || data[pos] == '@') { + *skip_r = pos; + return TRUE; + } + } + *skip_r = size; + return FALSE; +} + +/* Buffer raw data for parent. */ +static void +fts_tokenizer_address_update_parent(struct email_address_fts_tokenizer *tok, + const unsigned char *data, size_t size) +{ + if (tok->tokenizer.parent != NULL) + str_append_data(tok->parent_data, data, size); +} + +static void fts_tokenizer_email_address_reset(struct fts_tokenizer *_tok) +{ + struct email_address_fts_tokenizer *tok = + (struct email_address_fts_tokenizer *)_tok; + + tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE; + str_truncate(tok->last_word, 0); + str_truncate(tok->parent_data, 0); +} + +static int +fts_tokenizer_email_address_next(struct fts_tokenizer *_tok, + const unsigned char *data, size_t size, + size_t *skip_r, const char **token_r, + const char **error_r ATTR_UNUSED) +{ + struct email_address_fts_tokenizer *tok = + (struct email_address_fts_tokenizer *)_tok; + size_t pos = 0, local_skip; + bool finished; + + if (tok->tokenizer.skip_parents == TRUE) + tok->tokenizer.skip_parents = FALSE; + + if (tok->state == EMAIL_ADDRESS_PARSER_STATE_COMPLETE) { + *skip_r = pos; + if (fts_tokenizer_address_current_token(tok, token_r)) + return 1; + } + + /* end of data, output lingering tokens. first the parents data, then + possibly our token, if complete enough */ + if (size == 0) { + if (tok->state == EMAIL_ADDRESS_PARSER_STATE_DOMAIN && + domain_is_empty(tok)) { + /* user@ without domain - reset state */ + str_truncate(tok->last_word, 0); + tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE; + } + + if (fts_tokenizer_address_parent_data(tok, token_r)) + return 1; + + if (tok->state == EMAIL_ADDRESS_PARSER_STATE_DOMAIN) { + if (fts_tokenizer_address_current_token(tok, token_r)) + return 1; + } + tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE; + } + + /* 1) regular input data OR + 2) circle around to return completed address */ + while(pos < size || tok->state == EMAIL_ADDRESS_PARSER_STATE_COMPLETE) { + + switch (tok->state) { + case EMAIL_ADDRESS_PARSER_STATE_NONE: + /* no part of address found yet. remove possible + earlier data */ + str_truncate(tok->last_word, 0); + + /* fall through */ + case EMAIL_ADDRESS_PARSER_STATE_LOCALPART: + /* last_word is empty or has the beginnings of a valid + local-part, but no '@' found yet. continue parsing + the beginning of data to see if it contains a full + local-part@ */ + tok->state = + fts_tokenizer_email_address_parse_local(tok, + data + pos, + size - pos, + &local_skip); + fts_tokenizer_address_update_parent(tok, data+pos, + local_skip); + pos += local_skip; + + break; + case EMAIL_ADDRESS_PARSER_STATE_DOMAIN: + /* last_word has a local-part@ and maybe the beginning + of a domain. continue parsing the beginning of data + to see if it contains a valid domain. */ + + tok->state = + fts_tokenizer_email_address_parse_domain(tok, + data + pos, + size - pos, + &local_skip); + fts_tokenizer_address_update_parent(tok, data+pos, + local_skip); + pos += local_skip; + + break; + case EMAIL_ADDRESS_PARSER_STATE_COMPLETE: + *skip_r = pos; + if (fts_tokenizer_address_parent_data(tok, token_r)) + return 1; + if (fts_tokenizer_address_current_token(tok, token_r)) + return 1; + break; + case EMAIL_ADDRESS_PARSER_STATE_SKIP: + /* The curernt token is too large to determine if it's + an email address or not. The address-tokenizer is + simply skipping over it, but the input is being + passed to the parent tokenizer. */ + *skip_r = pos; + if (fts_tokenizer_address_parent_data(tok, token_r)) + return 1; + + finished = fts_tokenizer_address_skip(data + pos, + size - pos, + &local_skip); + fts_tokenizer_address_update_parent(tok, data+pos, + local_skip); + pos += local_skip; + if (finished) { + *skip_r = pos; + if (fts_tokenizer_address_parent_data(tok, token_r)) { + tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE; + return 1; + } + tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE; + } + break; + default: + i_unreached(); + } + + } + *skip_r = pos; + return 0; +} + +static const struct fts_tokenizer_vfuncs email_address_tokenizer_vfuncs = { + fts_tokenizer_email_address_create, + fts_tokenizer_email_address_destroy, + fts_tokenizer_email_address_reset, + fts_tokenizer_email_address_next +}; + +static const struct fts_tokenizer fts_tokenizer_email_address_real = { + .name = "email-address", + .v = &email_address_tokenizer_vfuncs, + .stream_to_parents = TRUE, +}; +const struct fts_tokenizer *fts_tokenizer_email_address = + &fts_tokenizer_email_address_real; |